<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStarstuff/blob/main/NLP_application.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers

In [None]:
pip install flask pandas scikit-learn torch prometheus_client evidently lime shap redis kafka-python mlflow

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
from flask import Flask, request, jsonify
import numpy as np
import redis
from prometheus_client import start_http_server, Counter
import json
from kafka import KafkaProducer
import mlflow

# Read the labelled examples; the CSV must provide 'text' and 'label' columns.
data = pd.read_csv("dataset.csv")  # Replace with your file

# Hold out 20% of the rows for validation.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data["text"],
    data["label"],
    test_size=0.2,
)

# Pretrained tokenizer plus a sequence-classification model built from the
# same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,  # Adjust num_labels
)

# Tokenize both splits with identical settings: truncate to 128 tokens and
# pad each split to its longest sequence.
train_encodings = tokenizer(
    train_texts.tolist(),
    truncation=True,
    padding=True,
    max_length=128,
)
val_encodings = tokenizer(
    val_texts.tolist(),
    truncation=True,
    padding=True,
    max_length=128,
)

# Convert data to PyTorch format
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name to an indexable collection with
            one entry per example.
        labels: Indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``"labels"``."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item["labels"] = torch.tensor(self.labels[idx])
        return item

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


def compute_metrics(eval_pred):
    """Compute classification metrics for a Trainer evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with ``accuracy`` plus weighted-average ``f1``, ``precision``
        and ``recall``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    precision = precision_score(labels, predictions, average="weighted")
    recall = recall_score(labels, predictions, average="weighted")
    return {"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall}


# Wrap the tokenized splits so the Trainer can index them
train_dataset = Dataset(train_encodings, list(train_labels))
val_dataset = Dataset(val_encodings, list(val_labels))

# Define training arguments and trainer
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
)

# compute_metrics is passed at construction time so that the per-epoch
# evaluations run during training report the metrics too, not only the
# final evaluate() call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Final evaluation on the validation split
eval_results = trainer.evaluate()
print(eval_results)

# Save model and tokenizer for later use
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")

# Initialize Flask app
app = Flask(__name__)


@app.route("/predict", methods=["POST"])
def predict():
    """Classify the ``text`` field of a JSON POST body.

    Request JSON:
        text (str): the text to classify (required).
        feedback (optional): if truthy, a ``text,prediction,feedback`` row is
            appended to ``feedback_log.csv``.

    Returns:
        JSON ``{"prediction": <class index>}`` on success, or a JSON error
        with HTTP 400 when the body is not a JSON object with a string
        ``text`` field.
    """
    import csv  # local import: only needed here, for feedback logging

    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or not isinstance(payload.get("text"), str):
        return jsonify({"error": "JSON body with a string 'text' field is required"}), 400
    text = payload["text"]

    # Tokenize input text
    encodings = tokenizer([text], truncation=True, padding=True, max_length=128, return_tensors="pt")
    # Training may have moved the model off the CPU; the inputs must live on
    # the same device as the model's weights.
    encodings = {name: tensor.to(model.device) for name, tensor in encodings.items()}

    # Make predictions (inference only: no dropout, no autograd bookkeeping)
    model.eval()
    with torch.no_grad():
        outputs = model(**encodings)
    prediction = torch.argmax(outputs.logits, dim=1).item()

    # Log prediction and feedback if provided
    feedback = payload.get("feedback")

    if feedback:
        # csv.writer quotes commas/newlines/quotes in user text so each
        # request stays a single well-formed row.
        with open("feedback_log.csv", "a", newline="", encoding="utf-8") as log_file:
            csv.writer(log_file, lineterminator="\n").writerow([text, prediction, feedback])

    return jsonify({"prediction": prediction})

# Serve Prometheus metrics over HTTP on port 8000 and register the counter
# that tracks how many inference requests were received.
start_http_server(8000)
inference_requests = Counter("inference_requests_total", "Total inference requests")


@app.route("/predict_with_counter", methods=["POST"])
def predict_with_counter():
    """Count the request in ``inference_requests`` and delegate to ``predict``."""
    inference_requests.inc()
    response = predict()
    return response

# Kafka producer setup for feedback logging; message values are serialized
# as UTF-8 encoded JSON.
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)


@app.route("/feedback", methods=["POST"])
def feedback():
    """Forward the ``text`` field of a JSON POST body to Kafka.

    Returns:
        JSON ``{"status": "Feedback sent"}`` once the message has been handed
        to the producer, or a JSON error with HTTP 400 when the body is not a
        JSON object containing ``text``.
    """
    payload = request.get_json(silent=True)
    # Reject malformed requests explicitly instead of failing with a
    # KeyError/TypeError (HTTP 500) on the lookup below.
    if not isinstance(payload, dict) or "text" not in payload:
        return jsonify({"error": "JSON body with a 'text' field is required"}), 400
    text = payload["text"]

    # Send feedback data to Kafka topic
    producer.send("feedback_topic", {"text": text})

    return jsonify({"status": "Feedback sent"})

if __name__ == "__main__":
    # Debug mode normally enables the auto-reloader, which re-executes this
    # whole module in a child process. Here that would repeat model training
    # and try to bind the Prometheus metrics port a second time, so the
    # reloader is switched off while keeping debug mode itself.
    # NOTE: app.run() blocks; statements placed after this guard only execute
    # once the server has stopped.
    app.run(debug=True, use_reloader=False)

# Redis cache setup for caching predictions
cache = redis.Redis(host='localhost', port=6379)


def get_prediction(input_data):
    """Return the predicted class index for ``input_data``, using Redis as a cache.

    Args:
        input_data: Text to classify; it is also used verbatim as the cache key.

    Returns:
        int: the predicted class index, whether it came from the cache or
        from a fresh model call.
    """
    # A single GET replaces the EXISTS + GET pair: one round trip, and no
    # window in which the key can vanish between the two calls.
    cached = cache.get(input_data)
    if cached is not None:
        # Redis hands the stored value back as bytes/str; convert it so cache
        # hits and misses return the same type.
        return int(cached)

    # return_tensors="pt" is required: the model expects tensors, not lists.
    encodings = tokenizer([input_data], truncation=True, padding=True, max_length=128, return_tensors="pt")
    # Keep the inputs on the same device as the model's weights.
    encodings = {name: tensor.to(model.device) for name, tensor in encodings.items()}

    with torch.no_grad():
        outputs = model(**encodings)

    prediction = torch.argmax(outputs.logits, dim=1).item()

    cache.set(input_data, prediction)

    return prediction

# Example of using MLflow for tracking experiments and models.
# The context manager ends the run even if one of the logging calls raises.
with mlflow.start_run():
    mlflow.log_param("model_version", "1.0")
    mlflow.log_metric("accuracy", eval_results['eval_accuracy'])
    # The fine-tuned classifier is a PyTorch module, so it is logged with the
    # PyTorch flavor rather than the scikit-learn one.
    mlflow.pytorch.log_model(model, "model")