# RoBERTa Training – Class Weighted Approach

This notebook contains:
* Data preprocessing
* Class imbalance handling
* Model training
* Evaluation metrics
* Confusion matrix

In [None]:
!pip install -q transformers datasets accelerate scikit-learn evaluate

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import os
import joblib

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from datasets import Dataset
from google.colab import files

# Report whether a CUDA device is visible and, if so, which one.
cuda_ready = torch.cuda.is_available()
print("GPU available:", cuda_ready)
if cuda_ready:
    print("CUDA device:", torch.cuda.get_device_name(0))
else:
    print("CUDA device:", "CPU")


# Load the cleaned review export and show its raw shape and schema.
df = pd.read_csv("clean_amazon_reviews.csv")
print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

# Keep only the text and target columns; rows missing either one are dropped.
required_columns = ['clean_review', 'sentiment']
df = df[required_columns].dropna().reset_index(drop=True)

print(f"\nCleaned dataset shape: {df.shape}")
print("\nOriginal sentiment distribution:")
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)
print("Class distribution percentages:")
total_rows = len(df)
for label, count in sentiment_counts.items():
    # Share of the cleaned dataset taken by this sentiment class.
    print(f"  {label}: {count} ({count/total_rows*100:.1f}%)")


# Map the sentiment strings to integer class ids and store them in a new column.
le = LabelEncoder()
df["label"] = le.fit_transform(df["sentiment"])

# Class name -> integer id, kept for logging and for the saved metadata.
encoded_ids = le.transform(le.classes_)
label_mapping = {name: idx for name, idx in zip(le.classes_, encoded_ids)}
print(f"\nLabel mapping: {label_mapping}")

# Per-class loss weights from scikit-learn's 'balanced' heuristic.
present_labels = np.unique(df["label"])
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=present_labels,
    y=df["label"],
)

# Position i in `class_weights` is recorded under key i.
class_weight_dict = dict(enumerate(class_weights))
print(f"\nCalculated class weights: {class_weight_dict}")

# float32 tensor form of the weights, consumed by the weighted loss later on.
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)


# 80/20 split, stratified on the encoded label, with a fixed seed.
review_texts = df["clean_review"].tolist()
review_labels = df["label"].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    review_texts,
    review_labels,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

print(f"\nTrain samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

# Per-class counts ordered by class id, to eyeball the stratification.
train_dist = pd.Series(train_labels).value_counts().sort_index()
val_dist = pd.Series(val_labels).value_counts().sort_index()
print(f"\nTrain distribution: {train_dist.tolist()}")
print(f"Validation distribution: {val_dist.tolist()}")


# Checkpoint that is fine-tuned below; the tokenizer is loaded from the same
# checkpoint name that is later passed to the model loader.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(texts, max_length=320):
    """Tokenize a list of texts with the module-level ``tokenizer``.

    Sequences are truncated to ``max_length`` tokens but deliberately NOT
    padded here. Padding the whole split up front (``padding=True`` with
    ``return_tensors="pt"``) pads every example to the longest sequence in
    the split, which wastes memory and compute and leaves the
    ``DataCollatorWithPadding`` configured for the trainer with nothing to do.
    Returning plain Python lists lets that collator pad each batch only to
    its own longest sequence.

    Args:
        texts: List of strings to tokenize.
        max_length: Maximum number of tokens kept per text.

    Returns:
        The tokenizer output; ``input_ids`` and ``attention_mask`` are lists
        of variable-length integer lists, one per input text.
    """
    return tokenizer(
        texts,
        truncation=True,
        max_length=max_length,
    )

def _as_hf_dataset(encodings, labels):
    """Bundle token ids, attention masks and labels into a ``Dataset``."""
    columns = {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": labels,
    }
    return Dataset.from_dict(columns)


print("Tokenizing datasets...")
# Tokenize both splits first, then wrap each one as a dataset.
train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

train_dataset = _as_hf_dataset(train_encodings, train_labels)
val_dataset = _as_hf_dataset(val_encodings, val_labels)

print("âœ… Datasets created successfully!")


class WeightedTrainer(Trainer):
    """``Trainer`` whose loss is class-weighted cross-entropy.

    Args:
        class_weights: 1-D float tensor of per-class weights; entry ``i`` is
            the weight applied to class id ``i``.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Park the weights next to the model; they are re-aligned with the
        # logits' device at loss time in case the two ever differ.
        self.class_weights = class_weights.to(self.model.device)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute weighted cross-entropy for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Batch mapping; must contain a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.

        Raises:
            ValueError: If the batch has no ``"labels"`` entry.
        """
        labels = inputs.get("labels")
        if labels is None:
            raise ValueError(
                "WeightedTrainer.compute_loss requires a 'labels' entry in the batch"
            )

        # Run the forward pass without labels so the model does not also
        # compute its own (unweighted) loss that would simply be discarded.
        # `inputs` itself is left untouched for the caller.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get('logits')

        loss_fn = nn.CrossEntropyLoss(
            weight=self.class_weights.to(device=logits.device)
        )
        loss = loss_fn(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss


# Human-readable label names for the model config, so the saved checkpoint is
# self-describing instead of exposing generic LABEL_<n> names.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(le.classes_),
    id2label=id2label,
    label2id=label2id,
    problem_type="single_label_classification",
    # The checkpoint ships a 3-way sentiment head. If this dataset has a
    # different number of classes, re-initialise the head instead of failing
    # on a size mismatch; when the sizes match this flag changes nothing.
    ignore_mismatched_sizes=True,
)

# Place the model and the class weights on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
class_weights_tensor = class_weights_tensor.to(device)

print(f"âœ… Model loaded on: {device}")


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row
            is the argmax of its logits along axis 1.

    Returns:
        Dict with ``accuracy``, weighted ``f1``/``precision``/``recall`` and
        ``f1_macro``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)

    def weighted(scorer):
        # Support-weighted average of a per-class score.
        return scorer(labels, predicted, average="weighted")

    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": weighted(f1_score),
        "f1_macro": f1_score(labels, predicted, average="macro"),
        "precision": weighted(precision_score),
        "recall": weighted(recall_score),
    }

training_args = TrainingArguments(
    output_dir="./sentiment_results_weighted",

    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",

    # Restore the checkpoint with the highest validation "f1" after training.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    learning_rate=2e-5,
    num_train_epochs=6,

    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,

    gradient_accumulation_steps=1,
    weight_decay=0.01,
    warmup_ratio=0.1,

    # Mixed precision needs a GPU. This notebook otherwise falls back to CPU,
    # so only request fp16 when CUDA is actually available instead of
    # failing at argument validation on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,

    logging_dir="./logs",
    logging_steps=50,

    save_total_limit=2,
    report_to="none",
    seed=42,

    remove_unused_columns=False,
    push_to_hub=False,
)

# Collator that pads each batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# Early stopping configured with a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = WeightedTrainer(
    class_weights=class_weights_tensor,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

print("ðŸš€ Starting training with class weights...")
trainer.train()


print("\nðŸ“Š Evaluating model...")
eval_results = trainer.evaluate()
print("Final evaluation metrics:")
for key, value in eval_results.items():
    # Only the entries prefixed with 'eval_' are reported.
    if not key.startswith('eval_'):
        continue
    print(f"  {key}: {value:.4f}")

# Raw predictions on the validation split; argmax over axis 1 gives class ids.
predictions = trainer.predict(val_dataset)
logits, labels = predictions.predictions, predictions.label_ids
preds = np.argmax(logits, axis=1)

# Back to the original sentiment names for reporting.
y_true, y_pred = le.inverse_transform(labels), le.inverse_transform(preds)

print("\nðŸ“‹ Detailed Classification Report:")
report = classification_report(y_true, y_pred, target_names=le.classes_)
print(report)

# Rows are true labels, columns are predictions, both in `le.classes_` order.
cm = confusion_matrix(y_true, y_pred, labels=le.classes_)
df_cm = pd.DataFrame(cm, index=le.classes_, columns=le.classes_)

plt.figure(figsize=(8, 6))
ax = sns.heatmap(df_cm, annot=True, fmt="d", cmap="Blues", cbar_kws={'label': 'Count'})
ax.set_title("Confusion Matrix - Class Weighted Model")
ax.set_ylabel("True Label")
ax.set_xlabel("Predicted Label")
plt.tight_layout()
plt.show()


print("\nðŸ’¾ Saving model...")
model_save_path = "roberta_sentiment_weighted"
os.makedirs(model_save_path, exist_ok=True)

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
joblib.dump(le, f"{model_save_path}/label_encoder.pkl")

metadata = {
    'class_weights': class_weight_dict,
    'label_mapping': label_mapping,
    'model_name': model_name,
    'num_classes': len(le.classes_),
    'classes': le.classes_.tolist()
}
joblib.dump(metadata, f"{model_save_path}/model_metadata.pkl")

print("âœ… Model saved successfully!")

!zip -r roberta_sentiment_weighted.zip roberta_sentiment_weighted

print("ðŸ“¥ Downloading model...")
files.download("roberta_sentiment_weighted.zip")


class SentimentPredictor:
    """Inference wrapper around a saved sentiment model directory.

    The directory is expected to contain the model and tokenizer files plus
    ``label_encoder.pkl`` and ``model_metadata.pkl`` written at save time.

    Args:
        model_path: Directory to load the artifacts from.
        max_length: Maximum number of tokens kept per text. Defaults to 320
            so that inference truncates at the same length as the training
            tokenization in this notebook.
    """

    def __init__(self, model_path="roberta_sentiment_weighted", max_length=320):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.max_length = max_length

        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        # NOTE(security): joblib files are pickles and execute code on load.
        # Only point `model_path` at directories you created or trust.
        self.label_encoder = joblib.load(f"{model_path}/label_encoder.pkl")
        self.metadata = joblib.load(f"{model_path}/model_metadata.pkl")

        self.model.to(self.device)
        self.model.eval()

        print(f"âœ… Sentiment predictor loaded on: {self.device}")

    def predict(self, texts, batch_size=32):
        """Classify one text or a sequence of texts.

        Args:
            texts: A single string, or a sequence of strings.
            batch_size: Number of texts per forward pass; must be >= 1.

        Returns:
            For a single string: one result dict. For a sequence: a list of
            result dicts in input order (also when the sequence holds exactly
            one text, and ``[]`` for an empty sequence). Each dict has the
            keys ``text``, ``predicted_label``, ``confidence`` and
            ``probabilities`` (class name -> probability).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # Remember the input kind: the return shape must depend on what the
        # caller passed, not on how many items happen to be in the list.
        single_input = isinstance(texts, str)
        texts = [texts] if single_input else list(texts)

        class_names = self.label_encoder.classes_
        results = []

        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            encodings = self.tokenizer(
                batch_texts,
                truncation=True,
                padding=True,
                max_length=self.max_length,
                return_tensors="pt"
            )
            encodings = {k: v.to(self.device) for k, v in encodings.items()}

            with torch.no_grad():
                outputs = self.model(**encodings)
                logits = outputs.logits
                probs = torch.softmax(logits, dim=1).cpu().numpy()

            preds = np.argmax(probs, axis=1)
            labels = self.label_encoder.inverse_transform(preds)

            for text, row, label in zip(batch_texts, probs, labels):
                results.append({
                    "text": text,
                    "predicted_label": label,
                    "confidence": float(np.max(row)),
                    "probabilities": {
                        name: float(p) for name, p in zip(class_names, row)
                    },
                })

        return results[0] if single_input else results

# Load the freshly saved artifacts from the default directory.
predictor = SentimentPredictor()


print("\nðŸ§ª Testing the trained model:")

# A few hand-written reviews used as a quick sanity check.
test_cases = [
    "This product is amazing and works perfectly!",
    "Worst purchase ever, totally disappointed",
    "Quality is okay, nothing special",
    "The quality was bad but delivery was okay",
    "Excellent service and fast shipping",
    "Not worth the money, poor quality",
]

print("\nSingle predictions:")
separator = "-" * 50
for text in test_cases:
    # One call per string: the predictor returns a single result dict.
    result = predictor.predict(text)
    print(f"Text: '{text}'")
    print(f"Prediction: {result['predicted_label']} (confidence: {result['confidence']:.3f})")
    print(f"Probabilities: {result['probabilities']}")
    print(separator)

print("\nBatch prediction:")
# One call for the whole list: the predictor returns one result per text.
batch_results = predictor.predict(test_cases)
for result in batch_results:
    print(f"{result['predicted_label']}: {result['text']}")

print("\nðŸŽ‰ Complete! Your optimized class-weighted sentiment model is ready!")