In [None]:
import json
import pickle
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import Dataset
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import warnings

# Silence every warning category so training/plotting output stays readable.
warnings.filterwarnings("ignore")

# Plot defaults shared by all figures below: reset the style first, then
# pick the colour palette, then apply the size/font overrides in one update.
plt.style.use("default")
sns.set_palette("husl")
plt.rcParams.update(
    {
        "figure.figsize": (12, 8),
        "font.size": 12,
    }
)

In [None]:
class ModelTrainer:
    """Fine-tune Hugging Face sequence classifiers and evaluate them.

    Balanced class weights are computed once from ``train_df["label"]`` and
    applied to the cross-entropy loss of every model trained through this
    instance. The outcome of each run (metrics, or an ``"error"`` entry when
    the run failed) is stored in ``self.results`` keyed by model name; a later
    run of the same model replaces the earlier entry.
    """

    # Class names in label-id order (id 0, 1, 2).
    LABEL_NAMES = ["Negative", "Neutral", "Positive"]

    def __init__(self, train_df: pd.DataFrame, eval_df: pd.DataFrame):
        """Store the data splits and derive balanced class weights.

        Args:
            train_df: Training split; the code reads its ``cleaned_review``
                and ``label`` columns.
            eval_df: Evaluation split with the same two columns.
        """
        self.train_df = train_df
        self.eval_df = eval_df
        self.results = {}

        train_labels = train_df["label"].tolist()
        # One weight per class found in the training labels, in sorted order.
        # NOTE(review): the loss indexes this vector by label id, so this
        # assumes every id 0..2 occurs in train_df - confirm for small splits.
        class_weights = compute_class_weight(
            class_weight="balanced",
            classes=np.unique(train_labels),
            y=train_labels,
        )
        self.class_weights = torch.tensor(class_weights, dtype=torch.float)

    def _prepare_datasets(self, tokenizer) -> Tuple[Dataset, Dataset]:
        """Tokenize both splits and return them as torch-formatted datasets.

        Reviews are padded/truncated to 256 tokens. Only ``input_ids``,
        ``attention_mask`` and ``label`` are exposed to the trainer.
        """
        text_and_label = ["cleaned_review", "label"]
        tensor_columns = ["input_ids", "attention_mask", "label"]

        def tokenize_function(examples):
            return tokenizer(
                examples["cleaned_review"],
                padding="max_length",
                truncation=True,
                max_length=256,
            )

        train_dataset = Dataset.from_pandas(self.train_df[text_and_label])
        eval_dataset = Dataset.from_pandas(self.eval_df[text_and_label])

        train_dataset = train_dataset.map(tokenize_function, batched=True)
        eval_dataset = eval_dataset.map(tokenize_function, batched=True)

        train_dataset.set_format(type="torch", columns=tensor_columns)
        eval_dataset.set_format(type="torch", columns=tensor_columns)

        return train_dataset, eval_dataset

    def _create_trainer(
        self,
        model,
        tokenizer,
        train_dataset,
        eval_dataset,
        training_args: TrainingArguments,
    ):
        """Build a ``Trainer`` whose loss is class-weighted cross-entropy.

        ``tokenizer`` is accepted for interface compatibility but is not
        handed to the ``Trainer``; the caller saves the tokenizer itself.
        """

        class WeightedTrainer(Trainer):
            """Trainer that replaces the model loss with a weighted one."""

            def __init__(self, class_weights=None, **kwargs):
                super().__init__(**kwargs)
                # Move the weights next to the model once, up front.
                self.class_weights = (
                    class_weights.to(self.args.device) if class_weights is not None else None
                )

            def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
                labels = inputs.get("labels")
                outputs = model(**inputs)
                logits = outputs.get("logits")

                # weight=None is CrossEntropyLoss's default (unweighted) mode.
                loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)

                # Take the class count from the logits rather than
                # model.config, which is not reachable on a wrapped model.
                loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
                return (loss, outputs) if return_outputs else loss

        return WeightedTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            class_weights=self.class_weights,
        )

    def train_and_evaluate_model(self, model_name: str, training_args: TrainingArguments) -> Dict:
        """Fine-tune ``model_name``, save it, and score it on the eval split.

        Args:
            model_name: Hugging Face model identifier.
            training_args: Hyperparameters passed to the trainer.

        Returns:
            On success, a dict with ``model_name``, ``f1_score`` (weighted),
            ``predictions``, ``true_labels`` and ``classification_report``.
            On failure, ``{"model_name": ..., "error": ...}``. Either way the
            dict is also stored in ``self.results[model_name]``.
        """
        print(f"\n{'='*20} {model_name} {'='*20}")

        label_names = self.LABEL_NAMES
        # Pre-bind so the cleanup in ``finally`` works even if loading fails.
        model = trainer = tokenizer = None

        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSequenceClassification.from_pretrained(
                model_name,
                num_labels=len(label_names),
                id2label=dict(enumerate(label_names)),
                label2id={name: idx for idx, name in enumerate(label_names)},
            )

            train_dataset, eval_dataset = self._prepare_datasets(tokenizer)

            trainer = self._create_trainer(
                model,
                tokenizer,
                train_dataset,
                eval_dataset,
                training_args,
            )

            trainer.train()

            # Save the trained model and tokenizer
            save_directory = f"./{model_name.replace('/', '_')}-finetuned"
            print(f"Saving model and tokenizer to {save_directory}")
            trainer.save_model(save_directory)
            # The Trainer was not given the tokenizer, so save_model() only
            # writes the model; persist the tokenizer explicitly.
            tokenizer.save_pretrained(save_directory)

            predictions = trainer.predict(eval_dataset)
            pred_labels = predictions.predictions.argmax(axis=1)
            true_labels = predictions.label_ids

            f1 = f1_score(true_labels, pred_labels, average="weighted")

            # Passing labels= keeps the report aligned with target_names even
            # if a class is missing from both the eval labels and predictions.
            report = classification_report(
                true_labels,
                pred_labels,
                labels=list(range(len(label_names))),
                target_names=label_names,
                output_dict=True,
            )

            print(f"F1 Score: {f1:.4f}")
            print("\nClassification Report:")
            print(f"{'Class':<10} {'Precision':<10} {'Recall':<10} {'F1-Score':<10}")
            print("-" * 45)
            for label in label_names:
                metrics = report[label]
                print(
                    f"{label:<10} {metrics['precision']:<10.3f} "
                    f"{metrics['recall']:<10.3f} {metrics['f1-score']:<10.3f}"
                )

            result = {
                "model_name": model_name,
                "f1_score": f1,
                "predictions": pred_labels.tolist(),
                "true_labels": true_labels.tolist(),
                "classification_report": report,
            }
            self.results[model_name] = result
            return result

        except Exception as e:
            # Best-effort: one failing model must not abort the whole sweep.
            # Record the failure so run_all_models() can report it.
            print(f"Error with {model_name}: {str(e)}")
            result = {"model_name": model_name, "error": str(e)}
            self.results[model_name] = result
            return result

        finally:
            # Drop the references on success and failure alike, then release
            # cached GPU memory before the next model is loaded.
            del model, trainer, tokenizer
            torch.cuda.empty_cache()

    def run_all_models(self) -> Dict:
        """Train and evaluate every baseline model with shared hyperparameters.

        Writes ``model_results.json`` (successful runs only, JSON-safe fields)
        and ``detailed_results.pkl`` (everything in ``self.results``), prints
        a summary table, and returns ``self.results``.
        """
        models = [
            "distilbert-base-uncased",
            "bert-base-uncased",
            "roberta-base",
            "cardiffnlp/twitter-roberta-base-sentiment",
        ]

        # Define a default set of hyperparameters
        default_args = TrainingArguments(
            output_dir="./results",
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            eval_strategy="epoch",
            save_strategy="no",
            logging_steps=500,
            fp16=torch.cuda.is_available(),
            report_to="none",
            disable_tqdm=False,
            seed=42,
            dataloader_pin_memory=True,
            gradient_checkpointing=True,
            lr_scheduler_type="linear",
            warmup_steps=50,
        )

        for model_name in models:
            self.train_and_evaluate_model(model_name, default_args)

        # Build the JSON payload before touching the file so the file is
        # opened (and truncated) exactly once.
        serializable_results = {
            name: {
                "model_name": result["model_name"],
                "f1_score": float(result["f1_score"]),
                "predictions": result["predictions"],
                "true_labels": result["true_labels"],
            }
            for name, result in self.results.items()
            if "error" not in result
        }

        with open("model_results.json", "w") as f:
            json.dump(serializable_results, f, indent=2)

        with open("detailed_results.pkl", "wb") as f:
            pickle.dump(self.results, f)

        print(f"\n{'='*60}")
        print("SUMMARY OF ALL MODELS")
        print(f"{'='*60}")
        print(f"{'Model':<45} {'F1 Score':<10}")
        print("-" * 60)

        for model_name, result in self.results.items():
            if "error" not in result:
                print(f"{model_name:<45} {result['f1_score']:<10.4f}")
            else:
                print(f"{model_name:<45} {'ERROR':<10}")

        return self.results

In [None]:
# Build the trainer from the prepared splits and run the full model sweep;
# `trainer` is reused by later cells for single-model re-runs.
trainer = ModelTrainer(train_df=train_df, eval_df=eval_df)
all_results = trainer.run_all_models()

In [None]:
class ModelVisualizer:
    """Create comparison plots for the trained sentiment models.

    Results are read from two files: a JSON file with the weighted F1 score,
    predictions and true labels per model, and a pickle file that also holds
    each model's classification report. Only models present in the JSON file
    are plotted.
    """

    def __init__(
        self, results_file="model_results.json", detailed_file="detailed_results.pkl"
    ):
        """Load model results and prepare for visualization.

        Args:
            results_file: Path to the JSON results file.
            detailed_file: Path to the pickled detailed results.

        Raises:
            FileNotFoundError: If either file does not exist.
        """
        with open(results_file, "r") as f:
            self.results = json.load(f)

        # SECURITY: pickle.load can execute arbitrary code. Only point this at
        # a file produced by your own training run, never at untrusted input.
        with open(detailed_file, "rb") as f:
            self.detailed_results = pickle.load(f)

        self.label_names = ["Negative", "Neutral", "Positive"]
        self.model_names = list(self.results.keys())

        print(f"Loaded results for {len(self.model_names)} models")

    def _clean_model_name(self, name: str) -> str:
        """Turn a Hugging Face model id into a short display name."""
        clean_name = name.replace("cardiffnlp/", "").replace("nlptown/", "")
        clean_name = clean_name.replace("-base-uncased", "").replace("-base", "")
        clean_name = clean_name.replace("twitter-roberta", "Twitter RoBERTa")
        clean_name = clean_name.replace("bert-multilingual", "Multilingual BERT")
        # title() lower-cases everything after a word's first letter, so the
        # brand capitalisation has to be restored afterwards.
        clean_name = (
            clean_name.title()
            .replace("Distilbert", "DistilBERT")
            .replace("Roberta", "RoBERTa")
            .replace("Bert", "BERT")
        )
        return clean_name

    @staticmethod
    def _normalize_rows(cm):
        """Return ``cm`` with each row scaled to sum to 1; all-zero rows stay 0."""
        cm = np.asarray(cm, dtype=float)
        row_totals = cm.sum(axis=1, keepdims=True)
        return np.divide(cm, row_totals, out=np.zeros_like(cm), where=row_totals != 0)

    def _classification_report_for(self, model):
        """Return the stored classification report for ``model``, or None."""
        return self.detailed_results.get(model, {}).get("classification_report")

    def create_f1_comparison(self):
        """Draw a bar chart of weighted F1 per model and return the figure."""
        fig, ax = plt.subplots(1, 1, figsize=(14, 8))

        models = [self._clean_model_name(m) for m in self.model_names]
        f1_scores = [self.results[m]["f1_score"] for m in self.model_names]

        colors = sns.color_palette("viridis", len(models))
        bars = ax.bar(
            models, f1_scores, color=colors, alpha=0.8, edgecolor="black", linewidth=1
        )

        # Print each score just above its bar.
        for bar, score in zip(bars, f1_scores):
            ax.text(
                bar.get_x() + bar.get_width() / 2.0,
                bar.get_height() + 0.005,
                f"{score:.3f}",
                ha="center",
                va="bottom",
                fontweight="bold",
            )

        ax.set_title(
            "Model Performance Comparison\nWeighted F1 Scores",
            fontsize=16,
            fontweight="bold",
            pad=20,
        )
        ax.set_ylabel("Weighted F1 Score", fontsize=14, fontweight="bold")
        ax.set_xlabel("Model", fontsize=14, fontweight="bold")
        # Leave headroom for the labels; fall back to 1.0 if every score is 0.
        ax.set_ylim(0, max(f1_scores) * 1.15 or 1.0)

        plt.xticks(rotation=45, ha="right")
        plt.grid(axis="y", alpha=0.3)
        plt.tight_layout()
        plt.show()

        return fig

    def create_confusion_matrices(self):
        """Draw one row-normalized confusion matrix per model (3 per row)."""
        n_models = len(self.model_names)
        cols = 3
        rows = max(1, (n_models + cols - 1) // cols)

        # squeeze=False always yields a 2-D array, so flattening is safe for
        # any model count (including a single model).
        fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
        axes = axes.flatten()

        class_ids = list(range(len(self.label_names)))

        for ax, model in zip(axes, self.model_names):
            true_labels = self.results[model]["true_labels"]
            pred_labels = self.results[model]["predictions"]

            # labels= pins the matrix to one row/column per class even when a
            # class never occurs, keeping it aligned with the tick labels.
            cm = confusion_matrix(true_labels, pred_labels, labels=class_ids)
            cm_normalized = self._normalize_rows(cm)

            sns.heatmap(
                cm_normalized,
                annot=True,
                fmt=".2f",
                cmap="Blues",
                xticklabels=self.label_names,
                yticklabels=self.label_names,
                ax=ax,
                cbar_kws={"shrink": 0.8},
            )

            clean_name = self._clean_model_name(model)

            ax.set_title(
                f'{clean_name}\nF1: {self.results[model]["f1_score"]:.3f}',
                fontweight="bold",
            )
            ax.set_ylabel("True Label")
            ax.set_xlabel("Predicted Label")

        # Hide the unused cells of the grid.
        for spare_ax in axes[n_models:]:
            spare_ax.set_visible(False)

        fig.suptitle("Confusion Matrices (Normalized)", fontsize=16, fontweight="bold")
        plt.tight_layout()
        plt.show()

        return fig

    def create_class_performance_comparison(self):
        """Draw precision/recall/F1 heatmaps (model x class).

        Returns the figure, or None when no classification report is
        available for any model.
        """
        metrics_data = []

        for model in self.model_names:
            report = self._classification_report_for(model)
            if report is None:
                continue

            for class_name in self.label_names:
                if class_name in report:
                    metrics_data.append(
                        {
                            "Model": self._clean_model_name(model),
                            "Class": class_name,
                            "Precision": report[class_name]["precision"],
                            "Recall": report[class_name]["recall"],
                            "F1-Score": report[class_name]["f1-score"],
                        }
                    )

        df = pd.DataFrame(metrics_data)
        if df.empty:
            # Pivoting an empty frame would raise; there is nothing to draw.
            print("No classification reports available; skipping class-wise comparison.")
            return None

        fig, axes = plt.subplots(1, 3, figsize=(18, 6))

        metrics = ["Precision", "Recall", "F1-Score"]

        for idx, metric in enumerate(metrics):
            ax = axes[idx]
            pivot_df = df.pivot(index="Model", columns="Class", values=metric)

            sns.heatmap(
                pivot_df,
                annot=True,
                fmt=".3f",
                cmap="RdYlBu_r",
                ax=ax,
                cbar_kws={"shrink": 0.8},
                vmin=0.5,
                vmax=1.0,
            )

            ax.set_title(f"{metric} by Class", fontweight="bold", fontsize=14)
            ax.set_xlabel("")
            ax.set_ylabel("Model" if idx == 0 else "")
            ax.set_yticklabels(ax.get_yticklabels(), rotation=0)

        fig.suptitle(
            "Per-Class Performance Metrics Across Models",
            fontsize=16,
            fontweight="bold",
        )
        plt.tight_layout()
        plt.show()

        return fig

    def create_error_analysis(self):
        """Plot per-class error rates and overall accuracy for every model."""
        error_data = []
        accuracies = []
        model_names_clean = []

        for model in self.model_names:
            true_labels = np.array(self.results[model]["true_labels"])
            pred_labels = np.array(self.results[model]["predictions"])
            display_name = self._clean_model_name(model)

            accuracies.append((pred_labels == true_labels).mean())
            model_names_clean.append(display_name)

            for class_idx, class_name in enumerate(self.label_names):
                class_mask = true_labels == class_idx
                # Skip classes with no true samples (rate would be undefined).
                if not class_mask.any():
                    continue
                error_rate = (pred_labels[class_mask] != class_idx).mean()
                error_data.append(
                    {
                        "Model": display_name,
                        "Class": class_name,
                        "Error_Rate": error_rate,
                    }
                )

        df_errors = pd.DataFrame(error_data)

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

        pivot_errors = df_errors.pivot(index="Model", columns="Class", values="Error_Rate")
        sns.heatmap(
            pivot_errors,
            annot=True,
            fmt=".3f",
            cmap="Reds",
            ax=ax1,
            cbar_kws={"shrink": 0.8},
        )
        ax1.set_title("Error Rates by Class", fontweight="bold", fontsize=14)
        ax1.set_ylabel("Model")

        colors = sns.color_palette("plasma", len(model_names_clean))
        bars = ax2.barh(model_names_clean, accuracies, color=colors, alpha=0.8)

        # Print each accuracy just right of its bar.
        for bar, acc in zip(bars, accuracies):
            ax2.text(
                bar.get_width() + 0.005,
                bar.get_y() + bar.get_height() / 2.0,
                f"{acc:.3f}",
                ha="left",
                va="center",
                fontweight="bold",
            )

        ax2.set_title("Overall Accuracy Comparison", fontweight="bold", fontsize=14)
        ax2.set_xlabel("Accuracy")
        # Leave room for the labels; fall back to 1.0 if every accuracy is 0.
        ax2.set_xlim(0, max(accuracies) * 1.1 or 1.0)
        ax2.grid(axis="x", alpha=0.3)

        fig.suptitle("Error Analysis Across Models", fontsize=16, fontweight="bold")
        plt.tight_layout()
        plt.show()

        return fig

    def create_model_ranking_summary(self):
        """Draw a ranking table plus a radar chart of the top three models.

        Returns:
            ``(fig, df_summary)`` where ``df_summary`` has one row per model,
            sorted by weighted F1 in descending order.
        """
        summary_data = []

        for model in self.model_names:
            true_labels = np.array(self.results[model]["true_labels"])
            pred_labels = np.array(self.results[model]["predictions"])

            accuracy = (pred_labels == true_labels).mean()
            weighted_f1 = self.results[model]["f1_score"]

            # Per-class F1 falls back to 0 when no report was stored.
            report = self._classification_report_for(model) or {}
            neg_f1 = report.get("Negative", {}).get("f1-score", 0)
            neu_f1 = report.get("Neutral", {}).get("f1-score", 0)
            pos_f1 = report.get("Positive", {}).get("f1-score", 0)

            summary_data.append(
                {
                    "Model": self._clean_model_name(model),
                    "Accuracy": accuracy,
                    "Weighted F1": weighted_f1,
                    "Negative F1": neg_f1,
                    "Neutral F1": neu_f1,
                    "Positive F1": pos_f1,
                }
            )

        df_summary = pd.DataFrame(summary_data)
        df_summary = df_summary.sort_values("Weighted F1", ascending=False)

        # Create the polar axes directly instead of drawing it on top of a
        # cartesian one: newer Matplotlib no longer removes the overlapped
        # axes, which would leave an empty frame behind the radar chart.
        fig = plt.figure(figsize=(16, 8))
        ax1 = fig.add_subplot(1, 2, 1)
        ax2 = fig.add_subplot(1, 2, 2, projection="polar")

        ax1.axis("tight")
        ax1.axis("off")

        table_data = df_summary.round(3)
        table = ax1.table(
            cellText=table_data.values,
            colLabels=table_data.columns,
            cellLoc="center",
            loc="center",
        )
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.scale(1.2, 2)

        n_cols = len(table_data.columns)
        last_row = len(table_data)

        for col in range(n_cols):
            table[(0, col)].set_facecolor("#40466e")
            table[(0, col)].set_text_props(weight="bold", color="white")

        # Best row in green; worst row in red when it is a different row.
        if last_row >= 1:
            for col in range(n_cols):
                table[(1, col)].set_facecolor("#d4edda")
        if last_row > 1:
            for col in range(n_cols):
                table[(last_row, col)].set_facecolor("#f8d7da")

        ax1.set_title(
            "Model Performance Ranking\n(Sorted by Weighted F1)",
            fontweight="bold",
            fontsize=14,
            pad=20,
        )

        top_3_models = df_summary.head(3)
        categories = [
            "Accuracy",
            "Weighted F1",
            "Negative F1",
            "Neutral F1",
            "Positive F1",
        ]

        # One spoke per category; repeat the first point to close the polygon.
        angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
        angles += angles[:1]

        colors = ["#FF6B6B", "#4ECDC4", "#45B7D1"]

        for color, (_, row) in zip(colors, top_3_models.iterrows()):
            values = [row[category] for category in categories]
            values += values[:1]

            ax2.plot(
                angles,
                values,
                "o-",
                linewidth=2,
                label=row["Model"],
                color=color,
                alpha=0.8,
            )
            ax2.fill(angles, values, alpha=0.15, color=color)

        ax2.set_xticks(angles[:-1])
        ax2.set_xticklabels(categories)
        ax2.set_ylim(0, 1)
        ax2.set_title(
            "Top 3 Models - Performance Radar",
            fontweight="bold",
            fontsize=14,
            pad=30,
        )
        ax2.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))
        ax2.grid(True)

        plt.tight_layout()
        plt.show()

        return fig, df_summary

    def generate_all_visualizations(self):
        """Run every plot in sequence and print a short recommendation.

        Returns:
            The ranking summary DataFrame (empty when no results were loaded).
        """
        print("🎨 Generating Model Comparison Visualizations...")
        print("=" * 60)

        if not self.model_names:
            # Every plot below needs at least one model's results.
            print("No successful model results to visualize.")
            return pd.DataFrame()

        print("\n1. F1 Score Comparison")
        self.create_f1_comparison()

        print("\n2. Confusion Matrices")
        self.create_confusion_matrices()

        print("\n3. Class-wise Performance")
        self.create_class_performance_comparison()

        print("\n4. Error Analysis")
        self.create_error_analysis()

        print("\n5. Model Ranking Summary")
        fig, summary_df = self.create_model_ranking_summary()

        print("\n" + "=" * 60)
        print("📊 FINAL RECOMMENDATIONS")
        print("=" * 60)

        best_model = summary_df.iloc[0]["Model"]
        best_f1 = summary_df.iloc[0]["Weighted F1"]

        print(f"🏆 Best Overall Model: {best_model}")
        print(f"📈 Best F1 Score: {best_f1:.4f}")

        print("\n📝 Key Insights:")
        print(f"• {summary_df.iloc[0]['Model']} shows the best overall performance")
        print(
            f"• Accuracy range: {summary_df['Accuracy'].min():.3f} - {summary_df['Accuracy'].max():.3f}"
        )
        print(
            f"• F1 range: {summary_df['Weighted F1'].min():.3f} - {summary_df['Weighted F1'].max():.3f}"
        )

        # "All models" only holds when even the best Neutral F1 is below the
        # threshold; otherwise just some models struggle.
        neutral_f1 = summary_df["Neutral F1"]
        if neutral_f1.max() < 0.7:
            print("• Neutral class appears challenging for all models")
        elif neutral_f1.min() < 0.7:
            print("• Neutral class appears challenging for some models")

        return summary_df

In [None]:
# Run the visualizations
if __name__ == "__main__":
    # Build the visualizer from the saved result files and render every plot.
    # Missing files get a dedicated hint; any other failure is reported
    # generically so the notebook cell never raises.
    try:
        visualizer = ModelVisualizer()
        summary = visualizer.generate_all_visualizations()
    except FileNotFoundError:
        print(
            "Error: Could not find results files. Make sure model training completed successfully."
        )
        print("Looking for: model_results.json and detailed_results.pkl")
    except Exception as exc:
        print(f"Error creating visualizations: {exc}")

In [None]:
# Hyperparameters for the dedicated RoBERTa run.
roberta_hyperparameters = TrainingArguments(
    # Output, evaluation and logging
    output_dir="./roberta_results",
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=500,
    report_to="none",
    disable_tqdm=False,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Learning-rate schedule
    lr_scheduler_type="cosine",
    warmup_steps=100,
    # Runtime / memory
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=True,
    dataloader_pin_memory=True,
    seed=42,
)

# Re-train roberta-base through the existing ModelTrainer instance, this time
# with the hyperparameters defined above.
print("RE-RUNNING ROBERTA MODEL WITH EXTERNAL HYPERPARAMETERS")
roberta_results = trainer.train_and_evaluate_model(
    model_name="roberta-base",
    training_args=roberta_hyperparameters,
)

In [None]:
# Visualize a single model: normalized confusion matrix plus error rates.
model_name = "roberta-base"
label_names = ["Negative", "Neutral", "Positive"]

try:
    # Load the results from the saved files.
    # NOTE(review): these files are written by ModelTrainer.run_all_models();
    # a later single-model re-run via train_and_evaluate_model() only updates
    # the in-memory results, so this cell shows the run_all_models() numbers -
    # confirm that is the intended source.
    with open("model_results.json", "r") as f:
        results = json.load(f)

    # SECURITY: pickle.load can execute arbitrary code; only load a file
    # produced by your own training run.
    with open("detailed_results.pkl", "rb") as f:
        detailed_results = pickle.load(f)

    if model_name in results and model_name in detailed_results:
        print(f"\n Generating visualizations for the {model_name} model...")

        # --- Confusion Matrix Visualization ---
        fig, ax = plt.subplots(1, 1, figsize=(8, 6))

        true_labels = np.array(results[model_name]["true_labels"])
        pred_labels = np.array(results[model_name]["predictions"])

        # labels= pins the matrix to one row/column per class even when a
        # class never occurs, keeping it aligned with the tick labels.
        class_ids = list(range(len(label_names)))
        cm = confusion_matrix(true_labels, pred_labels, labels=class_ids)

        # Row-normalize; a class with no true samples keeps an all-zero row
        # instead of producing NaN from a division by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm_normalized = np.divide(
            cm.astype("float"),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

        # Plot heatmap
        sns.heatmap(
            cm_normalized,
            annot=True,
            fmt=".2f",
            cmap="Blues",
            xticklabels=label_names,
            yticklabels=label_names,
            ax=ax,
        )

        ax.set_title(
            "Confusion Matrix for RoBERTa-base (Normalized)", fontweight="bold"
        )
        ax.set_ylabel("True Label")
        ax.set_xlabel("Predicted Label")

        plt.tight_layout()
        plt.show()

        # --- Error Rate Calculation and Display ---
        print("\n📈 Error Analysis:")
        # Calculate overall accuracy
        accuracy = (pred_labels == true_labels).mean()
        print(f"• Overall Accuracy: {accuracy:.4f}")

        # Per-class error rate: share of a class's true samples that were
        # predicted as something else. Classes with no samples are skipped.
        per_class_errors = {}
        for class_idx, class_name in enumerate(label_names):
            class_mask = true_labels == class_idx
            if np.any(class_mask):
                class_predictions = pred_labels[class_mask]
                error_rate = (class_predictions != class_idx).mean()
                per_class_errors[class_name] = error_rate

        print("• Per-Class Error Rates:")
        for class_name, rate in per_class_errors.items():
            print(f"  - {class_name}: {rate:.4f}")

    else:
        print(
            f"Error: Results for model '{model_name}' not found. Please ensure the model was trained and the results files exist."
        )

except FileNotFoundError:
    print(
        "Error: Could not find 'model_results.json' or 'detailed_results.pkl'. Please ensure the model training and evaluation step was completed successfully."
    )
except Exception as e:
    print(f"An error occurred: {e}")