In [None]:
import os
import json
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from utils import SPECIAL_TOKENS

In [None]:
# Choose the compute backend in priority order: Apple MPS first, then CUDA,
# and fall back to the CPU when neither accelerator is available.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Absolute path of the parent of the current working directory; the dataset
# location below is resolved against it.
project_root = os.path.abspath(os.pardir)

In [None]:
# Location of the processed CSV, resolved against the project root.
DATASET_PATH = os.path.join(project_root, "datasets/processed_dataset.csv")

# Read the CSV into a pandas frame, then wrap that frame in a `datasets.Dataset`
# so it can be mapped, split and formatted later in the notebook.
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)
dataset = Dataset.from_pandas(df=df)

In [None]:
def tokenized(example):
    """Tokenize the ``input_text`` field of ``example``.

    Uses the module-level ``tokenizer``, which must be assigned before this
    function is called. The tokenizer is asked to truncate and to pad with
    ``padding="max_length"`` and ``max_length=512``.

    Args:
        example: Mapping with an ``"input_text"`` entry (a single example or a
            batch, depending on how the caller invokes it).

    Returns:
        Whatever the tokenizer call returns for that input.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(example["input_text"], **encode_options)

In [None]:
# JSON file (relative to the working directory) holding the best metrics per model.
BEST_MODELS_PATH = "best_models.json"
# Margin used by `update_best_model` when comparing a new F1 against the stored one.
F1_TOLERANCE = 1e-2


def load_best_models():
    """Return the parsed contents of ``BEST_MODELS_PATH``.

    Returns:
        The object decoded from the JSON file, or an empty dict when the file
        does not exist.
    """
    if os.path.exists(BEST_MODELS_PATH):
        with open(BEST_MODELS_PATH, "r") as handle:
            return json.load(handle)
    return {}


In [None]:
def update_best_model(model_name, f1_score, precision, recall, accuracy):
    """Record ``model_name``'s metrics in the best-models file if they improve on the stored entry.

    A run counts as an improvement when its F1 exceeds the stored F1 by more
    than ``F1_TOLERANCE``, or when its F1 is within ``F1_TOLERANCE`` of the
    stored F1 and its precision is strictly higher. A model with no stored
    entry (or a stored entry missing a field) is compared against 0.0.

    Args:
        model_name: Key under which the metrics are stored.
        f1_score: F1 of the new run.
        precision: Precision of the new run.
        recall: Recall of the new run (stored, not used for the comparison).
        accuracy: Accuracy of the new run (stored, not used for the comparison).

    Returns:
        True if the entry was written to ``BEST_MODELS_PATH``, False otherwise.
    """
    # Coerce to built-in floats up front: numpy scalar types such as float32 are
    # not JSON serializable and would otherwise fail in the middle of the write.
    f1_score = float(f1_score)
    precision = float(precision)
    recall = float(recall)
    accuracy = float(accuracy)

    best_models = load_best_models()
    current_best = best_models.get(model_name, {})
    # Tolerate stored entries that lack a field instead of raising KeyError.
    best_f1 = current_best.get("f1", 0.0)
    best_precision = current_best.get("precision", 0.0)

    better_f1 = f1_score - F1_TOLERANCE > best_f1
    similar_f1 = abs(f1_score - best_f1) <= F1_TOLERANCE
    better_precision = precision > best_precision

    if better_f1 or (similar_f1 and better_precision):
        print(f"🎯 New best for {model_name}!")
        best_models[model_name] = {
            "f1": f1_score,
            "precision": precision,
            "recall": recall,
            "accuracy": accuracy,
        }
        # Serialize before opening the file: opening with "w" truncates it, so a
        # serialization error after that point would destroy the existing record.
        payload = json.dumps(best_models, indent=2)
        with open(BEST_MODELS_PATH, "w") as f:
            f.write(payload)
        return True

    print(f"🧪 {model_name} did not improve:")
    print(f"F1: {f1_score:.4f} (best: {best_f1:.4f}) | Precision: {precision:.4f} (best: {best_precision:.4f})")
    return False


In [None]:

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class for each row
            is the argmax of ``logits`` over the last axis.

    Returns:
        Dict with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision/recall/F1 use binary averaging, with
        ``zero_division=0`` passed to scikit-learn.
    """
    scores, y_true = eval_pred
    y_pred = np.argmax(scores, axis=-1)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary', zero_division=0
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": prec,
        "recall": rec,
        "f1": f_score,
    }

In [None]:
# Checkpoints to fine-tune and compare.
MODELS = ["roberta-base", "s-nlp/roberta_toxicity_classifier", "roberta-large"]
# Fixed seed for the train/test split so that every candidate model, and every
# re-run of this cell, is trained and scored on the same partition. Without it
# each model gets a different random split and the F1 values are not comparable.
SPLIT_SEED = 42
results = {}
best_f1 = -1
best_model_name = None

for model_id in MODELS:
    # Turn the hub id into a name usable in directory names and as a JSON key.
    model_tag = model_id.replace("/", "-").replace("_", "-")
    model_name = f"{model_tag}-tokenized"
    print(f"\n🚀 Training model: {model_name}")

    # `tokenizer` must stay a module-level name: `tokenized` reads it as a global.
    tokenizer = RobertaTokenizer.from_pretrained(model_id)
    tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})

    tokenized_dataset = dataset.map(tokenized, batched=True)
    tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.25, seed=SPLIT_SEED)
    tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    model = RobertaForSequenceClassification.from_pretrained(model_id, num_labels=2)
    # The special tokens added above enlarge the vocabulary, so the embedding
    # matrix is resized to the tokenizer's new length.
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    training_args = TrainingArguments(
        output_dir=f"./results_{model_name}",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
        warmup_ratio=0.0,
        lr_scheduler_type="linear",
        logging_dir=f"./logs_{model_name}",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        optim="adamw_torch",
        adam_epsilon=1e-8
    )

    # NOTE(review): newer transformers releases rename the `tokenizer` argument
    # to `processing_class` -- confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()
    eval_result = trainer.evaluate()
    results[model_name] = eval_result

    # Persist the weights only when this run beats the record kept on disk.
    if update_best_model(
        model_name=model_name,
        f1_score=eval_result["eval_f1"],
        precision=eval_result["eval_precision"],
        recall=eval_result["eval_recall"],
        accuracy=eval_result["eval_accuracy"]
    ):
        trainer.save_model(f"models/best-{model_name}")

    # Track the best model of this session, independently of the on-disk record.
    if eval_result["eval_f1"] > best_f1:
        best_f1 = eval_result["eval_f1"]
        best_model_name = model_name

# ✅ Summary
# One line per trained model, followed by the best F1 seen in this session.
print("\n--- Summary of Results ---")
for run_name in results:
    run_f1 = results[run_name]['eval_f1']
    print(f"{run_name}: F1 = {run_f1:.4f}")

print(f"\n🏆 Best model: {best_model_name} (F1 = {best_f1:.4f})")