In [8]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datetime import datetime
import os
import json

In [9]:
# ✅ Detect MPS
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [10]:
project_root = os.path.abspath("..")

In [11]:
DATASET_PATH = os.path.join(project_root, "datasets/processed_split_toxicity_data.csv")
df = pd.read_csv(DATASET_PATH)
dataset = Dataset.from_pandas(df)

def tokenize_split(example):
    return tokenizer(example["message"], example["context"], padding="max_length", truncation=True, max_length=512)

def tokenize_message_only(example):
    return tokenizer(example["message"], padding="max_length", truncation=True, max_length=512)

tokenize_funcs = {
    "split": tokenize_split,
    "message_only": tokenize_message_only
}

In [12]:
BEST_MODELS_PATH = "best_models.json"
F1_TOLERANCE = 1e-2

def load_best_models():
    if not os.path.exists(BEST_MODELS_PATH):
        return {
            'bert-base-uncased-split': {'f1': 0.0, 'precision': 0.0, "recall": 0.0, "accuracy": 0.0},
            'bert-base-uncased-message_only': {'f1': 0.0, 'precision': 0.0, "recall": 0.0, "accuracy": 0.0},
            'bert-large-uncased-split': {'f1': 0.0, 'precision': 0.0, "recall": 0.0, "accuracy": 0.0},
            'bert-large-uncased-message_only': {'f1': 0.0, 'precision': 0.0, "recall": 0.0, "accuracy": 0.0},
            's-nlp-roberta-toxicity-classifier-split': {'f1': 0.0, 'precision': 0.0, "recall": 0.0, "accuracy": 0.0},
            's-nlp-roberta-toxicity-classifier-message_only': {'f1': 0.0, 'precision': 0.0, "recall": 0.0, "accuracy": 0.0}
        }
    with open(BEST_MODELS_PATH, "r") as f:
        return json.load(f)

In [13]:
def update_best_model(model_name, f1_score, precision, recall, accuracy):
    best_models = load_best_models()
    current_best = best_models.get(model_name, {"f1": 0.0, "precision": 0.0})

    better_f1 = f1_score - F1_TOLERANCE > current_best["f1"]
    similar_f1 = abs(f1_score - current_best["f1"]) <= F1_TOLERANCE
    better_precision = precision > current_best["precision"]

    should_update = better_f1 or (similar_f1 and better_precision)

    if should_update:
        print(f"🎯 New best for {model_name}!")
        print(f"F1: {f1_score:.4f} (prev: {current_best['f1']:.4f}) | Precision: {precision:.4f} (prev: {current_best['precision']:.4f})")
        best_models[model_name] = {
            "f1": f1_score,
            "precision": precision,
            "recall": recall,
            "accuracy": accuracy,
        }
        with open(BEST_MODELS_PATH, "w") as f:
            json.dump(best_models, f, indent=2)
        return True
    else:
        print(f"🧪 {model_name} did not improve:")
        print(f"F1: {f1_score:.4f} (best: {current_best['f1']:.4f}) | Precision: {precision:.4f} (best: {current_best['precision']:.4f})")
        return False

In [14]:
from transformers import EarlyStoppingCallback

results = {}
best_f1 = -1
best_model = None
best_version = ""

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained("s-nlp/roberta_toxicity_classifier")

for version, tokenizer_func in tokenize_funcs.items():
    print(f"\nTraining version: {version}")
    tokenized = dataset.map(tokenizer_func, batched=True)
    tokenized = tokenized.train_test_split(test_size=0.25)
    tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    model_name = f"s-nlp-roberta-toxicity-classifier-{version}"
    model = RobertaForSequenceClassification.from_pretrained("s-nlp/roberta_toxicity_classifier", num_labels=2)
    model.to(device)

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        acc = accuracy_score(labels, preds)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
        return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",             # Evaluate every epoch
        save_strategy="epoch",                   # Save every epoch
        learning_rate=2e-5,                      # From table
        per_device_train_batch_size=16,          # From table
        per_device_eval_batch_size=16,           # From table
        num_train_epochs=10,                      # From table
        weight_decay=0.01,                       # From table
        warmup_ratio=0.0,                        # Table didn't specify, assume 0 unless specified
        lr_scheduler_type="linear",              # From table
        logging_dir="./logs",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        optim="adamw_torch",                     # Explicitly using AdamW
        adam_epsilon=1e-8                        # From table
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()
    eval_result = trainer.evaluate()
    results[version] = eval_result

    if update_best_model(
        model_name=model_name,
        f1_score=eval_result["eval_f1"],
        precision=eval_result["eval_precision"],
        recall=eval_result["eval_recall"],
        accuracy=eval_result["eval_accuracy"]
    ):
        trainer.save_model(f"models/best-{model_name}")

    if eval_result["eval_f1"] > best_f1:
        best_f1 = eval_result["eval_f1"]
        best_model = model
        best_version = version


# Print summary
print("\n--- Summary of Results ---")
for version, result in results.items():
    print(f"{version.upper()}: F1 = {result['eval_f1']:.4f}")

print(f"\nBest model: {best_version.upper()} (F1 = {best_f1:.4f})")


Training version: split


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.571396,0.691406,0.913793,0.417323,0.572973
2,No log,0.538514,0.773438,0.759398,0.795276,0.776923
3,No log,0.545714,0.785156,0.785714,0.779528,0.782609
4,No log,0.732543,0.777344,0.773438,0.779528,0.776471
5,No log,0.800461,0.773438,0.763359,0.787402,0.775194


🧪 s-nlp-roberta-toxicity-classifier-split did not improve:
F1: 0.7826 (best: 0.7935) | Precision: 0.7857 (best: 0.7967)

Training version: message_only


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Some weights of the model checkpoint at s-nlp/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.515206,0.761719,0.892857,0.590551,0.7109
2,No log,0.496944,0.75,0.760331,0.724409,0.741935
3,No log,0.53638,0.765625,0.791304,0.716535,0.752066
4,No log,0.781476,0.746094,0.738462,0.755906,0.747082
5,No log,0.731762,0.757812,0.733813,0.80315,0.766917
6,No log,0.92767,0.78125,0.759124,0.818898,0.787879
7,No log,0.964376,0.78125,0.779528,0.779528,0.779528
8,No log,1.089668,0.777344,0.801724,0.732283,0.765432


🎯 New best for s-nlp-roberta-toxicity-classifier-message_only!
F1: 0.7879 (prev: 0.7729) | Precision: 0.7591 (prev: 0.7823)

--- Summary of Results ---
SPLIT: F1 = 0.7826
MESSAGE_ONLY: F1 = 0.7879

Best model: MESSAGE_ONLY (F1 = 0.7879)
