In [1]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datetime import datetime
import os
import json

In [2]:
# ✅ Detect MPS
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [3]:
project_root = os.path.abspath("..")

In [4]:
# Load dataset
dataset_version = {
    "joined": os.path.join(project_root, "datasets/processed_joined_toxicity_data.csv"),
    "split": os.path.join(project_root, "datasets/processed_split_toxicity_data.csv"),
    "message_only": os.path.join(project_root, "datasets/processed_split_toxicity_data.csv")
}

# Three tokenizers
def tokenize_joined(example):
    return tokenizer(example["input_text"], padding="max_length", truncation=True, max_length=512)

def tokenize_split(example):
    return tokenizer(example["context"], example["message"], padding="max_length", truncation=True, max_length=512)

def tokenize_message_only(example):
    return tokenizer(example["message"], padding="max_length", truncation=True, max_length=512)

tokenize_funcs = {
    "joined": tokenize_joined,
    "split": tokenize_split,
    "message_only": tokenize_message_only
}

In [5]:
BEST_MODELS_PATH = "best_models.json"
F1_TOLERANCE = 1e-2

def load_best_models():
    if not os.path.exists(BEST_MODELS_PATH):
        return {
            'bert-base-uncased-joined': {'f1': 0.0, 'precision': 0.0},
            'bert-base-uncased-split': {'f1': 0.0, 'precision': 0.0},
            'bert-base-uncased-message_only': {'f1': 0.0, 'precision': 0.0},
            'bert-large-uncased-joined': {'f1': 0.0, 'precision': 0.0},
            'bert-large-uncased-split': {'f1': 0.0, 'precision': 0.0},
            'bert-large-uncased-message_only': {'f1': 0.0, 'precision': 0.0},
            's-nlp-roberta-toxicity-classifier-joined': {'f1': 0.0, 'precision': 0.0},
            's-nlp-roberta-toxicity-classifier-split': {'f1': 0.0, 'precision': 0.0},
            's-nlp-roberta-toxicity-classifier-message_only': {'f1': 0.0, 'precision': 0.0}
        }
    with open(BEST_MODELS_PATH, "r") as f:
        return json.load(f)

load_best_models()

{'bert-base-uncased-joined': {'f1': 0.7733333333333333,
  'precision': 0.7699115044247787},
 'bert-base-uncased-split': {'f1': 0.7870370370370371,
  'precision': 0.8173076923076923},
 'bert-base-uncased-message_only': {'f1': 0.7420814479638009,
  'precision': 0.7522935779816514},
 'bert-large-uncased-joined': {'f1': 0.7888446215139442,
  'precision': 0.7734375},
 'bert-large-uncased-split': {'f1': 0.7358490566037735, 'precision': 0.78},
 'bert-large-uncased-message_only': {'f1': 0.0, 'precision': 0.0},
 's-nlp/roberta_toxicity_classifier-joined': {'f1': 0.0, 'precision': 0.0},
 's-nlp/roberta_toxicity_classifier-split': {'f1': 0.0, 'precision': 0.0},
 's-nlp/roberta_toxicity_classifier-message_only': {'f1': 0.0,
  'precision': 0.0}}

In [6]:
def update_best_model(model_name, f1_score, precision):
    best_models = load_best_models()
    current_best = best_models.get(model_name, {"f1": 0.0, "precision": 0.0})

    better_f1 = f1_score - F1_TOLERANCE > current_best["f1"]
    similar_f1 = abs(f1_score - current_best["f1"]) <= F1_TOLERANCE
    better_precision = precision > current_best["precision"]

    should_update = better_f1 or (similar_f1 and better_precision)

    if should_update:
        print(f"🎯 New best for {model_name}!")
        print(f"F1: {f1_score:.4f} (prev: {current_best['f1']:.4f}) | Precision: {precision:.4f} (prev: {current_best['precision']:.4f})")
        best_models[model_name] = {
            "f1": f1_score,
            "precision": precision,
        }
        with open(BEST_MODELS_PATH, "w") as f:
            json.dump(best_models, f, indent=2)
        return True
    else:
        print(f"🧪 {model_name} did not improve:")
        print(f"F1: {f1_score:.4f} (best: {current_best['f1']:.4f}) | Precision: {precision:.4f} (best: {current_best['precision']:.4f})")
        return False

In [7]:
results = {}
best_f1 = -1
best_model = None
best_version = ""

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained("s-nlp/roberta_toxicity_classifier")

for version, tokenizer_func in tokenize_funcs.items():
    print(f"\nTraining version: {version}")
    df = pd.read_csv(dataset_version[version])  # Must contain 'input_text', 'context', 'message', 'label'
    dataset = Dataset.from_pandas(df)
    tokenized = dataset.map(tokenizer_func, batched=True)
    tokenized = tokenized.train_test_split(test_size=0.25)
    tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    model_name = f"s-nlp-roberta-toxicity-classifier-{version}"
    model = RobertaForSequenceClassification.from_pretrained("s-nlp/roberta_toxicity_classifier", num_labels=2)
    model.to(device)

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        acc = accuracy_score(labels, preds)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
        return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",             # Evaluate every epoch
        save_strategy="epoch",                   # Save every epoch
        learning_rate=2e-5,                      # From table
        per_device_train_batch_size=16,          # From table
        per_device_eval_batch_size=16,           # From table
        num_train_epochs=5,                      # From table
        weight_decay=0.01,                       # From table
        warmup_ratio=0.0,                        # Table didn't specify, assume 0 unless specified
        lr_scheduler_type="linear",              # From table
        logging_dir="./logs",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        optim="adamw_torch",                     # Explicitly using AdamW
        adam_epsilon=1e-8                        # From table
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_result = trainer.evaluate()
    results[version] = eval_result

    if update_best_model(
        model_name=model_name,
        f1_score=eval_result["eval_f1"],
        precision=eval_result["eval_precision"],
    ):
        trainer.save_model(f"models/best-{model_name}")

    if eval_result["eval_f1"] > best_f1:
        best_f1 = eval_result["eval_f1"]
        best_model = model
        best_version = version

# Print summary
print("\n--- Summary of Results ---")
for version, result in results.items():
    print(f"{version.upper()}: F1 = {result['eval_f1']:.4f}")

print(f"\nBest model: {best_version.upper()} (F1 = {best_f1:.4f})")


Training version: joined


Map:   0%|          | 0/1116 [00:00<?, ? examples/s]

Some weights of the model checkpoint at s-nlp/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.623082,0.666667,0.582011,0.887097,0.702875
2,No log,0.590743,0.698925,0.633333,0.766129,0.693431
3,No log,0.63875,0.738351,0.673469,0.798387,0.730627
4,No log,0.649872,0.741935,0.668831,0.830645,0.741007
5,No log,0.646317,0.741935,0.675676,0.806452,0.735294


🎯 New best for s-nlp/roberta_toxicity_classifier-joined!
F1: 0.7410 (prev: 0.0000) | Precision: 0.6688 (prev: 0.0000)

Training version: split


Map:   0%|          | 0/1116 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.546193,0.738351,0.8,0.633094,0.706827
2,No log,0.505058,0.777778,0.808,0.726619,0.765152
3,No log,0.512265,0.802867,0.83871,0.748201,0.790875
4,No log,0.631335,0.756272,0.779528,0.71223,0.744361
5,No log,0.66171,0.763441,0.787402,0.719424,0.75188


🎯 New best for s-nlp/roberta_toxicity_classifier-split!
F1: 0.7909 (prev: 0.0000) | Precision: 0.8387 (prev: 0.0000)

Training version: message_only


Map:   0%|          | 0/1116 [00:00<?, ? examples/s]

Some weights of the model checkpoint at s-nlp/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.603617,0.655914,0.731183,0.489209,0.586207
2,No log,0.57347,0.741935,0.813084,0.625899,0.707317
3,No log,0.665454,0.734767,0.724138,0.755396,0.739437
4,No log,0.751208,0.734767,0.755906,0.690647,0.721805
5,No log,0.731646,0.74552,0.769841,0.697842,0.732075


🎯 New best for s-nlp/roberta_toxicity_classifier-message_only!
F1: 0.7394 (prev: 0.0000) | Precision: 0.7241 (prev: 0.0000)

--- Summary of Results ---
JOINED: F1 = 0.7410
SPLIT: F1 = 0.7909
MESSAGE_ONLY: F1 = 0.7394

Best model: SPLIT (F1 = 0.7909)
