In [22]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
import json

In [23]:
# ✅ Detect MPS
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [24]:
project_root = os.path.abspath("..")
# Load dataset

In [25]:
dataset_version = {
    "joined": os.path.join(project_root, "datasets/processed_joined_toxicity_data.csv"),
    "split": os.path.join(project_root, "datasets/processed_split_toxicity_data.csv"),
    "message_only": os.path.join(project_root, "datasets/processed_split_toxicity_data.csv")
}

# Three tokenizers
def tokenize_joined(example):
    return tokenizer(example["input_text"], padding="max_length", truncation=True, max_length=512)

def tokenize_split(example):
    return tokenizer(example["context"], example["message"], padding="max_length", truncation=True, max_length=512)

def tokenize_message_only(example):
    return tokenizer(example["message"], padding="max_length", truncation=True, max_length=512)

tokenize_funcs = {
    "joined": tokenize_joined,
    "split": tokenize_split,
    "message_only": tokenize_message_only
}

In [26]:
BEST_MODELS_PATH = "best_models.json"
F1_TOLERANCE = 1e-2

def load_best_models():
    if not os.path.exists(BEST_MODELS_PATH):
        return {
            'bert-base-uncased-joined': {'f1': 0.0, 'precision': 0.0},
            'bert-base-uncased-split': {'f1': 0.0, 'precision': 0.0},
            'bert-base-uncased-message_only': {'f1': 0.0, 'precision': 0.0},
            'bert-large-uncased-joined': {'f1': 0.0, 'precision': 0.0},
            'bert-large-uncased-split': {'f1': 0.0, 'precision': 0.0},
            'bert-large-uncased-message_only': {'f1': 0.0, 'precision': 0.0},
            's-nlp-roberta-toxicity-classifier-joined': {'f1': 0.0, 'precision': 0.0},
            's-nlp-roberta-toxicity-classifier-split': {'f1': 0.0, 'precision': 0.0},
            's-nlp-roberta-toxicity-classifier-message_only': {'f1': 0.0, 'precision': 0.0}
        }
    with open(BEST_MODELS_PATH, "r") as f:
        return json.load(f)

load_best_models()

{'bert-base-uncased-joined': {'f1': 0.723618, 'precision': 0.705882},
 'bert-base-uncased-split': {'f1': 0.780952, 'precision': 0.836735},
 'bert-base-uncased-message_only': {'f1': 0.0, 'precision': 0.0},
 'bert-large-uncased-joined': {'f1': 0.0, 'precision': 0.0},
 'bert-large-uncased-split': {'f1': 0.0, 'precision': 0.0},
 'bert-large-uncased-message_only': {'f1': 0.0, 'precision': 0.0},
 's-nlp/roberta_toxicity_classifier-joined': {'f1': 0.0, 'precision': 0.0},
 's-nlp/roberta_toxicity_classifier-split': {'f1': 0.0, 'precision': 0.0},
 's-nlp/roberta_toxicity_classifier-message_only': {'f1': 0.0,
  'precision': 0.0}}

In [27]:
def update_best_model(model_name, f1_score, precision):
    best_models = load_best_models()
    current_best = best_models.get(model_name, {"f1": 0.0, "precision": 0.0})

    better_f1 = f1_score - F1_TOLERANCE > current_best["f1"]
    similar_f1 = abs(f1_score - current_best["f1"]) <= F1_TOLERANCE
    better_precision = precision > current_best["precision"]

    should_update = better_f1 or (similar_f1 and better_precision)

    if should_update:
        print(f"🎯 New best for {model_name}!")
        print(f"F1: {f1_score:.4f} (prev: {current_best['f1']:.4f}) | Precision: {precision:.4f} (prev: {current_best['precision']:.4f})")
        best_models[model_name] = {
            "f1": f1_score,
            "precision": precision,
        }
        with open(BEST_MODELS_PATH, "w") as f:
            json.dump(best_models, f, indent=2)
        return True
    else:
        print(f"🧪 {model_name} did not improve:")
        print(f"F1: {f1_score:.4f} (best: {current_best['f1']:.4f}) | Precision: {precision:.4f} (best: {current_best['precision']:.4f})")
        return False

In [29]:
results = {}
best_f1 = -1
best_model = None
best_version = ""

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

for version, tokenizer_func in tokenize_funcs.items():
    print(f"\nTraining version: {version}")
    df = pd.read_csv(dataset_version[version])  # Must contain 'input_text', 'context', 'message', 'label'
    dataset = Dataset.from_pandas(df)
    tokenized = dataset.map(tokenizer_func, batched=True)
    tokenized = tokenized.train_test_split(test_size=0.2)
    tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    model_name = f"bert-base-uncased-{version}"
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    model.to(device)

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        acc = accuracy_score(labels, preds)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
        return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",             # Evaluate every epoch
        save_strategy="epoch",                   # Save every epoch
        learning_rate=2e-5,                      # From table
        per_device_train_batch_size=16,          # From table
        per_device_eval_batch_size=16,           # From table
        num_train_epochs=5,                      # From table
        weight_decay=0.01,                       # From table
        warmup_ratio=0.0,                        # Table didn't specify, assume 0 unless specified
        lr_scheduler_type="linear",              # From table
        logging_dir="./logs",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        optim="adamw_torch",                     # Explicitly using AdamW
        adam_epsilon=1e-8                        # From table
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_result = trainer.evaluate()
    results[version] = eval_result

    if update_best_model(
        model_name=model_name,
        f1_score=eval_result["eval_f1"],
        precision=eval_result["eval_precision"],
    ):
        trainer.save_model(f"models/best-{model_name}")

    if eval_result["eval_f1"] > best_f1:
        best_f1 = eval_result["eval_f1"]
        best_model = model
        best_version = version

# Print summary
print("\n--- Summary of Results ---")
for version, result in results.items():
    print(f"{version.upper()}: F1 = {result['eval_f1']:.4f}")

print(f"\nBest model: {best_version.upper()} (F1 = {best_f1:.4f})")


Training version: joined


Map:   0%|          | 0/1116 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.655288,0.65625,0.657658,0.651786,0.654709
2,No log,0.557238,0.696429,0.923077,0.428571,0.585366
3,No log,0.518171,0.767857,0.767857,0.767857,0.767857
4,No log,0.521136,0.758929,0.795918,0.696429,0.742857
5,No log,0.488975,0.772321,0.769912,0.776786,0.773333


🎯 New best for bert-base-uncased-joined!
F1: 0.7733 (prev: 0.7236) | Precision: 0.7699 (prev: 0.7059)

Training version: split


Map:   0%|          | 0/1116 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.670258,0.616071,0.577381,0.866071,0.692857
2,No log,0.630874,0.625,0.791667,0.339286,0.475
3,No log,0.55165,0.763393,0.855422,0.633929,0.728205
4,No log,0.511898,0.794643,0.817308,0.758929,0.787037
5,No log,0.49713,0.776786,0.798077,0.741071,0.768519


🎯 New best for bert-base-uncased-split!
F1: 0.7870 (prev: 0.7810) | Precision: 0.8173 (prev: 0.8367)

Training version: message_only


Map:   0%|          | 0/1116 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.605693,0.696429,0.652778,0.839286,0.734375
2,No log,0.55418,0.745536,0.752294,0.732143,0.742081
3,No log,0.574846,0.714286,0.785714,0.589286,0.673469
4,No log,0.570888,0.727679,0.768421,0.651786,0.705314
5,No log,0.578952,0.727679,0.768421,0.651786,0.705314


🎯 New best for bert-base-uncased-message_only!
F1: 0.7421 (prev: 0.0000) | Precision: 0.7523 (prev: 0.0000)

--- Summary of Results ---
JOINED: F1 = 0.7733
SPLIT: F1 = 0.7870
MESSAGE_ONLY: F1 = 0.7421

Best model: SPLIT (F1 = 0.7870)
