In [1]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
import json

In [2]:
# Pick the compute device: Apple's Metal backend (MPS) when PyTorch reports it
# as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: mps


In [3]:
# Absolute path of the directory one level above the current working directory.
project_root = os.path.abspath(os.pardir)

In [4]:
# CSV path for each dataset version, resolved against the project root.
# Note that "message_only" points at the same CSV as "split"
# (presumably because only its `message` column is tokenized — confirm).
dataset_version = {
    version_name: os.path.join(project_root, relative_csv)
    for version_name, relative_csv in (
        ("joined", "datasets/processed_joined_toxicity_data.csv"),
        ("split", "datasets/processed_split_toxicity_data.csv"),
        ("message_only", "datasets/processed_split_toxicity_data.csv"),
    )
}

# Tokenizer (for BERT large): loads the pretrained "bert-large-uncased" tokenizer.
# The tokenize_* helpers below all encode text through this one shared instance.
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

# Tokenization functions
def tokenize_joined(example):
    """Encode the ``input_text`` field as a single sequence.

    Every encoding is padded to, and truncated at, 512 tokens. Returns whatever
    the shared ``tokenizer`` returns for the given field.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(example["input_text"], **encode_options)

def tokenize_split(example):
    """Encode ``context`` and ``message`` together as a sequence pair.

    ``context`` is passed as the first sequence and ``message`` as the second;
    the pair is padded to, and truncated at, 512 tokens.
    """
    first_segment = example["context"]
    second_segment = example["message"]
    return tokenizer(
        first_segment,
        second_segment,
        padding="max_length",
        truncation=True,
        max_length=512,
    )

def tokenize_message_only(example):
    """Encode only the ``message`` field, ignoring any context.

    Every encoding is padded to, and truncated at, 512 tokens.
    """
    messages = example["message"]
    return tokenizer(messages, padding="max_length", truncation=True, max_length=512)

# Dispatch table: dataset version name -> the function that tokenizes it.
# Insertion order (joined, split, message_only) is the order versions are iterated in.
tokenize_funcs = dict(
    joined=tokenize_joined,
    split=tokenize_split,
    message_only=tokenize_message_only,
)

In [5]:
# JSON file (relative path) that stores the best recorded scores per model.
BEST_MODELS_PATH = "best_models.json"
# F1 scores that differ by at most this much are treated as "similar".
F1_TOLERANCE = 0.01

def load_best_models():
    """Return the best recorded scores per model as ``{name: {"f1", "precision"}}``.

    When ``BEST_MODELS_PATH`` does not exist yet, a fresh table with zeroed
    scores is returned for every known ``<model>-<version>`` combination.
    Otherwise the JSON content of the file is returned as-is.

    The RoBERTa entries use the ``s-nlp/roberta_toxicity_classifier-<version>``
    spelling, which is the spelling found in the persisted leaderboard; the
    previous hyphenated defaults never matched those keys and would only have
    added dead entries to a newly created file.
    """
    if not os.path.exists(BEST_MODELS_PATH):
        base_models = (
            "bert-base-uncased",
            "bert-large-uncased",
            "s-nlp/roberta_toxicity_classifier",
        )
        versions = ("joined", "split", "message_only")
        # A new inner dict per entry so callers can mutate one without touching the others.
        return {
            f"{base}-{version}": {"f1": 0.0, "precision": 0.0}
            for base in base_models
            for version in versions
        }
    with open(BEST_MODELS_PATH, "r", encoding="utf-8") as f:
        return json.load(f)

# Display the currently recorded best scores as this cell's output.
load_best_models()

{'bert-base-uncased-joined': {'f1': 0.7733333333333333,
  'precision': 0.7699115044247787},
 'bert-base-uncased-split': {'f1': 0.7870370370370371,
  'precision': 0.8173076923076923},
 'bert-base-uncased-message_only': {'f1': 0.7420814479638009,
  'precision': 0.7522935779816514},
 'bert-large-uncased-joined': {'f1': 0.7888446215139442,
  'precision': 0.7734375},
 'bert-large-uncased-split': {'f1': 0.7358490566037735, 'precision': 0.78},
 'bert-large-uncased-message_only': {'f1': 0.0, 'precision': 0.0},
 's-nlp/roberta_toxicity_classifier-joined': {'f1': 0.7410071942446043,
  'precision': 0.6688311688311688},
 's-nlp/roberta_toxicity_classifier-split': {'f1': 0.7908745247148289,
  'precision': 0.8387096774193549},
 's-nlp/roberta_toxicity_classifier-message_only': {'f1': 0.7394366197183099,
  'precision': 0.7241379310344828}}

In [6]:
def update_best_model(model_name, f1_score, precision):
    """Record a run's scores if they beat the stored best for ``model_name``.

    A run wins when its F1 exceeds the stored F1 by more than ``F1_TOLERANCE``,
    or when its F1 is within ``F1_TOLERANCE`` of the stored F1 and its precision
    is strictly higher. Models without a stored entry are compared against
    zeroed scores.

    On a win the whole table is rewritten to ``BEST_MODELS_PATH`` and ``True``
    is returned; otherwise nothing is written and ``False`` is returned. Both
    outcomes print a two-line report.
    """
    leaderboard = load_best_models()
    previous = leaderboard.get(model_name, {"f1": 0.0, "precision": 0.0})
    prev_f1 = previous["f1"]
    prev_precision = previous["precision"]

    clearly_better = f1_score - F1_TOLERANCE > prev_f1
    wins_tie_on_precision = (
        abs(f1_score - prev_f1) <= F1_TOLERANCE and precision > prev_precision
    )

    # Guard clause: report and bail out when neither rule is satisfied.
    if not (clearly_better or wins_tie_on_precision):
        print(f"🧪 {model_name} did not improve:")
        print(f"F1: {f1_score:.4f} (best: {prev_f1:.4f}) | Precision: {precision:.4f} (best: {prev_precision:.4f})")
        return False

    print(f"🎯 New best for {model_name}!")
    print(f"F1: {f1_score:.4f} (prev: {prev_f1:.4f}) | Precision: {precision:.4f} (prev: {prev_precision:.4f})")
    leaderboard[model_name] = {"f1": f1_score, "precision": precision}
    with open(BEST_MODELS_PATH, "w") as f:
        json.dump(leaderboard, f, indent=2)
    return True

In [None]:
# Result tracking: evaluation metrics keyed by dataset version, plus the
# best-scoring model seen so far (F1 starts below any achievable score).
results = dict()
best_f1, best_model, best_version = -1, None, ""

# Training loop: fine-tune one BERT-large classifier per dataset version,
# evaluate it, and record the metrics.

# Fixed seed for the train/test split so every run is scored on the same
# held-out 20%; scores are compared against the persisted best scores across runs.
SPLIT_SEED = 42


def compute_metrics(eval_pred):
    """Convert a ``(logits, labels)`` pair into accuracy, precision, recall and F1.

    Predictions are the argmax over the last logits axis. Precision, recall and
    F1 use ``average='binary'``; ``zero_division=0`` reports 0 instead of
    warning when a score is undefined.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


for version, tokenizer_func in tokenize_funcs.items():
    print(f"\n🔄 Training version: {version.upper()}")

    df = pd.read_csv(dataset_version[version])
    dataset = Dataset.from_pandas(df)
    tokenized = dataset.map(tokenizer_func, batched=True)
    tokenized = tokenized.train_test_split(test_size=0.2, seed=SPLIT_SEED)
    # set_format only exposes the listed columns. token_type_ids must be listed,
    # otherwise the sentence-pair ("split") encoding reaches the model without
    # its segment ids and both segments are treated as segment 0.
    tokenized.set_format(
        type="torch",
        columns=["input_ids", "token_type_ids", "attention_mask", "label"],
    )

    model_name = f"bert-large-uncased-{version}"
    model = BertForSequenceClassification.from_pretrained("bert-large-uncased", num_labels=2)
    model.to(device)

    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",             # Evaluate every epoch
        save_strategy="epoch",                   # Save every epoch
        learning_rate=2e-5,                      # From table
        per_device_train_batch_size=16,          # From table
        per_device_eval_batch_size=16,           # From table
        num_train_epochs=5,                      # From table
        weight_decay=0.01,                       # From table
        warmup_ratio=0.0,                        # Table didn't specify, assume 0 unless specified
        lr_scheduler_type="linear",              # From table
        logging_dir="./logs",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        optim="adamw_torch",                     # Explicitly using AdamW
        adam_epsilon=1e-8                        # From table
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        # `processing_class` replaces the deprecated `tokenizer` argument
        # (the truncated warning in this cell's recorded output appears to be
        # that deprecation notice).
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )

    # NOTE(review): a recorded run of the SPLIT version logged an empty
    # validation loss with 0.0 precision/recall/F1 after epoch 1. The cause is
    # not established here — check for non-finite losses before trusting results.
    trainer.train()
    eval_result = trainer.evaluate()
    results[version] = eval_result

    # Persist the weights only when this run beats the stored best for its model name.
    if update_best_model(
        model_name=model_name,
        f1_score=eval_result["eval_f1"],
        precision=eval_result["eval_precision"],
    ):
        trainer.save_model(f"models/best-{model_name}")

    # Track the best version within this notebook run, independent of the stored bests.
    if eval_result["eval_f1"] > best_f1:
        best_f1 = eval_result["eval_f1"]
        best_model = model
        best_version = version

# Final summary: one F1 line per trained version, then the overall winner.
print("\n--- Summary of Results (BERT-LARGE) ---")
for version in results:
    print(f"{version.upper()}: F1 = {results[version]['eval_f1']:.4f}")

print(f"\n✅ Best model: {best_version.upper()} (F1 = {best_f1:.4f})")



🔄 Training version: JOINED


Map:   0%|          | 0/1116 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.686194,0.625,0.675676,0.609756,0.641026
2,No log,0.689851,0.616071,0.690722,0.544715,0.609091
3,No log,0.682356,0.660714,0.72381,0.617886,0.666667
4,No log,0.620164,0.714286,0.725191,0.772358,0.748031
5,No log,0.615807,0.723214,0.740157,0.764228,0.752


🧪 bert-large-uncased-joined did not improve:
F1: 0.7520 (best: 0.7888) | Precision: 0.7402 (best: 0.7734)

🔄 Training version: SPLIT


Map:   0%|          | 0/1116 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,,0.5,0.0,0.0,0.0
