In [1]:
import os
import json
import pandas as pd
import numpy as np
import torch

from datasets import Dataset
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, RobertaTokenizer, RobertaModel
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from custom_models.RobertaWithSpeakerSegmentationCollate import RobertaWithSpeakerSegmentationCollate
from custom_models.RobertaWithSpeakerSegmentationModel import (
    RobertaWithSpeakerSegmentationConfig,
    RobertaWithSpeakerSegmentationForSequenceClassification
)
from utils import SPECIAL_TOKENS

# ✅ Detect device
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ✅ Paths
project_root = os.path.abspath("..")
DATASET_PATH = os.path.join(project_root, "datasets", "processed_dataset.csv")
METADATA_PATH = os.path.join(project_root, "datasets", "metadata.jsonl")
BEST_MODELS_PATH = "best_speaker_segmentation_models.json"

MODELS = ["roberta-base", "s-nlp/roberta_toxicity_classifier"]
F1_TOLERANCE = 1e-2

# ✅ Load dataset
csv_data = pd.read_csv(DATASET_PATH, encoding="utf-8")
with open(METADATA_PATH, "r", encoding="utf-8") as f:
    metadata_data = [json.loads(line.strip()) for line in f]

assert len(csv_data) == len(metadata_data), f"CSV and metadata length mismatch: {len(csv_data)} vs {len(metadata_data)}"

dataset_rows = []
for i in range(len(csv_data)):
    row = {
        "input_text": csv_data.iloc[i]["input_text"],
        "label": int(csv_data.iloc[i]["label"]),
        "metadata": metadata_data[i]
    }
    dataset_rows.append(row)

dataset = Dataset.from_list(dataset_rows)
dataset = dataset.train_test_split(test_size=0.20)

# ✅ Best models tracker
def load_best_models():
    if not os.path.exists(BEST_MODELS_PATH):
        result = {}
        for model_checkpoint in MODELS:
            model_name = f"{model_checkpoint.replace('/', '-')}-speaker-segmentation"
            result[model_name] = {
                "f1": 0.0,
                "precision": 0.0,
                "recall": 0.0,
                "accuracy": 0.0
            }
        with open(BEST_MODELS_PATH, "w") as f:
            json.dump(result, f, indent=2)
        return result
    with open(BEST_MODELS_PATH, "r") as f:
        return json.load(f)

def update_best_model(model_name, f1_score, precision, recall, accuracy):
    best_models = load_best_models()
    current_best = best_models.get(model_name, {"f1": 0.0, "precision": 0.0})
    better_f1 = f1_score - F1_TOLERANCE > current_best["f1"]
    similar_f1 = abs(f1_score - current_best["f1"]) <= F1_TOLERANCE
    better_precision = precision > current_best["precision"]
    should_update = better_f1 or (similar_f1 and better_precision)
    if should_update:
        print(f"🎯 New best for {model_name}!")
        best_models[model_name] = {
            "f1": f1_score,
            "precision": precision,
            "recall": recall,
            "accuracy": accuracy
        }
        with open(BEST_MODELS_PATH, "w") as f:
            json.dump(best_models, f, indent=2)
        return True
    else:
        print(f"🧪 {model_name} did not improve (f1: {f1_score:.4f}, prev: {current_best['f1']:.4f})")
        return False

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# ✅ Training loop
results = {}
best_f1 = -1
best_model_name = None

for model_checkpoint in MODELS:
    model_name = f"{model_checkpoint.replace('/', '-')}-speaker-segmentation"
    print(f"\n🚀 Training model: {model_name}")

    tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)
    special_tokens_dict = {"additional_special_tokens": SPECIAL_TOKENS}
    num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

    config = RobertaWithSpeakerSegmentationConfig.from_pretrained(model_checkpoint)
    config.num_labels = 2

    model = RobertaWithSpeakerSegmentationForSequenceClassification(config)
    base_model = RobertaModel.from_pretrained(model_checkpoint)
    base_state_dict = base_model.state_dict()

    # Transfer weights
    missing, unexpected = model.roberta_with_speaker.load_state_dict(base_state_dict, strict=False)
    print("Missing:", missing)
    print("Unexpected:", unexpected)

    if num_added_tokens > 0:
        print(f"🪄 Resizing token embeddings to {len(tokenizer)} (added {num_added_tokens} tokens)")
        model.resize_token_embeddings(len(tokenizer))

    model.to(device)
    collator = RobertaWithSpeakerSegmentationCollate(tokenizer=tokenizer, max_length=512)

    training_args = TrainingArguments(
        output_dir=f"./results_{model_name}",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
        warmup_ratio=0.0,
        lr_scheduler_type="linear",
        logging_dir=f"./logs_{model_name}",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        optim="adamw_torch",
        adam_epsilon=1e-8,
        remove_unused_columns=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        data_collator=collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_result = trainer.evaluate()
    results[model_name] = eval_result

    if update_best_model(
        model_name=model_name,
        f1_score=eval_result["eval_f1"],
        precision=eval_result["eval_precision"],
        recall=eval_result["eval_recall"],
        accuracy=eval_result["eval_accuracy"]
    ):
        trainer.save_model(f"models/best-{model_name}")

    if eval_result["eval_f1"] > best_f1:
        best_f1 = eval_result["eval_f1"]
        best_model_name = model_name

print("\n--- Summary of Results ---")
for name, result in results.items():
    print(f"{name}: F1 = {result['eval_f1']:.4f}")

print(f"\n🏆 Best model: {best_model_name} (F1 = {best_f1:.4f})")


Using device: mps

🚀 Training model: roberta-base-speaker-segmentation


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Missing: ['embeddings.player_ID_embeddings.weight', 'embeddings.team_ID_embeddings.weight', 'embeddings.message_context_embeddings.weight']
Unexpected: []
🪄 Resizing token embeddings to 50277 (added 12 tokens)


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.689313,0.611111,0.73913,0.435897,0.548387
2,No log,0.660864,0.62963,0.616352,0.837607,0.710145
3,No log,0.683581,0.611111,0.648649,0.615385,0.631579
4,No log,0.649258,0.62037,0.627737,0.735043,0.677165
5,No log,0.659908,0.625,0.704545,0.529915,0.604878
6,No log,0.656615,0.62037,0.710843,0.504274,0.59
7,No log,0.650229,0.615741,0.673469,0.564103,0.613953


KeyboardInterrupt: 