In [1]:
import os
import json
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    DebertaV2Tokenizer,
    DebertaV2ForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback, DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from utils import SPECIAL_TOKENS

In [2]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [3]:
# Ensure consistent results
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [4]:
project_root = os.path.abspath("..")

In [5]:
DATASET_PATH = os.path.join(project_root, "datasets/processed_dataset.csv")

df = pd.read_csv(DATASET_PATH)
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=SEED)

# Convert to HF Dataset
dataset = {
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df),
    "test": Dataset.from_pandas(test_df)
}

In [6]:
from collections import Counter

label_counts = {
    "train": Counter(train_df['label']),
    "validation": Counter(val_df['label']),
    "test": Counter(test_df['label'])
}


for name, batch in label_counts.items():
    print(name)
    for label, count in batch.items():
        total = sum(batch.values())
        print(f"Label {label}: {count} samples ({count/total:.2%})")

train
Label 0: 3900 samples (46.41%)
Label 1: 4504 samples (53.59%)
validation
Label 1: 563 samples (53.62%)
Label 0: 487 samples (46.38%)
test
Label 0: 488 samples (46.43%)
Label 1: 563 samples (53.57%)


In [7]:
def tokenized(example):
    return tokenizer(
        example["input_text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_token_type_ids=True
    )

In [8]:
BEST_MODELS_PATH = "best_models.json"
F1_TOLERANCE = 1e-3

def load_best_models():
    if not os.path.exists(BEST_MODELS_PATH):
        return {}
    with open(BEST_MODELS_PATH, "r") as f:
        return json.load(f)


In [9]:
def update_best_model(model_name, f1_score, precision, recall, accuracy):
    best_models = load_best_models()
    current_best = best_models.get(model_name, {"f1": 0.0, "precision": 0.0})

    better_f1 = f1_score - F1_TOLERANCE > current_best["f1"]
    similar_f1 = abs(f1_score - current_best["f1"]) <= F1_TOLERANCE
    better_precision = precision > current_best["precision"]

    if better_f1 or (similar_f1 and better_precision):
        print(f"🎯 New best for {model_name}!")
        best_models[model_name] = {
            "f1": f1_score,
            "precision": precision,
            "recall": recall,
            "accuracy": accuracy,
        }
        with open(BEST_MODELS_PATH, "w") as f:
            json.dump(best_models, f, indent=2)
        return True
    else:
        print(f"🧪 {model_name} did not improve:")
        print(f"F1: {f1_score:.4f} (best: {current_best['f1']:.4f}) | Precision: {precision:.4f} (best: {current_best['precision']:.4f})")
        return False


In [10]:

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [11]:
def get_training_args(model_id, seed=SEED):
    return {
        "microsoft/deberta-v3-small": TrainingArguments(
            output_dir=f"./results_{model_id.replace('/', '-')}",
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            num_train_epochs=10,
            weight_decay=0.01,
            warmup_ratio=0.06,
            logging_dir=f"./logs_{model_id.replace('/', '-')}",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            save_total_limit=1,
            optim="adamw_torch",
            adam_epsilon=1e-8,
            lr_scheduler_type="linear",
            seed=seed,
            fp16=True,
        ),
        "microsoft/deberta-v3-base": TrainingArguments(
            output_dir=f"./results_{model_id.replace('/', '-')}",
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=1e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=8,
            weight_decay=0.01,
            warmup_ratio=0.1,
            logging_dir=f"./logs_{model_id.replace('/', '-')}",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            save_total_limit=1,
            optim="adamw_torch",
            adam_epsilon=1e-8,
            lr_scheduler_type="linear",
            seed=seed,
            fp16=True,
        ),
        "microsoft/deberta-base": TrainingArguments(
            output_dir=f"./results_{model_id.replace('/', '-')}",
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=1e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=10,
            weight_decay=0.01,
            warmup_ratio=0.1,
            logging_dir=f"./logs_{model_id.replace('/', '-')}",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            save_total_limit=1,
            optim="adamw_torch",
            adam_epsilon=1e-8,
            lr_scheduler_type="linear",
            seed=seed,
            fp16=True,
        ),
    }[model_id]

In [12]:
from transformers import DataCollatorWithPadding

MODELS = [
    "microsoft/deberta-v3-small", 
    "microsoft/deberta-v3-base", 
    "microsoft/deberta-base"
]

results = {}
best_f1 = -1
best_model_name = None

for model_id in MODELS:
    model_tag = model_id.replace("/", "-").replace("_", "-")
    model_name = f"{model_tag}-tokenized"
    print(f"\n🚀 Training model: {model_name}")

    tokenizer = DebertaV2Tokenizer.from_pretrained(model_id)
    tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})

    tokenized_dataset = {
        k: v.map(tokenized, batched=True) for k, v in dataset.items()
    }
    for split in tokenized_dataset:
        tokenized_dataset[split].set_format(
            type="torch",
            columns=["input_ids", "attention_mask", "token_type_ids", "label"]
        )

    model = DebertaV2ForSequenceClassification.from_pretrained(model_id, num_labels=2)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    training_args = get_training_args(model_id)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3 if model_id == "microsoft/deberta-v3-base" else 2)]
    )

    trainer.train()
    eval_result = trainer.evaluate(eval_dataset=tokenized_dataset["test"])

    print(eval_result["eval_f1"], eval_result["eval_precision"])
    
    results[model_name] = eval_result

    if update_best_model(
        model_name=f"{model_name}",
        f1_score=eval_result["eval_f1"],
        precision=eval_result["eval_precision"],
        recall=eval_result["eval_recall"],
        accuracy=eval_result["eval_accuracy"]
    ):
        trainer.save_model(f"models/best-{model_name}")

    if eval_result["eval_f1"] > best_f1:
        best_f1 = eval_result["eval_f1"]
        best_model_name = model_name

# ✅ Summary
print("\n--- Summary of Results ---")
for name, result in results.items():
    print(f"{name}: F1 = {result['eval_f1']:.4f}")

print(f"\n🏆 Best model: {best_model_name} (F1 = {best_f1:.4f})")


🚀 Training model: microsoft-deberta-v3-small-tokenized


Map:   0%|          | 0/8404 [00:00<?, ? examples/s]

Map:   0%|          | 0/1050 [00:00<?, ? examples/s]

Map:   0%|          | 0/1051 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = Trainer(


ValueError: fp16 mixed precision requires a GPU (not 'mps').