In [1]:
import os
import json
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    DebertaV2Tokenizer,
    DebertaV2ForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback, DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from utils import SPECIAL_TOKENS

In [2]:
import random


def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

seed_everything(42)

In [3]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [4]:
project_root = os.path.abspath("..")

In [5]:
DATASET_PATH = os.path.join(project_root, "datasets/processed_dataset.csv")
df = pd.read_csv(DATASET_PATH)
dataset = Dataset.from_pandas(df)

In [6]:
def tokenized(example):
    return tokenizer(
        example["input_text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_token_type_ids=True
    )

In [7]:
BEST_MODELS_PATH = "best_models.json"
F1_TOLERANCE = 1e-2

def load_best_models():
    if not os.path.exists(BEST_MODELS_PATH):
        return {}
    with open(BEST_MODELS_PATH, "r") as f:
        return json.load(f)


In [8]:
def update_best_model(model_name, f1_score, precision, recall, accuracy):
    best_models = load_best_models()
    current_best = best_models.get(model_name, {"f1": 0.0, "precision": 0.0})

    better_f1 = f1_score - F1_TOLERANCE > current_best["f1"]
    similar_f1 = abs(f1_score - current_best["f1"]) <= F1_TOLERANCE
    better_precision = precision > current_best["precision"]

    if better_f1 or (similar_f1 and better_precision):
        print(f"🎯 New best for {model_name}!")
        best_models[model_name] = {
            "f1": f1_score,
            "precision": precision,
            "recall": recall,
            "accuracy": accuracy,
        }
        with open(BEST_MODELS_PATH, "w") as f:
            json.dump(best_models, f, indent=2)
        return True
    else:
        print(f"🧪 {model_name} did not improve:")
        print(f"F1: {f1_score:.4f} (best: {current_best['f1']:.4f}) | Precision: {precision:.4f} (best: {current_best['precision']:.4f})")
        return False


In [9]:

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [10]:
MODELS = ["microsoft/deberta-v3-small", "microsoft/deberta-v3-base", "microsoft/deberta-base"]
results = {}
best_f1 = -1
best_model_name = None

for model_id in MODELS:
    model_tag = model_id.replace("/", "-").replace("_", "-")
    model_name = f"{model_tag}-tokenized"
    print(f"\n🚀 Training model: {model_name}")

    tokenizer = DebertaV2Tokenizer.from_pretrained(model_id)
    tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})

    tokenized_dataset = dataset.map(tokenized, batched=True)
    tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.25, seed=42)
    tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "token_type_ids", "label"])

    model = DebertaV2ForSequenceClassification.from_pretrained(model_id, num_labels=2)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    training_args = TrainingArguments(
        output_dir=f"./results_{model_name}",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
        warmup_ratio=0.0,
        lr_scheduler_type="linear",
        logging_dir=f"./logs_{model_name}",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        optim="adamw_torch",
        adam_epsilon=1e-8
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()
    eval_result = trainer.evaluate()
    results[model_name] = eval_result

    if update_best_model(
        model_name=f"{model_name}",
        f1_score=eval_result["eval_f1"],
        precision=eval_result["eval_precision"],
        recall=eval_result["eval_recall"],
        accuracy=eval_result["eval_accuracy"]
    ):
        trainer.save_model(f"models/best-{model_name}")

    if eval_result["eval_f1"] > best_f1:
        best_f1 = eval_result["eval_f1"]
        best_model_name = model_name

# ✅ Summary
print("\n--- Summary of Results ---")
for name, result in results.items():
    print(f"{name}: F1 = {result['eval_f1']:.4f}")

print(f"\n🏆 Best model: {best_model_name} (F1 = {best_f1:.4f})")


🚀 Training model: microsoft-deberta-v3-small-tokenized


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Map:   0%|          | 0/1080 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.587997,0.722222,0.765766,0.634328,0.693878
2,No log,0.590947,0.692593,0.652695,0.813433,0.724252
3,No log,0.534155,0.725926,0.723881,0.723881,0.723881
4,No log,0.569133,0.718519,0.741667,0.664179,0.700787


🎯 New best for microsoft-deberta-v3-small-tokenized!

🚀 Training model: microsoft-deberta-v3-base-tokenized


Map:   0%|          | 0/1080 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.620152,0.714815,0.761468,0.619403,0.683128
2,No log,0.579499,0.707407,0.72,0.671642,0.694981
3,No log,0.640787,0.696296,0.776596,0.544776,0.640351
4,No log,0.541733,0.740741,0.766667,0.686567,0.724409
5,No log,0.556673,0.737037,0.744186,0.716418,0.730038
6,No log,0.551947,0.744444,0.744361,0.738806,0.741573
7,No log,0.582096,0.714815,0.756757,0.626866,0.685714
8,No log,0.574513,0.707407,0.706767,0.701493,0.70412


🎯 New best for microsoft-deberta-v3-base-tokenized!

🚀 Training model: microsoft-deberta-base-tokenized


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DebertaTokenizer'. 
The class this function is called from is 'DebertaV2Tokenizer'.


ImportError: 
 requires the protobuf library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.
