Imports

In [None]:
%pip install -q transformers datasets accelerate evaluate optuna wandb scikit-learn optuna

import os, sys, math, re, random, json, time
import numpy as np
import pandas as pd
import kagglehub
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import AutoTokenizer
import warnings
warnings.filterwarnings("ignore")

# Report the interpreter / PyTorch versions and which accelerators are visible.
print("Python:", sys.version)
print("PyTorch:", torch.__version__)
_cuda_ok = torch.cuda.is_available()
_mps_ok = torch.backends.mps.is_available()
print("CUDA available:", _cuda_ok)
print("MPS available (Apple Silicon):", _mps_ok)

# Device preference order: CUDA first, then Apple-Silicon MPS, otherwise CPU.
DEVICE = torch.device("cuda" if _cuda_ok else ("mps" if _mps_ok else "cpu"))
print("Using device:", DEVICE)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/400.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/247.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hPython: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
PyTorch: 2.8.0+cu126
CUDA available: True
MPS available (Apple Silicon): False
Using device: cuda


**Get data**

In [None]:
# Download the Kaggle dataset, then read the train and test CSVs (latin1-encoded) from it.
path = kagglehub.dataset_download("datatattle/covid-19-nlp-text-classification")
raw_df, test_df = (
    pd.read_csv(os.path.join(path, csv_name), encoding="latin1")
    for csv_name in ("Corona_NLP_train.csv", "Corona_NLP_test.csv")
)

**Clean data and tokenization**

In [None]:
def clean_text(text: str) -> str:
    """
    Minimal tweet cleaning that leaves the remaining wording untouched.

    Removes, in order:
      1. URLs: anything starting with ``http`` and anything starting with ``www.``
         at a word boundary.
      2. Mentions (``@name``).
      3. Hashtags (``#tag``) -- the whole tag is dropped, not just the ``#``.
      4. Runs of whitespace, which are collapsed to one space and trimmed.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned tweet.
    """
    # `\bwww\.` (not a bare `www`) so that ordinary words containing "www",
    # such as "awwww", are not mistaken for a URL and truncated.
    text = re.sub(r"http\S+|\bwww\.\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Sentiment strings mapped to integer class ids, ordered from most negative (0) to most positive (4).
LABEL_MAP = {
    "Extremely Negative": 0,
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Extremely Positive": 4
}

# Drop rows with a missing tweet BEFORE casting to str: astype(str) would turn NaN into
# the literal text "nan", which the dropna further down could never remove.
raw_df = raw_df.dropna(subset=["OriginalTweet"]).copy()
raw_df["CleanedTweet"] = raw_df["OriginalTweet"].astype(str).apply(clean_text)
raw_df["label"] = raw_df["Sentiment"].map(LABEL_MAP)
# Sentiments missing from LABEL_MAP map to NaN (which also makes the column float);
# drop those rows and restore an integer dtype so labels can be used as class indices.
raw_df = raw_df.dropna(subset=["CleanedTweet", "label"]).astype({"label": "int64"})

print("Samples:", len(raw_df))
print(raw_df[["Sentiment", "label"]].head())

from sklearn.model_selection import train_test_split
# Stratified 90/10 split so both parts keep the same class proportions; fixed seed for repeatability.
train_df, val_df = train_test_split(
    raw_df[["CleanedTweet","label"]],
    test_size=0.1,
    random_state=42,
    stratify=raw_df["label"]
)

print("Train:", len(train_df), "Val:", len(val_df))

Samples: 41157
            Sentiment  label
0             Neutral      2
1            Positive      3
2            Positive      3
3            Positive      3
4  Extremely Negative      0
Train: 37041 Val: 4116


In [None]:
# Checkpoint names for the two encoders compared in this notebook.
MODEL_NAME_DISTILBERT, MODEL_NAME_ROBERTA = "distilbert-base-uncased", "roberta-base"

# One tokenizer per checkpoint (DistilBERT first, then RoBERTa).
tokenizer_distilbert, tokenizer_roberta = (
    AutoTokenizer.from_pretrained(checkpoint_name)
    for checkpoint_name in (MODEL_NAME_DISTILBERT, MODEL_NAME_ROBERTA)
)

class CoronaTweetsDataset(Dataset):
    """Map-style dataset that tokenizes every tweet once, at construction time.

    The tokenizer is called on the full list of texts with truncation and padding
    enabled and ``max_length=max_len``; its output is kept in ``self.encodings``.
    Indexing returns a dict with one tensor per tokenizer output field plus a
    ``"labels"`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=max_len,
        )
        self.labels = list(labels)

    def __len__(self):
        # One label per example, so the label count is the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Tokenized train / validation datasets, one pair per tokenizer.
train_ds_distilbert = CoronaTweetsDataset(
    texts=train_df["CleanedTweet"], labels=train_df["label"], tokenizer=tokenizer_distilbert
)
val_ds_distilbert = CoronaTweetsDataset(
    texts=val_df["CleanedTweet"], labels=val_df["label"], tokenizer=tokenizer_distilbert
)
train_ds_roberta = CoronaTweetsDataset(
    texts=train_df["CleanedTweet"], labels=train_df["label"], tokenizer=tokenizer_roberta
)
val_ds_roberta = CoronaTweetsDataset(
    texts=val_df["CleanedTweet"], labels=val_df["label"], tokenizer=tokenizer_roberta
)

# Mini-batch sizes for these loaders; only the training loaders shuffle.
BATCH_TRAIN = 16
BATCH_EVAL  = 32
loader_train_distilbert = DataLoader(dataset=train_ds_distilbert, batch_size=BATCH_TRAIN, shuffle=True)
loader_val_distilbert   = DataLoader(dataset=val_ds_distilbert,   batch_size=BATCH_EVAL,  shuffle=False)
loader_train_roberta    = DataLoader(dataset=train_ds_roberta,    batch_size=BATCH_TRAIN, shuffle=True)
loader_val_roberta      = DataLoader(dataset=val_ds_roberta,      batch_size=BATCH_EVAL,  shuffle=False)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

**Fine Tuning - Full code**

First sweep search space: learning_rate from 1e-5 to 1e-3 (log scale), weight_decay from 1e-6 to 1e-4 (log scale), batch_size 64 or 128.

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.optim import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import optuna
import wandb
import pandas as pd
from torch.utils.data import DataLoader


def compute_metrics_np(y_true, y_pred):
    """Compute accuracy, weighted precision/recall/F1 and a label-based AUC.

    Args:
        y_true: Sequence of ground-truth class ids.
        y_pred: Sequence of predicted class ids (hard labels, not probabilities).

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` and ``auc``.
        ``auc`` is a weighted one-vs-rest ROC AUC computed from one-hot encoded
        hard predictions, so it is only a coarse figure; it is ``nan`` when
        scikit-learn cannot compute it for the given inputs.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    rec = recall_score(y_true, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)
    try:
        # One column per class present in y_true, in sorted order. Building the
        # one-hot matrix against this fixed class list (instead of whichever
        # classes happen to be predicted) keeps the columns aligned with the true
        # classes even when the model never predicts one of them.
        classes = np.unique(y_true)
        one_hot_pred = (y_pred[:, None] == classes[None, :]).astype(float)
        auc = roc_auc_score(y_true, one_hot_pred, average="weighted", multi_class="ovr")
    except Exception:
        # Best effort: AUC is a secondary metric and must not abort an evaluation.
        auc = float("nan")
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc}

def early_stop_check(patience, best_val_accuracy, best_val_accuracy_epoch,
                    current_val_accuracy, current_val_accuracy_epoch):
    """Update the best-accuracy bookkeeping and decide whether to stop training.

    Returns a tuple ``(best_val_accuracy, best_val_accuracy_epoch, early_stop_flag)``.
    A strictly higher accuracy becomes the new best and never stops training.
    Otherwise the previous best is kept and the flag is set once the current
    epoch is at least ``patience`` epochs past the best one.
    """
    if current_val_accuracy > best_val_accuracy:
        return current_val_accuracy, current_val_accuracy_epoch, False

    epochs_since_best = current_val_accuracy_epoch - best_val_accuracy_epoch
    should_stop = True if epochs_since_best >= patience else False
    return best_val_accuracy, best_val_accuracy_epoch, should_stop

def train_model_with_hyperparams(model, train_loader, val_loader, optimizer, criterion, epochs, patience, trial,
                                 checkpoint_path=None):
    """Train ``model`` for up to ``epochs`` epochs and checkpoint the best validation epoch.

    Every epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``val_loader``; the per-epoch metrics are sent to the
    active wandb run with ``wandb.log``.

    Args:
        model: Called as ``model(input_ids, attention_mask=...)``; the result must
            expose a ``logits`` attribute.
        train_loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        val_loader: Same batch format, used for validation.
        optimizer: Optimizer stepped once per training batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        epochs: Maximum number of epochs.
        patience: Forwarded to ``early_stop_check``.
        trial: Object with a ``number`` attribute, used for the default checkpoint name.
        checkpoint_path: Optional destination for the best weights. Defaults to
            ``best_model_trial_{trial.number}.pt``. That default depends only on the
            trial number, so separate studies writing to the same directory overwrite
            each other's files; pass an explicit path to keep them apart.

    Returns:
        The highest validation accuracy observed.
    """
    best_val_accuracy = 0.0
    best_val_accuracy_epoch = 0
    best_model_state = None

    for epoch in range(1, epochs + 1):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        total_train_samples = 0
        correct_train_predictions = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            # Weight each batch loss by its size so the epoch figure is a per-sample average.
            batch_size = input_ids.size(0)
            train_loss += loss.item() * batch_size
            total_train_samples += batch_size
            correct_train_predictions += (logits.argmax(dim=1) == labels).sum().item()

        train_loss /= total_train_samples
        train_accuracy = correct_train_predictions / total_train_samples

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        total_val_samples = 0
        correct_val_predictions = 0
        all_val_labels = []
        all_val_preds = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(DEVICE)
                attention_mask = batch['attention_mask'].to(DEVICE)
                labels = batch['labels'].to(DEVICE)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)
                preds = logits.argmax(dim=1)

                batch_size = input_ids.size(0)
                val_loss += loss.item() * batch_size
                total_val_samples += batch_size
                correct_val_predictions += (preds == labels).sum().item()

                all_val_labels.extend(labels.cpu().tolist())
                all_val_preds.extend(preds.cpu().tolist())

        val_loss /= total_val_samples
        val_accuracy = correct_val_predictions / total_val_samples
        val_precision = precision_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
        val_recall = recall_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
        val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)

        previous_best = best_val_accuracy
        best_val_accuracy, best_val_accuracy_epoch, early_stop_flag = early_stop_check(
            patience, best_val_accuracy, best_val_accuracy_epoch, val_accuracy, epoch
        )

        if val_accuracy > previous_best or best_model_state is None:
            # state_dict() hands back references to the live parameter tensors, which
            # later optimizer steps keep modifying. Take an independent CPU copy so the
            # snapshot really holds this epoch's weights.
            best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }

        wandb.log({
            "Epoch": epoch,
            "Train Loss": train_loss,
            "Train Accuracy": train_accuracy,
            "Validation Loss": val_loss,
            "Validation Accuracy": val_accuracy,
            "Validation Precision": val_precision,
            "Validation Recall": val_recall,
            "Validation F1": val_f1
        })

        if early_stop_flag:
            break

    if best_model_state is not None:
        if checkpoint_path is None:
            checkpoint_path = f"best_model_trial_{trial.number}.pt"
        torch.save(best_model_state, checkpoint_path)

    return best_val_accuracy

def _run_finetuning_trial(trial, *, model_name, tokenizer, get_backbone, get_encoder_layers,
                          architecture, run_prefix, lr_range, weight_decay_range,
                          batch_sizes, epochs):
    """Run one Optuna trial: sample hyperparameters, partially unfreeze, train, return best val accuracy.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        model_name: Checkpoint passed to ``AutoModelForSequenceClassification.from_pretrained``.
        tokenizer: Tokenizer used to build the train/validation datasets.
        get_backbone: Callable returning the pretrained backbone module of the model.
        get_encoder_layers: Callable returning the indexable sequence of encoder layers.
        architecture: Label stored in the wandb run config.
        run_prefix: Prefix of the wandb run name (``{run_prefix}_trial_{n}``).
        lr_range: ``(low, high)`` bounds for the log-uniform learning rate.
        weight_decay_range: ``(low, high)`` bounds for the log-uniform weight decay.
        batch_sizes: Candidate batch sizes.
        epochs: Epoch budget handed to ``train_model_with_hyperparams``.

    Returns:
        Best validation accuracy reported by ``train_model_with_hyperparams``.
    """
    learning_rate = trial.suggest_float("learning_rate", lr_range[0], lr_range[1], log=True)
    weight_decay = trial.suggest_float("weight_decay", weight_decay_range[0], weight_decay_range[1], log=True)
    # NOTE(review): patience is sampled from 7-10, which is never below the epoch
    # budget the callers pass, so early stopping cannot end a trial before its last
    # epoch. Consider a smaller range or a larger epoch budget.
    patience = trial.suggest_int("patience", 7, 10)
    batch_size = trial.suggest_categorical("batch_size", list(batch_sizes))
    num_layers = trial.suggest_int("num_layers", 1, 3)

    train_dataset = CoronaTweetsDataset(train_df["CleanedTweet"], train_df["label"], tokenizer)
    val_dataset = CoronaTweetsDataset(val_df["CleanedTweet"], val_df["label"], tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5).to(DEVICE)

    # Freeze the whole backbone, then re-enable the last `num_layers` encoder layers
    # and the classification head.
    for param in get_backbone(model).parameters():
        param.requires_grad = False

    encoder_layers = get_encoder_layers(model)
    total_layers = len(encoder_layers)
    for layer_idx in range(max(0, total_layers - num_layers), total_layers):
        for param in encoder_layers[layer_idx].parameters():
            param.requires_grad = True

    for param in model.classifier.parameters():
        param.requires_grad = True

    criterion = nn.CrossEntropyLoss()
    # Only trainable parameters are handed to the optimizer; frozen ones never receive gradients.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = Adam(trainable_params, lr=learning_rate, weight_decay=weight_decay)

    wandb.init(project="corona-tweets-finetuning",
               config={
                   "learning_rate": learning_rate,
                   "weight_decay": weight_decay,
                   "patience": patience,
                   "batch_size": batch_size,
                   "num_layers": num_layers,
                   "architecture": architecture,
                   "dataset": "corona-tweets"
               },
               name=f"{run_prefix}_trial_{trial.number}")
    try:
        best_val_accuracy = train_model_with_hyperparams(model, train_loader, val_loader,
                                                        optimizer, criterion, epochs=epochs,
                                                        patience=patience, trial=trial)
    finally:
        # Close the wandb run even when training raises, so the next trial starts a clean run.
        wandb.finish()

    return best_val_accuracy


def objective_distilbert(trial):
    """Optuna objective for DistilBERT: returns the best validation accuracy of one trial."""
    return _run_finetuning_trial(
        trial,
        model_name=MODEL_NAME_DISTILBERT,
        tokenizer=tokenizer_distilbert,
        get_backbone=lambda m: m.distilbert,
        get_encoder_layers=lambda m: m.distilbert.transformer.layer,
        architecture="DistilBERT",
        run_prefix="distilbert",
        lr_range=(1e-5, 1e-3),
        weight_decay_range=(1e-6, 1e-4),
        batch_sizes=(64, 128),
        epochs=7,
    )


def objective_roberta(trial):
    """Optuna objective for RoBERTa: returns the best validation accuracy of one trial."""
    return _run_finetuning_trial(
        trial,
        model_name=MODEL_NAME_ROBERTA,
        tokenizer=tokenizer_roberta,
        get_backbone=lambda m: m.roberta,
        get_encoder_layers=lambda m: m.roberta.encoder.layer,
        architecture="RoBERTa",
        run_prefix="roberta",
        lr_range=(1e-5, 1e-3),
        weight_decay_range=(1e-6, 1e-4),
        batch_sizes=(64, 128),
        epochs=7,
    )

def evaluate_model(model_path, model_name, test_loader):
    """Evaluate a saved checkpoint on a labelled data loader.

    Args:
        model_path: Path to a state dict written with ``torch.save``.
        model_name: Checkpoint name used to rebuild the 5-class model architecture.
        test_loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        The metrics dict produced by ``compute_metrics_np``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
    # Map the checkpoint to CPU while loading so a file written on a GPU machine
    # can still be read on a CPU-only (or differently configured) runtime; the
    # model is moved to DEVICE afterwards.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model = model.to(DEVICE)
    model.eval()

    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=1)

            all_labels.extend(labels.cpu().tolist())
            all_preds.extend(preds.cpu().tolist())

    return compute_metrics_np(all_labels, all_preds)


# Two-trial Optuna search per architecture; each study maximises the value returned by its objective.
N_TRIALS_PER_STUDY = 2

print("\n=== Optimizing DistilBERT ===")
study_distilbert = optuna.create_study(direction="maximize")
study_distilbert.optimize(func=objective_distilbert, n_trials=N_TRIALS_PER_STUDY)

print("\n=== Optimizing RoBERTa ===")
study_roberta = optuna.create_study(direction="maximize")
study_roberta.optimize(func=objective_roberta, n_trials=N_TRIALS_PER_STUDY)

# Summarise the winning configuration and score of each study.
print("\n=== Best Parameters ===")
print("DistilBERT best params:", study_distilbert.best_params)
print("DistilBERT best value:", study_distilbert.best_value)
print("RoBERTa best params:", study_roberta.best_params)
print("RoBERTa best value:", study_roberta.best_value)

[I 2025-08-19 08:37:07,995] A new study created in memory with name: no-name-2e6895a6-c182-4b57-831e-ed58e2042a2c



=== Optimizing DistilBERT ===


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▃▅▆▇█
Train Accuracy,▁▄▅▆▇▇█
Train Loss,█▅▄▄▃▂▁
Validation Accuracy,▁▄▄▇██▇
Validation F1,▁▄▄██▇▇
Validation Loss,█▅▄▁▁▂▅
Validation Precision,▁▅▄█▇▇█
Validation Recall,▁▄▄▇██▇

0,1
Epoch,7.0
Train Accuracy,0.77433
Train Loss,0.56364
Validation Accuracy,0.65938
Validation F1,0.66281
Validation Loss,0.89061
Validation Precision,0.67912
Validation Recall,0.65938


0,1
Epoch,▁▂▃▅▆▇█
Train Accuracy,▁▄▅▆▇▇█
Train Loss,█▅▄▃▂▂▁
Validation Accuracy,▁▄▅▆▇▇█
Validation F1,▁▄▅▆▇▇█
Validation Loss,█▅▄▃▂▁▁
Validation Precision,▁▄▅▆▇▇█
Validation Recall,▁▄▅▆▇▇█

0,1
Epoch,7.0
Train Accuracy,0.75908
Train Loss,0.62388
Validation Accuracy,0.70578
Validation F1,0.70619
Validation Loss,0.76177
Validation Precision,0.70938
Validation Recall,0.70578


[I 2025-08-19 09:02:14,462] Trial 0 finished with value: 0.70578231292517 and parameters: {'learning_rate': 1.6626888576426607e-05, 'weight_decay': 5.126688313334231e-06, 'patience': 7, 'batch_size': 64, 'num_layers': 2}. Best is trial 0 with value: 0.70578231292517.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▃▅▆▇█
Train Accuracy,▁▄▅▆▇▇█
Train Loss,█▆▄▃▃▂▁
Validation Accuracy,▁▄▆▇███
Validation F1,▁▄▆▇███
Validation Loss,█▄▂▁▁▂▃
Validation Precision,▁▄▇▇▇██
Validation Recall,▁▄▆▇███

0,1
Epoch,7.0
Train Accuracy,0.85732
Train Loss,0.37923
Validation Accuracy,0.72376
Validation F1,0.72565
Validation Loss,0.78481
Validation Precision,0.73373
Validation Recall,0.72376


[I 2025-08-19 09:27:17,242] Trial 1 finished with value: 0.7274052478134111 and parameters: {'learning_rate': 4.059868271641817e-05, 'weight_decay': 1.6576622786125564e-06, 'patience': 8, 'batch_size': 64, 'num_layers': 2}. Best is trial 1 with value: 0.7274052478134111.
[I 2025-08-19 09:27:17,243] A new study created in memory with name: no-name-27e13e80-28d6-4176-8cbf-e5a4085b15d7



=== Optimizing RoBERTa ===


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▃▅▆▇█
Train Accuracy,▁▄▅▆▇▇█
Train Loss,█▅▄▃▂▂▁
Validation Accuracy,▁▅▆▆▇▇█
Validation F1,▁▅▆▆▇▇█
Validation Loss,█▅▃▂▁▂▁
Validation Precision,▁▅▆▇█▇█
Validation Recall,▁▅▆▆▇▇█

0,1
Epoch,7.0
Train Accuracy,0.73024
Train Loss,0.67826
Validation Accuracy,0.67201
Validation F1,0.6701
Validation Loss,0.86633
Validation Precision,0.67246
Validation Recall,0.67201


[I 2025-08-19 10:08:23,052] Trial 0 finished with value: 0.6720116618075802 and parameters: {'learning_rate': 3.711473491605903e-05, 'weight_decay': 1.0328558205840956e-06, 'patience': 10, 'batch_size': 128, 'num_layers': 2}. Best is trial 0 with value: 0.6720116618075802.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▃▅▆▇█
Train Accuracy,▁▅▆▇▇▇█
Train Loss,█▄▃▃▂▂▁
Validation Accuracy,▁▄▅▆▇██
Validation F1,▁▄▅▆▇██
Validation Loss,█▆▅▃▂▂▁
Validation Precision,▁▄▅▆▇██
Validation Recall,▁▄▅▆▇██

0,1
Epoch,7.0
Train Accuracy,0.65209
Train Loss,0.86063
Validation Accuracy,0.62536
Validation F1,0.6238
Validation Loss,0.92087
Validation Precision,0.63036
Validation Recall,0.62536


[I 2025-08-19 10:49:21,858] Trial 1 finished with value: 0.6253644314868805 and parameters: {'learning_rate': 1.5096705326354252e-05, 'weight_decay': 1.943190362082204e-06, 'patience': 7, 'batch_size': 128, 'num_layers': 2}. Best is trial 0 with value: 0.6720116618075802.



=== Best Parameters ===
DistilBERT best params: {'learning_rate': 4.059868271641817e-05, 'weight_decay': 1.6576622786125564e-06, 'patience': 8, 'batch_size': 64, 'num_layers': 2}
DistilBERT best value: 0.7274052478134111
RoBERTa best params: {'learning_rate': 3.711473491605903e-05, 'weight_decay': 1.0328558205840956e-06, 'patience': 10, 'batch_size': 128, 'num_layers': 2}
RoBERTa best value: 0.6720116618075802


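Second sweep (narrowed search space, 5 epochs per trial): learning_rate from 2e-4 to 6e-4 for DistilBERT and from 1e-5 to 3e-4 for RoBERTa (log scale), weight_decay from 1e-6 to 1e-5 (log scale), batch_size 128.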

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.optim import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import optuna
import wandb
import pandas as pd
from torch.utils.data import DataLoader


def compute_metrics_np(y_true, y_pred):
    """Compute accuracy, weighted precision/recall/F1 and a label-based AUC.

    Args:
        y_true: Sequence of ground-truth class ids.
        y_pred: Sequence of predicted class ids (hard labels, not probabilities).

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` and ``auc``.
        ``auc`` is a weighted one-vs-rest ROC AUC computed from one-hot encoded
        hard predictions, so it is only a coarse figure; it is ``nan`` when
        scikit-learn cannot compute it for the given inputs.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    rec = recall_score(y_true, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)
    try:
        # One column per class present in y_true, in sorted order. Building the
        # one-hot matrix against this fixed class list (instead of whichever
        # classes happen to be predicted) keeps the columns aligned with the true
        # classes even when the model never predicts one of them.
        classes = np.unique(y_true)
        one_hot_pred = (y_pred[:, None] == classes[None, :]).astype(float)
        auc = roc_auc_score(y_true, one_hot_pred, average="weighted", multi_class="ovr")
    except Exception:
        # Best effort: AUC is a secondary metric and must not abort an evaluation.
        auc = float("nan")
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc}

def early_stop_check(patience, best_val_accuracy, best_val_accuracy_epoch,
                    current_val_accuracy, current_val_accuracy_epoch):
    """Update the best-accuracy bookkeeping and decide whether to stop training.

    Returns a tuple ``(best_val_accuracy, best_val_accuracy_epoch, early_stop_flag)``.
    A strictly higher accuracy becomes the new best and never stops training.
    Otherwise the previous best is kept and the flag is set once the current
    epoch is at least ``patience`` epochs past the best one.
    """
    if current_val_accuracy > best_val_accuracy:
        return current_val_accuracy, current_val_accuracy_epoch, False

    epochs_since_best = current_val_accuracy_epoch - best_val_accuracy_epoch
    should_stop = True if epochs_since_best >= patience else False
    return best_val_accuracy, best_val_accuracy_epoch, should_stop

def train_model_with_hyperparams(model, train_loader, val_loader, optimizer, criterion, epochs, patience, trial,
                                 checkpoint_path=None):
    """Train ``model`` for up to ``epochs`` epochs and checkpoint the best validation epoch.

    Every epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``val_loader``; the per-epoch metrics are sent to the
    active wandb run with ``wandb.log``.

    Args:
        model: Called as ``model(input_ids, attention_mask=...)``; the result must
            expose a ``logits`` attribute.
        train_loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        val_loader: Same batch format, used for validation.
        optimizer: Optimizer stepped once per training batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        epochs: Maximum number of epochs.
        patience: Forwarded to ``early_stop_check``.
        trial: Object with a ``number`` attribute, used for the default checkpoint name.
        checkpoint_path: Optional destination for the best weights. Defaults to
            ``best_model_trial_{trial.number}.pt``. That default depends only on the
            trial number, so separate studies writing to the same directory overwrite
            each other's files; pass an explicit path to keep them apart.

    Returns:
        The highest validation accuracy observed.
    """
    best_val_accuracy = 0.0
    best_val_accuracy_epoch = 0
    best_model_state = None

    for epoch in range(1, epochs + 1):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        total_train_samples = 0
        correct_train_predictions = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            # Weight each batch loss by its size so the epoch figure is a per-sample average.
            batch_size = input_ids.size(0)
            train_loss += loss.item() * batch_size
            total_train_samples += batch_size
            correct_train_predictions += (logits.argmax(dim=1) == labels).sum().item()

        train_loss /= total_train_samples
        train_accuracy = correct_train_predictions / total_train_samples

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        total_val_samples = 0
        correct_val_predictions = 0
        all_val_labels = []
        all_val_preds = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(DEVICE)
                attention_mask = batch['attention_mask'].to(DEVICE)
                labels = batch['labels'].to(DEVICE)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)
                preds = logits.argmax(dim=1)

                batch_size = input_ids.size(0)
                val_loss += loss.item() * batch_size
                total_val_samples += batch_size
                correct_val_predictions += (preds == labels).sum().item()

                all_val_labels.extend(labels.cpu().tolist())
                all_val_preds.extend(preds.cpu().tolist())

        val_loss /= total_val_samples
        val_accuracy = correct_val_predictions / total_val_samples
        val_precision = precision_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
        val_recall = recall_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
        val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)

        previous_best = best_val_accuracy
        best_val_accuracy, best_val_accuracy_epoch, early_stop_flag = early_stop_check(
            patience, best_val_accuracy, best_val_accuracy_epoch, val_accuracy, epoch
        )

        if val_accuracy > previous_best or best_model_state is None:
            # state_dict() hands back references to the live parameter tensors, which
            # later optimizer steps keep modifying. Take an independent CPU copy so the
            # snapshot really holds this epoch's weights.
            best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }

        wandb.log({
            "Epoch": epoch,
            "Train Loss": train_loss,
            "Train Accuracy": train_accuracy,
            "Validation Loss": val_loss,
            "Validation Accuracy": val_accuracy,
            "Validation Precision": val_precision,
            "Validation Recall": val_recall,
            "Validation F1": val_f1
        })

        if early_stop_flag:
            break

    if best_model_state is not None:
        if checkpoint_path is None:
            checkpoint_path = f"best_model_trial_{trial.number}.pt"
        torch.save(best_model_state, checkpoint_path)

    return best_val_accuracy

def _run_finetuning_trial(trial, *, model_name, tokenizer, get_backbone, get_encoder_layers,
                          architecture, run_prefix, lr_range, weight_decay_range,
                          batch_sizes, epochs):
    """Run one Optuna trial: sample hyperparameters, partially unfreeze, train, return best val accuracy.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        model_name: Checkpoint passed to ``AutoModelForSequenceClassification.from_pretrained``.
        tokenizer: Tokenizer used to build the train/validation datasets.
        get_backbone: Callable returning the pretrained backbone module of the model.
        get_encoder_layers: Callable returning the indexable sequence of encoder layers.
        architecture: Label stored in the wandb run config.
        run_prefix: Prefix of the wandb run name (``{run_prefix}_trial_{n}``).
        lr_range: ``(low, high)`` bounds for the log-uniform learning rate.
        weight_decay_range: ``(low, high)`` bounds for the log-uniform weight decay.
        batch_sizes: Candidate batch sizes.
        epochs: Epoch budget handed to ``train_model_with_hyperparams``.

    Returns:
        Best validation accuracy reported by ``train_model_with_hyperparams``.
    """
    learning_rate = trial.suggest_float("learning_rate", lr_range[0], lr_range[1], log=True)
    weight_decay = trial.suggest_float("weight_decay", weight_decay_range[0], weight_decay_range[1], log=True)
    # NOTE(review): patience is sampled from 7-10, which is never below the epoch
    # budget the callers pass, so early stopping cannot end a trial before its last
    # epoch. Consider a smaller range or a larger epoch budget.
    patience = trial.suggest_int("patience", 7, 10)
    batch_size = trial.suggest_categorical("batch_size", list(batch_sizes))
    num_layers = trial.suggest_int("num_layers", 1, 3)

    train_dataset = CoronaTweetsDataset(train_df["CleanedTweet"], train_df["label"], tokenizer)
    val_dataset = CoronaTweetsDataset(val_df["CleanedTweet"], val_df["label"], tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5).to(DEVICE)

    # Freeze the whole backbone, then re-enable the last `num_layers` encoder layers
    # and the classification head.
    for param in get_backbone(model).parameters():
        param.requires_grad = False

    encoder_layers = get_encoder_layers(model)
    total_layers = len(encoder_layers)
    for layer_idx in range(max(0, total_layers - num_layers), total_layers):
        for param in encoder_layers[layer_idx].parameters():
            param.requires_grad = True

    for param in model.classifier.parameters():
        param.requires_grad = True

    criterion = nn.CrossEntropyLoss()
    # Only trainable parameters are handed to the optimizer; frozen ones never receive gradients.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = Adam(trainable_params, lr=learning_rate, weight_decay=weight_decay)

    wandb.init(project="corona-tweets-finetuning",
               config={
                   "learning_rate": learning_rate,
                   "weight_decay": weight_decay,
                   "patience": patience,
                   "batch_size": batch_size,
                   "num_layers": num_layers,
                   "architecture": architecture,
                   "dataset": "corona-tweets"
               },
               name=f"{run_prefix}_trial_{trial.number}")
    try:
        best_val_accuracy = train_model_with_hyperparams(model, train_loader, val_loader,
                                                        optimizer, criterion, epochs=epochs,
                                                        patience=patience, trial=trial)
    finally:
        # Close the wandb run even when training raises, so the next trial starts a clean run.
        wandb.finish()

    return best_val_accuracy


def objective_distilbert(trial):
    """Optuna objective for DistilBERT (narrowed sweep): returns the best validation accuracy of one trial."""
    # Ranges narrowed from the first sweep (lr 1e-5..1e-3, weight decay 1e-6..1e-4, batch size 64/128).
    return _run_finetuning_trial(
        trial,
        model_name=MODEL_NAME_DISTILBERT,
        tokenizer=tokenizer_distilbert,
        get_backbone=lambda m: m.distilbert,
        get_encoder_layers=lambda m: m.distilbert.transformer.layer,
        architecture="DistilBERT",
        run_prefix="distilbert",
        lr_range=(2e-4, 6e-4),
        weight_decay_range=(1e-6, 1e-5),
        batch_sizes=(128,),
        epochs=5,
    )


def objective_roberta(trial):
    """Optuna objective for RoBERTa (narrowed sweep): returns the best validation accuracy of one trial."""
    # Ranges narrowed from the first sweep (lr 1e-5..1e-3, weight decay 1e-6..1e-4, batch size 64/128).
    return _run_finetuning_trial(
        trial,
        model_name=MODEL_NAME_ROBERTA,
        tokenizer=tokenizer_roberta,
        get_backbone=lambda m: m.roberta,
        get_encoder_layers=lambda m: m.roberta.encoder.layer,
        architecture="RoBERTa",
        run_prefix="roberta",
        lr_range=(1e-5, 3e-4),
        weight_decay_range=(1e-6, 1e-5),
        batch_sizes=(128,),
        epochs=5,
    )

def evaluate_model(model_path, model_name, test_loader):
    """Evaluate a saved fine-tuned checkpoint on a labelled data loader.

    Args:
        model_path: Path to a ``state_dict`` file written with ``torch.save``.
        model_name: Hugging Face model id used to rebuild the 5-label
            sequence-classification architecture the weights belong to.
        test_loader: Iterable of batches exposing ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        The metrics dict produced by ``compute_metrics_np``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
    # Deserialize onto the CPU so a checkpoint written from GPU tensors can still
    # be restored on a machine without that device; the model is moved afterwards.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model = model.to(DEVICE)
    model.eval()

    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=1)

            # Labels are only needed host-side for the metrics, so they are not
            # shipped to the accelerator first.
            all_labels.extend(batch['labels'].cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    return compute_metrics_np(all_labels, all_preds)


# Run one in-memory Optuna study per architecture. Each study maximizes the value
# returned by its objective (the best validation accuracy of the trial).
# NOTE(review): no sampler seed is set, so the sampled hyperparameters differ
# between executions of this cell.
print("\n=== Optimizing DistilBERT ===")
study_distilbert = optuna.create_study(direction="maximize")
study_distilbert.optimize(objective_distilbert, n_trials=2)

print("\n=== Optimizing RoBERTa ===")
study_roberta = optuna.create_study(direction="maximize")
study_roberta.optimize(objective_roberta, n_trials=2)

# Summarize the winning trial of each study.
print("\n=== Best Parameters ===")
print("DistilBERT best params:", study_distilbert.best_params)
print("DistilBERT best value:", study_distilbert.best_value)
print("RoBERTa best params:", study_roberta.best_params)
print("RoBERTa best value:", study_roberta.best_value)

[I 2025-08-19 10:49:21,890] A new study created in memory with name: no-name-4113e4e4-a390-4e53-97a2-f6d01a111b7f



=== Optimizing DistilBERT ===


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▄▃▂▁
Validation Accuracy,▁▄▅▇█
Validation F1,▁▄▅▆█
Validation Loss,█▅▄▃▁
Validation Precision,▁▅▇▆█
Validation Recall,▁▄▅▇█

0,1
Epoch,5.0
Train Accuracy,0.70203
Train Loss,0.73968
Validation Accuracy,0.66448
Validation F1,0.66674
Validation Loss,0.83487
Validation Precision,0.68808
Validation Recall,0.66448


[I 2025-08-19 11:04:19,461] Trial 0 finished with value: 0.6644800777453839 and parameters: {'learning_rate': 0.0005843428932035234, 'weight_decay': 3.0231010253454837e-06, 'patience': 7, 'batch_size': 128, 'num_layers': 1}. Best is trial 0 with value: 0.6644800777453839.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▅▃▂▁
Validation Accuracy,▁▂▂▇█
Validation F1,▁▂▃▇█
Validation Loss,█▆█▂▁
Validation Precision,▁▃▄██
Validation Recall,▁▂▂▇█

0,1
Epoch,5.0
Train Accuracy,0.89323
Train Loss,0.29508
Validation Accuracy,0.80442
Validation F1,0.80521
Validation Loss,0.53881
Validation Precision,0.80726
Validation Recall,0.80442


[I 2025-08-19 11:25:10,030] Trial 1 finished with value: 0.8044217687074829 and parameters: {'learning_rate': 0.0003201964668340656, 'weight_decay': 8.193666273888372e-06, 'patience': 7, 'batch_size': 128, 'num_layers': 3}. Best is trial 1 with value: 0.8044217687074829.
[I 2025-08-19 11:25:10,032] A new study created in memory with name: no-name-d031f3f3-7018-41e3-95e9-b24e74a7fe03



=== Optimizing RoBERTa ===


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▅▃▂▁
Validation Accuracy,▁▃▇█▆
Validation F1,▁▃▇█▆
Validation Loss,█▆▂▁▄
Validation Precision,▁▄▇█▇
Validation Recall,▁▃▇█▆

0,1
Epoch,5.0
Train Accuracy,0.75068
Train Loss,0.63684
Validation Accuracy,0.65962
Validation F1,0.65628
Validation Loss,0.87477
Validation Precision,0.66596
Validation Recall,0.65962


[I 2025-08-19 11:54:29,259] Trial 0 finished with value: 0.684645286686103 and parameters: {'learning_rate': 9.11417622219932e-05, 'weight_decay': 1.2730957784267977e-06, 'patience': 9, 'batch_size': 128, 'num_layers': 2}. Best is trial 0 with value: 0.684645286686103.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▄▃▂▁
Validation Accuracy,▁▆▇██
Validation F1,▁▆▇▇█
Validation Loss,█▄▂▂▁
Validation Precision,▁▆▆▇█
Validation Recall,▁▆▇██

0,1
Epoch,5.0
Train Accuracy,0.68084
Train Loss,0.7871
Validation Accuracy,0.64261
Validation F1,0.63939
Validation Loss,0.88786
Validation Precision,0.64852
Validation Recall,0.64261


[I 2025-08-19 12:26:53,988] Trial 1 finished with value: 0.6426141885325559 and parameters: {'learning_rate': 2.0743029121057295e-05, 'weight_decay': 7.150871183616759e-06, 'patience': 9, 'batch_size': 128, 'num_layers': 3}. Best is trial 0 with value: 0.684645286686103.



=== Best Parameters ===
DistilBERT best params: {'learning_rate': 0.0003201964668340656, 'weight_decay': 8.193666273888372e-06, 'patience': 7, 'batch_size': 128, 'num_layers': 3}
DistilBERT best value: 0.8044217687074829
RoBERTa best params: {'learning_rate': 9.11417622219932e-05, 'weight_decay': 1.2730957784267977e-06, 'patience': 9, 'batch_size': 128, 'num_layers': 2}
RoBERTa best value: 0.684645286686103


Search space for the next run: learning_rate 1e-5 to 3e-4 for RoBERTa (2e-4 to 6e-4 for DistilBERT), weight_decay 1e-6 to 1e-5, batch_size fixed at 128.

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.optim import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import optuna
import wandb
import pandas as pd
from torch.utils.data import DataLoader


def compute_metrics_np(y_true, y_pred):
    """Compute accuracy, weighted precision/recall/F1 and a label-based AUC.

    Args:
        y_true: Sequence of gold class labels.
        y_pred: Sequence of predicted class labels (hard predictions, not scores).

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` and ``auc``.
        ``auc`` is NaN when ``roc_auc_score`` cannot be evaluated.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    rec = recall_score(y_true, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)
    try:
        y_true_arr = np.asarray(y_true)
        y_pred_arr = np.asarray(y_pred)
        # One-hot encode over the union of observed labels. Encoding only the
        # predicted labels drops the column of any class that is never predicted,
        # which makes roc_auc_score reject the score matrix.
        classes = np.unique(np.concatenate([y_true_arr, y_pred_arr]))
        one_hot_pred = (y_pred_arr[:, None] == classes[None, :]).astype(float)
        auc = roc_auc_score(y_true_arr, one_hot_pred, average="weighted",
                            multi_class="ovr", labels=classes)
    except Exception:
        # Best effort: AUC is undefined for some label configurations (for
        # example a class with no positive samples); report NaN rather than
        # aborting the evaluation.
        auc = float("nan")
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc}

def early_stop_check(patience, best_val_accuracy, best_val_accuracy_epoch,
                    current_val_accuracy, current_val_accuracy_epoch):
    """Update the best-accuracy bookkeeping and decide whether to stop training.

    A strictly higher accuracy becomes the new best and never stops training.
    Otherwise training stops once the current epoch is at least ``patience``
    epochs past the epoch of the best accuracy.

    Returns:
        Tuple ``(best_val_accuracy, best_val_accuracy_epoch, early_stop_flag)``.
    """
    # Guard clause: an improvement resets the reference point.
    if current_val_accuracy > best_val_accuracy:
        return current_val_accuracy, current_val_accuracy_epoch, False

    epochs_since_best = current_val_accuracy_epoch - best_val_accuracy_epoch
    should_stop = True if epochs_since_best >= patience else False
    return best_val_accuracy, best_val_accuracy_epoch, should_stop

def train_model_with_hyperparams(model, train_loader, val_loader, optimizer, criterion, epochs, patience, trial):
    """Train ``model`` for up to ``epochs`` epochs and save its best validation checkpoint.

    Each epoch runs one pass over ``train_loader``, evaluates on ``val_loader``,
    logs the epoch metrics with ``wandb.log`` and applies ``early_stop_check``.
    The weights of the best-scoring epoch are written to
    ``best_model_trial_{trial.number}.pt``.

    Args:
        model: Sequence-classification model whose forward output exposes ``.logits``.
        train_loader: Batches with ``input_ids``, ``attention_mask`` and ``labels``.
        val_loader: Validation batches with the same keys.
        optimizer: Optimizer already bound to the parameters to update.
        criterion: Loss called as ``criterion(logits, labels)``.
        epochs: Maximum number of epochs.
        patience: Epochs without improvement tolerated before stopping.
        trial: Object exposing ``.number``; only used to name the checkpoint file.

    Returns:
        The best validation accuracy observed.
    """
    best_val_accuracy = 0.0
    best_val_accuracy_epoch = 0
    early_stop_flag = False
    best_model_state = None

    for epoch in range(1, epochs + 1):
        model.train()
        train_loss = 0.0
        total_train_samples = 0
        correct_train_predictions = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            # Weight the batch loss by batch size so the epoch mean is per sample.
            train_loss += loss.item() * input_ids.size(0)
            total_train_samples += input_ids.size(0)
            correct_train_predictions += (logits.argmax(dim=1) == labels).sum().item()

        train_loss /= total_train_samples
        train_accuracy = correct_train_predictions / total_train_samples

        model.eval()
        val_loss = 0.0
        total_val_samples = 0
        correct_val_predictions = 0
        all_val_labels = []
        all_val_preds = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(DEVICE)
                attention_mask = batch['attention_mask'].to(DEVICE)
                labels = batch['labels'].to(DEVICE)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)
                batch_preds = logits.argmax(dim=1)

                val_loss += loss.item() * input_ids.size(0)
                total_val_samples += input_ids.size(0)
                correct_val_predictions += (batch_preds == labels).sum().item()

                all_val_labels.extend(labels.cpu().numpy())
                all_val_preds.extend(batch_preds.cpu().numpy())

        val_loss /= total_val_samples
        val_accuracy = correct_val_predictions / total_val_samples
        val_precision = precision_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
        val_recall = recall_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
        val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)

        best_val_accuracy, best_val_accuracy_epoch, early_stop_flag = early_stop_check(
            patience, best_val_accuracy, best_val_accuracy_epoch, val_accuracy, epoch
        )

        # Snapshot the weights when this epoch became the new best (or when no
        # snapshot exists yet). state_dict() returns references to the live
        # parameter tensors, so they must be copied; otherwise later optimizer
        # steps overwrite the "best" weights and the file ends up holding the
        # last epoch instead.
        if best_val_accuracy_epoch == epoch or best_model_state is None:
            best_model_state = {
                name: tensor.detach().to("cpu", copy=True)
                for name, tensor in model.state_dict().items()
            }

        wandb.log({
            "Epoch": epoch,
            "Train Loss": train_loss,
            "Train Accuracy": train_accuracy,
            "Validation Loss": val_loss,
            "Validation Accuracy": val_accuracy,
            "Validation Precision": val_precision,
            "Validation Recall": val_recall,
            "Validation F1": val_f1
        })

        if early_stop_flag:
            break

    if best_model_state is not None:
        # NOTE(review): the file name depends only on trial.number, so studies
        # that both have a trial 0 write to the same path.
        torch.save(best_model_state, f"best_model_trial_{trial.number}.pt")

    return best_val_accuracy

def objective_distilbert(trial):
    """Optuna objective: fine-tune the top layers of DistilBERT and score the trial.

    Samples the hyperparameters, freezes the ``distilbert`` backbone except its
    last ``num_layers`` transformer layers, trains for up to 5 epochs inside a
    dedicated W&B run and returns the best validation accuracy.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Best validation accuracy reached during the trial.
    """
    # Wider alternative kept for reference: learning_rate 1e-5 to 1e-3.
    learning_rate = trial.suggest_float("learning_rate", 2e-4, 6e-4, log=True)
    # Wider alternative kept for reference: weight_decay 1e-6 to 1e-4.
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-5, log=True)
    # NOTE(review): training below runs for 5 epochs, so a patience of 7-10
    # can never trigger early stopping.
    patience = trial.suggest_int("patience", 7, 10)
    # Alternative kept for reference: batch_size in [64, 128].
    batch_size = trial.suggest_categorical("batch_size", [128])
    num_layers = trial.suggest_int("num_layers", 1, 3)

    train_dataset = CoronaTweetsDataset(train_df["CleanedTweet"], train_df["label"], tokenizer_distilbert)
    val_dataset = CoronaTweetsDataset(val_df["CleanedTweet"], val_df["label"], tokenizer_distilbert)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME_DISTILBERT, num_labels=5).to(DEVICE)

    # Freeze the whole backbone, then re-enable the last `num_layers` layers.
    for param in model.distilbert.parameters():
        param.requires_grad = False

    total_layers = len(model.distilbert.transformer.layer)
    for layer_idx in range(max(0, total_layers - num_layers), total_layers):
        for param in model.distilbert.transformer.layer[layer_idx].parameters():
            param.requires_grad = True

    for param in model.classifier.parameters():
        param.requires_grad = True

    criterion = nn.CrossEntropyLoss()
    # Frozen parameters never receive gradients, so only the trainable ones are
    # handed to the optimizer.
    trainable_params = [param for param in model.parameters() if param.requires_grad]
    optimizer = Adam(trainable_params, lr=learning_rate, weight_decay=weight_decay)

    wandb.init(project="corona-tweets-finetuning",
               config={
                   "learning_rate": learning_rate,
                   "weight_decay": weight_decay,
                   "patience": patience,
                   "batch_size": batch_size,
                   "num_layers": num_layers,
                   "architecture": "DistilBERT",
                   "dataset": "corona-tweets"
               },
               name=f"distilbert_trial_{trial.number}")

    try:
        best_val_accuracy = train_model_with_hyperparams(model, train_loader, val_loader,
                                                        optimizer, criterion, epochs=5,
                                                        patience=patience, trial=trial)
    finally:
        # Close the W&B run even when training raises, so a failed trial does
        # not leave its run open for the next one.
        wandb.finish()

    return best_val_accuracy

def objective_roberta(trial):
    """Optuna objective: fine-tune the top layers of RoBERTa and score the trial.

    Samples the hyperparameters, freezes the ``roberta`` backbone except its
    last ``num_layers`` encoder layers, trains for up to 5 epochs inside a
    dedicated W&B run and returns the best validation accuracy.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Best validation accuracy reached during the trial.
    """
    # Wider alternative kept for reference: learning_rate 1e-5 to 1e-3.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 3e-4, log=True)
    # Wider alternative kept for reference: weight_decay 1e-6 to 1e-4.
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-5, log=True)
    # NOTE(review): training below runs for 5 epochs, so a patience of 7-10
    # can never trigger early stopping.
    patience = trial.suggest_int("patience", 7, 10)
    # Alternative kept for reference: batch_size in [64, 128].
    batch_size = trial.suggest_categorical("batch_size", [128])
    num_layers = trial.suggest_int("num_layers", 1, 3)

    train_dataset = CoronaTweetsDataset(train_df["CleanedTweet"], train_df["label"], tokenizer_roberta)
    val_dataset = CoronaTweetsDataset(val_df["CleanedTweet"], val_df["label"], tokenizer_roberta)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME_ROBERTA, num_labels=5).to(DEVICE)

    # Freeze the whole backbone, then re-enable the last `num_layers` layers.
    for param in model.roberta.parameters():
        param.requires_grad = False

    total_layers = len(model.roberta.encoder.layer)
    for layer_idx in range(max(0, total_layers - num_layers), total_layers):
        for param in model.roberta.encoder.layer[layer_idx].parameters():
            param.requires_grad = True

    for param in model.classifier.parameters():
        param.requires_grad = True

    criterion = nn.CrossEntropyLoss()
    # Frozen parameters never receive gradients, so only the trainable ones are
    # handed to the optimizer.
    trainable_params = [param for param in model.parameters() if param.requires_grad]
    optimizer = Adam(trainable_params, lr=learning_rate, weight_decay=weight_decay)

    wandb.init(project="corona-tweets-finetuning",
               config={
                   "learning_rate": learning_rate,
                   "weight_decay": weight_decay,
                   "patience": patience,
                   "batch_size": batch_size,
                   "num_layers": num_layers,
                   "architecture": "RoBERTa",
                   "dataset": "corona-tweets"
               },
               name=f"roberta_trial_{trial.number}")

    try:
        best_val_accuracy = train_model_with_hyperparams(model, train_loader, val_loader,
                                                        optimizer, criterion, epochs=5,
                                                        patience=patience, trial=trial)
    finally:
        # Close the W&B run even when training raises, so a failed trial does
        # not leave its run open for the next one.
        wandb.finish()

    return best_val_accuracy

def evaluate_model(model_path, model_name, test_loader):
    """Evaluate a saved fine-tuned checkpoint on a labelled data loader.

    Args:
        model_path: Path to a ``state_dict`` file written with ``torch.save``.
        model_name: Hugging Face model id used to rebuild the 5-label
            sequence-classification architecture the weights belong to.
        test_loader: Iterable of batches exposing ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        The metrics dict produced by ``compute_metrics_np``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
    # Deserialize onto the CPU so a checkpoint written from GPU tensors can still
    # be restored on a machine without that device; the model is moved afterwards.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model = model.to(DEVICE)
    model.eval()

    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=1)

            # Labels are only needed host-side for the metrics, so they are not
            # shipped to the accelerator first.
            all_labels.extend(batch['labels'].cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    return compute_metrics_np(all_labels, all_preds)


# Run one in-memory Optuna study per architecture. Each study maximizes the value
# returned by its objective (the best validation accuracy of the trial).
# NOTE(review): no sampler seed is set, so the sampled hyperparameters differ
# between executions of this cell.
print("\n=== Optimizing DistilBERT ===")
study_distilbert = optuna.create_study(direction="maximize")
study_distilbert.optimize(objective_distilbert, n_trials=2)

print("\n=== Optimizing RoBERTa ===")
study_roberta = optuna.create_study(direction="maximize")
study_roberta.optimize(objective_roberta, n_trials=2)

# Summarize the winning trial of each study.
print("\n=== Best Parameters ===")
print("DistilBERT best params:", study_distilbert.best_params)
print("DistilBERT best value:", study_distilbert.best_value)
print("RoBERTa best params:", study_roberta.best_params)
print("RoBERTa best value:", study_roberta.best_value)

[I 2025-08-19 12:26:54,020] A new study created in memory with name: no-name-6704aee6-d4b8-44fe-a170-c158f047bf5d



=== Optimizing DistilBERT ===


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▅▃▂▁
Validation Accuracy,▁▄▆█▇
Validation F1,▁▄▆█▇
Validation Loss,█▄▅▁▂
Validation Precision,▁▅▆█▇
Validation Recall,▁▄▆█▇

0,1
Epoch,5.0
Train Accuracy,0.88073
Train Loss,0.32523
Validation Accuracy,0.78984
Validation F1,0.79017
Validation Loss,0.59623
Validation Precision,0.79385
Validation Recall,0.78984


[I 2025-08-19 12:47:44,673] Trial 0 finished with value: 0.7983479105928085 and parameters: {'learning_rate': 0.0003929999204358724, 'weight_decay': 1.1841474864462186e-06, 'patience': 8, 'batch_size': 128, 'num_layers': 3}. Best is trial 0 with value: 0.7983479105928085.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▄▆▇█
Train Loss,█▅▃▂▁
Validation Accuracy,▁▄▆▆█
Validation F1,▁▄▅▆█
Validation Loss,█▄▂▁▂
Validation Precision,▁▃▅▅█
Validation Recall,▁▄▆▆█

0,1
Epoch,5.0
Train Accuracy,0.7279
Train Loss,0.68086
Validation Accuracy,0.67177
Validation F1,0.67361
Validation Loss,0.84513
Validation Precision,0.67917
Validation Recall,0.67177


[I 2025-08-19 13:02:43,465] Trial 1 finished with value: 0.6717687074829932 and parameters: {'learning_rate': 0.0003405060620436493, 'weight_decay': 5.225647506991129e-06, 'patience': 7, 'batch_size': 128, 'num_layers': 1}. Best is trial 0 with value: 0.7983479105928085.
[I 2025-08-19 13:02:43,467] A new study created in memory with name: no-name-e92c482c-2cfd-4b5f-9854-077105b04440



=== Optimizing RoBERTa ===


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▄▃▂▁
Validation Accuracy,▁▄▅▇█
Validation F1,▁▄▅▇█
Validation Loss,█▅▄▂▁
Validation Precision,▁▄▄▇█
Validation Recall,▁▄▅▇█

0,1
Epoch,5.0
Train Accuracy,0.63694
Train Loss,0.88399
Validation Accuracy,0.62828
Validation F1,0.62812
Validation Loss,0.92019
Validation Precision,0.63814
Validation Recall,0.62828


[I 2025-08-19 13:32:02,205] Trial 0 finished with value: 0.6282798833819242 and parameters: {'learning_rate': 2.0529580054490893e-05, 'weight_decay': 1.0839340397595588e-06, 'patience': 8, 'batch_size': 128, 'num_layers': 2}. Best is trial 0 with value: 0.6282798833819242.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▃▅▆█
Train Accuracy,▁▅▆▇█
Train Loss,█▄▃▂▁
Validation Accuracy,▁▃▅▇█
Validation F1,▁▃▅▇█
Validation Loss,█▆▃▁▁
Validation Precision,▁▄▄▇█
Validation Recall,▁▃▅▇█

0,1
Epoch,5.0
Train Accuracy,0.72431
Train Loss,0.69049
Validation Accuracy,0.69679
Validation F1,0.69754
Validation Loss,0.81372
Validation Precision,0.70586
Validation Recall,0.69679


[I 2025-08-19 14:04:27,375] Trial 1 finished with value: 0.6967930029154519 and parameters: {'learning_rate': 3.284382010232365e-05, 'weight_decay': 4.506428791123357e-06, 'patience': 9, 'batch_size': 128, 'num_layers': 3}. Best is trial 1 with value: 0.6967930029154519.



=== Best Parameters ===
DistilBERT best params: {'learning_rate': 0.0003929999204358724, 'weight_decay': 1.1841474864462186e-06, 'patience': 8, 'batch_size': 128, 'num_layers': 3}
DistilBERT best value: 0.7983479105928085
RoBERTa best params: {'learning_rate': 3.284382010232365e-05, 'weight_decay': 4.506428791123357e-06, 'patience': 9, 'batch_size': 128, 'num_layers': 3}
RoBERTa best value: 0.6967930029154519


Fine-tuning with the Hugging Face Trainer

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate
from scipy.special import softmax

# Hugging Face `evaluate` metric objects, loaded once at module level and used
# by the compute_metrics_* callbacks defined below.
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")


def compute_metrics_enhanced(eval_pred):
    """Trainer ``compute_metrics`` callback returning accuracy, weighted P/R/F1 and AUC.

    Args:
        eval_pred: Pair ``(logits, labels)`` as passed by the Trainer.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` and ``auc``.
        ``auc`` is NaN when it cannot be computed.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    try:
        # Normalize over the same (last) axis that argmax uses above.
        probabilities = softmax(logits, axis=-1)
        auc = roc_auc_score(labels, probabilities, average="weighted", multi_class="ovr")
    except Exception as e:
        print(f"AUC calculation failed: {e}")
        # NaN marks "not available". The previous 0.0 was indistinguishable from
        # a real, worse-than-chance score; NaN also matches compute_metrics_np.
        auc = float("nan")

    return {
        "accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"],
        "precision": precision_metric.compute(predictions=predictions, references=labels, average="weighted")["precision"],
        "recall": recall_metric.compute(predictions=predictions, references=labels, average="weighted")["recall"],
        "f1": f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"],
        "auc": auc
    }

def compute_metrics_basic(eval_pred):
    """Trainer ``compute_metrics`` callback that reports accuracy only.

    Args:
        eval_pred: Pair ``(logits, labels)`` as passed by the Trainer.

    Returns:
        The result of ``accuracy_metric.compute`` for the arg-max predictions.
    """
    logits, labels = eval_pred
    return accuracy_metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )

def train_with_trainer(model_name, train_ds, val_ds, run_name, out_dir, enhanced_metrics=True, use_wandb=True):
    """Fine-tune a 5-label classifier with the Hugging Face ``Trainer``.

    Trains for 5 epochs, evaluating and checkpointing every epoch, and reloads
    the best checkpoint at the end before a final evaluation.

    Args:
        model_name: Hugging Face model id to load.
        train_ds: Training dataset passed to the Trainer.
        val_ds: Evaluation dataset passed to the Trainer.
        run_name: Name used for the W&B run and in the printed summary.
        out_dir: Directory for Trainer checkpoints.
        enhanced_metrics: If True use ``compute_metrics_enhanced``, otherwise
            ``compute_metrics_basic`` (accuracy only).
        use_wandb: If True report to W&B, otherwise disable reporting.

    Returns:
        The fitted ``Trainer``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

    # The best-checkpoint metric must be one the chosen callback actually
    # returns: the basic callback only yields accuracy, so selecting on
    # "eval_f1" there would fail on a missing key.
    if enhanced_metrics:
        compute_metrics_fn = compute_metrics_enhanced
        best_model_metric = "eval_f1"
    else:
        compute_metrics_fn = compute_metrics_basic
        best_model_metric = "eval_accuracy"

    training_args = TrainingArguments(
        output_dir=out_dir,
        num_train_epochs=5,
        eval_strategy="epoch",
        save_strategy="epoch",
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model=best_model_metric,
        greater_is_better=True,
        report_to=["wandb"] if use_wandb else "none",
        run_name=run_name if use_wandb else None,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics_fn
    )

    trainer.train()

    metrics = trainer.evaluate()
    print(f"{run_name} metrics:", metrics)

    return trainer

# Toggle Weights & Biases reporting for both Trainer runs.
USE_WANDB = True

# Both models are trained with the enhanced metric set, so both runs carry the
# "-Enhanced" suffix (the DistilBERT run was previously mislabeled "-Basic").
trainer_distilbert = train_with_trainer(
    MODEL_NAME_DISTILBERT,
    train_ds_distilbert,
    val_ds_distilbert,
    run_name="DistilBERT-Enhanced",
    out_dir="./test_trainer_distilbert",
    enhanced_metrics=True,
    use_wandb=USE_WANDB
)

trainer_roberta = train_with_trainer(
    MODEL_NAME_ROBERTA,
    train_ds_roberta,
    val_ds_roberta,
    run_name="RoBERTa-Enhanced",
    out_dir="./test_trainer_roberta",
    enhanced_metrics=True,
    use_wandb=USE_WANDB
)

# Keep the fine-tuned models in eval mode on DEVICE; the compression cells
# below start from these two objects.
final_distilbert = trainer_distilbert.model.to(DEVICE).eval()
final_roberta = trainer_roberta.model.to(DEVICE).eval()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.5342,0.468491,0.829446,0.833201,0.829446,0.82986,0.968415
2,0.3861,0.44789,0.860301,0.864963,0.860301,0.860477,0.978296
3,0.2647,0.56453,0.867833,0.871581,0.867833,0.867615,0.977884
4,0.1737,0.583421,0.880224,0.881098,0.880224,0.880019,0.982771
5,0.1061,0.656035,0.879738,0.88011,0.879738,0.879582,0.982817


DistilBERT-Basic metrics: {'eval_loss': 0.5834210515022278, 'eval_accuracy': 0.88022351797862, 'eval_precision': 0.8810978028001043, 'eval_recall': 0.88022351797862, 'eval_f1': 0.8800194476766197, 'eval_auc': 0.9827713755062562, 'eval_runtime': 14.8678, 'eval_samples_per_second': 276.839, 'eval_steps_per_second': 34.639, 'epoch': 5.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.8232,0.81361,0.713071,0.74526,0.713071,0.712082,0.913576
2,0.6324,0.662826,0.78207,0.792246,0.78207,0.781805,0.949291
3,0.4912,0.602778,0.828717,0.830122,0.828717,0.828875,0.959332
4,0.3777,0.705106,0.829932,0.833518,0.829932,0.829249,0.966759
5,0.2745,0.645194,0.857143,0.857299,0.857143,0.856818,0.971706


RoBERTa-Enhanced metrics: {'eval_loss': 0.6451942920684814, 'eval_accuracy': 0.8571428571428571, 'eval_precision': 0.8572993014067517, 'eval_recall': 0.8571428571428571, 'eval_f1': 0.8568178719187166, 'eval_auc': 0.9717061636459475, 'eval_runtime': 26.3301, 'eval_samples_per_second': 156.323, 'eval_steps_per_second': 19.559, 'epoch': 5.0}


Model Compression

Quantization

In [None]:
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
from torch.quantization import quantize_dynamic
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification
import copy

In [None]:
def compute_metrics_np(y_true, y_pred):
    """Compute accuracy, weighted precision/recall/F1 and a label-based AUC.

    Args:
        y_true: Sequence of gold class labels.
        y_pred: Sequence of predicted class labels (hard predictions, not scores).

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` and ``auc``.
        ``auc`` is NaN when ``roc_auc_score`` cannot be evaluated.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    rec = recall_score(y_true, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)

    try:
        y_true_arr = np.asarray(y_true)
        y_pred_arr = np.asarray(y_pred)
        # One-hot encode over the union of observed labels. Encoding only the
        # predicted labels drops the column of any class that is never predicted,
        # which makes roc_auc_score reject the score matrix.
        classes = np.unique(np.concatenate([y_true_arr, y_pred_arr]))
        one_hot_pred = (y_pred_arr[:, None] == classes[None, :]).astype(float)
        auc = roc_auc_score(y_true_arr, one_hot_pred, average="weighted",
                            multi_class="ovr", labels=classes)
    except Exception:
        # Best effort: AUC is undefined for some label configurations (for
        # example a class with no positive samples); report NaN rather than
        # aborting the evaluation.
        auc = float("nan")
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc}

# Dynamic int8 quantization of the nn.Linear layers of each fine-tuned model,
# performed on CPU.
# NOTE(review): nn.Module.to() moves the module in place, so final_distilbert /
# final_roberta themselves presumably sit on the CPU after these lines — confirm
# before reusing them on DEVICE.
distilbert_cpu = final_distilbert.to("cpu").eval()
quantized_distilbert = quantize_dynamic(distilbert_cpu, {nn.Linear}, dtype=torch.qint8)
print("DistilBERT quantization complete.")
# NOTE(review): despite the "size" label this is a count of the elements in
# .parameters(), not a byte size; it presumably excludes the quantized Linear
# weights, so it should not be compared with an unquantized parameter count — confirm.
print("Quantized DistilBERT size:", sum(p.numel() for p in quantized_distilbert.parameters()))

roberta_cpu = final_roberta.to("cpu").eval()
quantized_roberta = quantize_dynamic(roberta_cpu, {nn.Linear}, dtype=torch.qint8)
print("RoBERTa quantization complete.")
# Same caveat as above: a parameter-element count, not a size in bytes.
print("Quantized RoBERTa size:", sum(p.numel() for p in quantized_roberta.parameters()))

def eval_model_cpu(model, val_dataset):
    """Score ``model`` on ``val_dataset`` without moving any tensor off the host.

    Batches of 8 are fed in dataset order; arg-max predictions and gold labels
    are collected as plain Python lists and passed to ``compute_metrics_np``.

    Returns:
        The metrics dict produced by ``compute_metrics_np``.
    """
    from torch.utils.data import DataLoader

    model.eval()
    loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
    predicted, expected = [], []

    with torch.no_grad():
        for batch in loader:
            ids, mask, targets = batch["input_ids"], batch["attention_mask"], batch["labels"]
            scores = model(input_ids=ids, attention_mask=mask).logits
            predicted += scores.argmax(dim=1).numpy().tolist()
            expected += targets.numpy().tolist()

    return compute_metrics_np(expected, predicted)

# Score both quantized models on their validation sets with the CPU-only
# evaluation helper, then print the resulting metric dicts.
quantized_distilbert_metrics = eval_model_cpu(quantized_distilbert, val_ds_distilbert)
quantized_roberta_metrics = eval_model_cpu(quantized_roberta, val_ds_roberta)

print("Quantized DistilBERT metrics:", quantized_distilbert_metrics)
print("Quantized RoBERTa metrics:", quantized_roberta_metrics)

DistilBERT quantization complete.
Quantized DistilBERT size: 23854080
RoBERTa quantization complete.
Quantized RoBERTa size: 39037440
Quantized DistilBERT metrics: {'accuracy': 0.8782798833819242, 'precision': 0.8792930250595514, 'recall': 0.8782798833819242, 'f1': 0.8782740698741696, 'auc': np.float64(0.9221471625164832)}
Quantized RoBERTa metrics: {'accuracy': 0.8534985422740525, 'precision': 0.8537252994102956, 'recall': 0.8534985422740525, 'f1': 0.8533234362337822, 'auc': np.float64(0.9065602557377263)}


Pruning

In [None]:
# Global L1 magnitude pruning: work on deep copies so the fine-tuned models stay
# intact, remove 30% of all nn.Linear weights across each model, then make the
# pruning permanent by dropping the re-parametrization.
pruned_distilbert = copy.deepcopy(final_distilbert).to(DEVICE).eval()

distilbert_to_prune = [
    (layer, 'weight')
    for layer in pruned_distilbert.modules()
    if isinstance(layer, nn.Linear)
]

prune.global_unstructured(distilbert_to_prune, pruning_method=prune.L1Unstructured, amount=0.3)

for layer, param_name in distilbert_to_prune:
    prune.remove(layer, param_name)

# Same procedure for RoBERTa.
pruned_roberta = copy.deepcopy(final_roberta).to(DEVICE).eval()

roberta_to_prune = [
    (layer, 'weight')
    for layer in pruned_roberta.modules()
    if isinstance(layer, nn.Linear)
]

prune.global_unstructured(roberta_to_prune, pruning_method=prune.L1Unstructured, amount=0.3)

for layer, param_name in roberta_to_prune:
    prune.remove(layer, param_name)

def eval_model_device(model, val_dataset):
    """Score ``model`` on ``val_dataset`` with every batch moved to ``DEVICE``.

    Batches of 32 are fed in dataset order; arg-max predictions and gold labels
    are brought back to the host as Python lists and passed to
    ``compute_metrics_np``.

    Returns:
        The metrics dict produced by ``compute_metrics_np``.
    """
    model.eval()
    loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    predicted, expected = [], []

    with torch.no_grad():
        for batch in loader:
            ids = batch["input_ids"].to(DEVICE)
            mask = batch["attention_mask"].to(DEVICE)
            targets = batch["labels"].to(DEVICE)

            scores = model(input_ids=ids, attention_mask=mask).logits
            predicted += scores.argmax(dim=1).cpu().numpy().tolist()
            expected += targets.cpu().numpy().tolist()

    return compute_metrics_np(expected, predicted)

# Score both pruned models on their validation sets on DEVICE, then print the
# resulting metric dicts.
pruned_distilbert_metrics = eval_model_device(pruned_distilbert, val_ds_distilbert)
pruned_roberta_metrics = eval_model_device(pruned_roberta, val_ds_roberta)

print("Pruned DistilBERT metrics:", pruned_distilbert_metrics)
print("Pruned RoBERTa metrics:", pruned_roberta_metrics)

Pruned DistilBERT metrics: {'accuracy': 0.8629737609329446, 'precision': 0.8684890643405709, 'recall': 0.8629737609329446, 'f1': 0.8625078237173148, 'auc': np.float64(0.9142168855554794)}
Pruned RoBERTa metrics: {'accuracy': 0.8189990281827016, 'precision': 0.8259752149195877, 'recall': 0.8189990281827016, 'f1': 0.8186497462070864, 'auc': np.float64(0.8851561779569824)}


Knowledge Distillation

In [None]:
from torch.optim import AdamW

# Distillation hyperparameters, reused by the RoBERTa → DistilBERT run further down.
alpha = 0.6          # weight of the student's own supervised loss
temperature = 3.0    # divisor applied to teacher and student logits before softmax
EPOCHS_DISTILL = 4

print("DistilBERT → Even Smaller Student")
# The teacher is frozen in eval mode; only the student is optimized.
teacher_distilbert = final_distilbert.to(DEVICE).eval()
student_tiny = AutoModelForSequenceClassification.from_pretrained(
    "prajjwal1/bert-tiny", num_labels=5
).to(DEVICE)

print(f"Teacher DistilBERT size: {sum(p.numel() for p in teacher_distilbert.parameters()):,}")
print(f"Student Tiny size: {sum(p.numel() for p in student_tiny.parameters()):,}")

opt_tiny = AdamW(student_tiny.parameters(), lr=3e-5)

# Build the loader once and reuse it for every epoch (it reshuffles on each
# iteration) and for the batch count used to average the loss.
tiny_train_loader = DataLoader(train_ds_distilbert, batch_size=16, shuffle=True)

for ep in range(1, EPOCHS_DISTILL + 1):
    student_tiny.train()
    running_loss = 0.0

    for batch in tiny_train_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        # Teacher forward pass needs no gradients.
        with torch.no_grad():
            teacher_logits = teacher_distilbert(input_ids=input_ids, attention_mask=attention_mask).logits

        # Passing labels makes the student return its supervised loss alongside the logits.
        student_output = student_tiny(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        student_logits = student_output.logits
        hard_loss = student_output.loss

        # Soft-target term: KL divergence between temperature-softened
        # distributions, rescaled by temperature**2.
        soft_teacher = F.softmax(teacher_logits / temperature, dim=-1)
        log_soft_student = F.log_softmax(student_logits / temperature, dim=-1)
        distill_loss = F.kl_div(log_soft_student, soft_teacher, reduction="batchmean") * (temperature**2)

        loss = alpha * hard_loss + (1 - alpha) * distill_loss

        opt_tiny.zero_grad()
        loss.backward()
        opt_tiny.step()
        running_loss += loss.item()

    print(f"[DistilBERT→Tiny] Epoch {ep}/{EPOCHS_DISTILL} | loss={running_loss/len(tiny_train_loader):.4f}")


print("\nRoBERTa → DistilBERT Student")
# The teacher is frozen in eval mode; only the student is optimized.
teacher_roberta = final_roberta.to(DEVICE).eval()
student_distilbert = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME_DISTILBERT, num_labels=5
).to(DEVICE)

print(f"Teacher RoBERTa size: {sum(p.numel() for p in teacher_roberta.parameters()):,}")
print(f"Student DistilBERT size: {sum(p.numel() for p in student_distilbert.parameters()):,}")

# Teacher and student use different tokenizers, so each batch of RoBERTa ids
# is decoded back to text and re-encoded for the student.
# NOTE(review): decoding ids is a lossy stand-in for the raw text; if the
# dataset exposes the original strings, tokenizing those would be more
# faithful — confirm.
roberta_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_ROBERTA)
distilbert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_DISTILBERT)

opt_distilbert = AdamW(student_distilbert.parameters(), lr=3e-5)

# Build the loader once; it reshuffles every time it is iterated.
roberta_train_loader = DataLoader(train_ds_roberta, batch_size=8, shuffle=True)

for ep in range(1, EPOCHS_DISTILL + 1):
    student_distilbert.train()
    running_loss = 0.0
    num_batches = 0
    num_skipped = 0

    for batch in roberta_train_loader:
        try:
            roberta_input_ids = batch["input_ids"].to(DEVICE)
            roberta_attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            # Teacher forward pass needs no gradients.
            with torch.no_grad():
                teacher_logits = teacher_roberta(input_ids=roberta_input_ids, attention_mask=roberta_attention_mask).logits

            # Recover one text per row, dropping special tokens such as padding.
            texts = roberta_tokenizer.batch_decode(
                batch["input_ids"].cpu(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

            distilbert_encoding = distilbert_tokenizer(
                texts, truncation=True, padding=True,
                max_length=512, return_tensors="pt"
            ).to(DEVICE)

            # Passing labels makes the student return its supervised loss alongside the logits.
            student_output = student_distilbert(
                input_ids=distilbert_encoding["input_ids"],
                attention_mask=distilbert_encoding["attention_mask"],
                labels=labels
            )
            student_logits = student_output.logits
            hard_loss = student_output.loss

            # Soft-target term: KL divergence between temperature-softened
            # distributions, rescaled by temperature**2.
            soft_teacher = F.softmax(teacher_logits / temperature, dim=-1)
            log_soft_student = F.log_softmax(student_logits / temperature, dim=-1)
            distill_loss = F.kl_div(log_soft_student, soft_teacher, reduction="batchmean") * (temperature**2)

            loss = alpha * hard_loss + (1 - alpha) * distill_loss

            opt_distilbert.zero_grad()
            loss.backward()
            opt_distilbert.step()
            running_loss += loss.item()
            num_batches += 1

        except Exception as e:
            # Best-effort: a failing batch is reported and skipped, but it is
            # also counted so an epoch trained on partial data is visible.
            num_skipped += 1
            print(f"Skipping batch due to error: {e}")
            continue

    if num_batches > 0:
        print(f"[RoBERTa→DistilBERT] Epoch {ep}/{EPOCHS_DISTILL} | loss={running_loss/num_batches:.4f}")
    else:
        print(f"[RoBERTa→DistilBERT] Epoch {ep}/{EPOCHS_DISTILL} | No successful batches")
    if num_skipped:
        # The reported loss is averaged over successful batches only.
        print(f"[RoBERTa→DistilBERT] Epoch {ep}/{EPOCHS_DISTILL} | skipped {num_skipped} of {num_batches + num_skipped} batches")


# Switch both students out of training mode before scoring them.
student_tiny.eval()
student_distilbert.eval()

# Both students are evaluated on the same validation set, val_ds_distilbert.
distilled_tiny_metrics = eval_model_device(
    model=student_tiny,
    val_dataset=val_ds_distilbert,
)
distilled_distilbert_metrics = eval_model_device(
    model=student_distilbert,
    val_dataset=val_ds_distilbert,
)

print("Distilled Tiny (from DistilBERT) metrics:", distilled_tiny_metrics)
print("Distilled DistilBERT (from RoBERTa) metrics:", distilled_distilbert_metrics)

DistilBERT → Even Smaller Student


config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Teacher DistilBERT size: 66,957,317
Student Tiny size: 4,386,565


model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

[DistilBERT→Tiny] Epoch 1/4 | loss=3.6092
[DistilBERT→Tiny] Epoch 2/4 | loss=2.3545
[DistilBERT→Tiny] Epoch 3/4 | loss=1.8993
[DistilBERT→Tiny] Epoch 4/4 | loss=1.6596

RoBERTa → DistilBERT Student


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Teacher RoBERTa size: 124,649,477
Student DistilBERT size: 66,957,317
[RoBERTa→DistilBERT] Epoch 1/4 | loss=1.2581
[RoBERTa→DistilBERT] Epoch 2/4 | loss=0.6048
[RoBERTa→DistilBERT] Epoch 3/4 | loss=0.4446
[RoBERTa→DistilBERT] Epoch 4/4 | loss=0.3445
Distilled Tiny (from DistilBERT) metrics: {'accuracy': 0.761418853255588, 'precision': 0.7657555220294895, 'recall': 0.761418853255588, 'f1': 0.761353191836437, 'auc': np.float64(0.8482655296590735)}
Distilled DistilBERT (from RoBERTa) metrics: {'accuracy': 0.8600583090379009, 'precision': 0.8631345377599475, 'recall': 0.8600583090379009, 'f1': 0.860322797998167, 'auc': np.float64(0.9114143063665232)}
