## Packages & Imports

In [1]:
!pip install transformers seaborn optuna wandb

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.5.0


In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim
from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.metrics import (
    precision_score, recall_score, f1_score, accuracy_score,
)

import optuna
import wandb

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
os.environ["WANDB_PROJECT"]   = "EX4_roberta-tweeter-EDA_fix_param_full_data"
os.environ["WANDB_WATCH"]     = "gradients"
os.environ["WANDB_LOG_MODEL"] = "end"
wandb.login(key="60786c7e57091e6727aabddb0ba2af1cd90b58be")

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msaarshyovitz[0m ([33msaarshyovitz-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Data

In [4]:
csv_path_train = "train_cleaned.csv"

# Use the Python engine and skip any malformed lines
df_train = pd.read_csv(
    csv_path_train,
    encoding="latin-1",
    engine="python",
    on_bad_lines="skip"    # drop lines with unmatched quotes or other parse errors
)

# Show the first 5 rows
print(df_train.head())

   UserName  ScreenName   Location     TweetAt  \
0      3799       48751     London  16-03-2020   
1      3800       48752         UK  16-03-2020   
2      3801       48753  Vagabonds  16-03-2020   
3      3802       48754        NaN  16-03-2020   
4      3803       48755        NaN  16-03-2020   

                                       OriginalTweet           Sentiment  \
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...             Neutral   
1  advice Talk to your neighbours family to excha...            Positive   
2  Coronavirus Australia: Woolworths to give elde...            Positive   
3  My food stock is not the only one which is emp...            Positive   
4  Me, ready to go at supermarket during the #COV...  Extremely Negative   

                                          clean_text language  
0           @user @user @user http and http and http       en  
1  advice Talk to your neighbours family to excha...       en  
2  Coronavirus Australia: Woolworths to give elde.

In [5]:
# Use the Python engine and skip any malformed lines
csv_path_test = "test_cleaned.csv"
df_test = pd.read_csv(
    csv_path_test,
    encoding="latin-1",
    engine="python",
    on_bad_lines="skip"    # drop lines with unmatched quotes or other parse errors
)

# Show the first 5 rows
print(df_test.head())

   UserName  ScreenName             Location     TweetAt  \
0         1       44953                  NYC  02-03-2020   
1         2       44954          Seattle, WA  02-03-2020   
2         3       44955                  NaN  02-03-2020   
3         4       44956          Chicagoland  02-03-2020   
4         5       44957  Melbourne, Victoria  03-03-2020   

                                       OriginalTweet           Sentiment  \
0  TRENDING: New Yorkers encounter empty supermar...  Extremely Negative   
1  When I couldn't find hand sanitizer at Fred Me...            Positive   
2  Find out how you can protect yourself and love...  Extremely Positive   
3  #Panic buying hits #NewYork City as anxious sh...            Negative   
4  #toiletpaper #dunnypaper #coronavirus #coronav...             Neutral   

                                          clean_text language  
0  TRENDING: New Yorkers encounter empty supermar...       en  
1  When I couldn't find hand sanitizer at Fred Me...  

#### Peek at a single example

In [6]:
from sklearn.model_selection import train_test_split

# Split 10% of df_train into validation set
df_train, df_val = train_test_split(
    df_train,
    test_size=0.1,               # 10% of df_train goes to validation
    random_state=42,             # For reproducibility
    stratify=df_train['Sentiment']   # Preserve class distribution
)

# Save to CSV
df_train.to_csv('train_data.csv', index=False)
df_val.to_csv('val_data.csv', index=False)
df_test.to_csv('test_data.csv', index=False)

print(len(df_train))
print(len(df_val))
print(len(df_test))

29384
3265
3788


## Part 2 – Modeling

### Inspect the base RoBERTa-large architecture

In [7]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

base_model = RobertaForSequenceClassification.from_pretrained(
    "roberta-large", num_labels=5).to(device)
base_model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [8]:
class TweeterDataset(Dataset):
    """Simple Dataset wrapper around a pandas DataFrame."""

    def __init__(self, dataframe: pd.DataFrame, tokenizer):
        self.texts = dataframe["clean_text"].tolist()
        self.labels = dataframe["Sentiment"].tolist()
        self.tokenizer = tokenizer
         # Map label strings → ints *internally* (order is your choice)
        self.label2id = {
            "Extremely Negative": 0,
            "Negative":           1,
            "Neutral":            2,
            "Positive":           3,
            "Extremely Positive": 4,
        }


    def __len__(self):
        return len(self.texts)


    def __getitem__(self, idx):
        text  = self.texts[idx]
        label = self.label2id[self.labels[idx]]     # str ➜ int (0-4)

        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=64,
            return_tensors="pt",
        )
     # squeeze(0) drops the extra batch dim the tokenizer returns
        return {
            "input_ids":      enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels":         torch.tensor(label, dtype=torch.long),
        }

## Train model

### Helper classes & functions

In [None]:
def early_stop_check(patience, best_val_accuracy,best_val_accuracy_epoch, current_val_accuracy,current_val_accuracy_epoch):
    early_stop_flag = False
    if current_val_accuracy > best_val_accuracy:
        best_val_accuracy = current_val_accuracy
        best_val_accuracy_epoch = current_val_accuracy_epoch
    else:
        if current_val_accuracy_epoch - best_val_accuracy_epoch >patience:
            early_stop_flag = True
    return best_val_accuracy, best_val_accuracy_epoch, early_stop_flag

In [None]:
def train_model_with_hyperparams(model, train_loader, val_loader,
                                 optimizer, criterion, epochs, patience, trial):
    best_val_accuracy = 0.0
    best_val_accuracy_epoch = 0
    early_stop_flag = False
    best_model_state = None

    for epoch in range(1, epochs + 1):
        model.train()  # Enable training mode
        train_loss = 0.0
        total_train_samples = 0
        correct_train_predictions = 0

        for batch in train_loader:  # Iterates over the train_loader
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()  # Reset gradients
            outputs = model(input_ids, attention_mask=attention_mask)  # Forward pass
            logits = outputs.logits  # Save the logits (the raw output of the model)
            loss = criterion(logits, labels)  # Calculate loss
            loss.backward()  # Backward pass
            optimizer.step()  # Update weights using the optimizer

            # Accumulate training loss and predictions
            train_loss += loss.item() * input_ids.size(0)
            total_train_samples += input_ids.size(0)
            correct_train_predictions += (logits.argmax(dim=1) == labels).sum().item()

        train_loss /= total_train_samples
        train_accuracy = correct_train_predictions / total_train_samples

        ### Validation loop ###
        model.eval()  # Enable evaluation mode
        val_loss = 0.0
        total_val_samples = 0
        correct_val_predictions = 0
        all_val_labels = []
        all_val_preds = []

        with torch.no_grad():  # Disable gradient computation
            for batch in val_loader:  # iterate on the val_loader's batches
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)

                val_loss += loss.item() * input_ids.size(0)
                total_val_samples += input_ids.size(0)
                correct_val_predictions += (logits.argmax(dim=1) == labels).sum().item()
                all_val_labels.extend(labels.cpu().numpy())
                all_val_preds.extend(logits.argmax(dim=1).cpu().numpy())

        # calculate metrics
        val_loss /= total_val_samples
        val_accuracy = correct_val_predictions / total_val_samples
        val_precision = precision_score(all_val_labels, all_val_preds, average='macro')
        val_recall = recall_score(all_val_labels, all_val_preds, average='macro')
        val_f1 = f1_score(all_val_labels, all_val_preds, average='macro')

        # Check for early stopping
        best_val_accuracy, best_val_accuracy_epoch, early_stop_flag = early_stop_check(
            patience, best_val_accuracy, best_val_accuracy_epoch,
            val_accuracy, epoch
        )

        # Save the best model under the best_model_state parameter
        if val_accuracy == best_val_accuracy:
            best_model_state = model.state_dict()

        # Log metrics to Weights & Biases
        wandb.log({
            "Epoch": epoch,
            "Train Loss": train_loss,
            "Train Accuracy": train_accuracy,
            "Validation Loss": val_loss,
            "Validation Accuracy": val_accuracy,
            "Validation Precision": val_precision,
            "Validation Recall": val_recall,
            "Validation F1": val_f1
        })

        if early_stop_flag:
            break  # Exits the training loop immediately if early stopping is triggered

    # Save the best model to file
    if best_model_state is not None:
        torch.save(best_model_state, f"best_model_trial_{trial.number}.pt")


    return best_val_accuracy

In [None]:
import optuna
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import wandb


def objective(trial: optuna.trial.Trial) -> float:
    # -------------------------
    # 1. Hyper-parameter search space
    # -------------------------
    learning_rate = trial.suggest_float("learning_rate", 0.000135, 0.00016, log=True)
    weight_decay  = trial.suggest_float("weight_decay", 0.00013, 0.000155, log=True)
    patience      = trial.suggest_int("patience", 4, 6)
    batch_size    = trial.suggest_categorical("batch_size", [32, 64, 128, 256])
    num_layers    = trial.suggest_int("num_layers", 3, 3)  # how many final encoder layers to unfreeze

    train_dataset = TweeterDataset(df_train,
                                 RobertaTokenizer.from_pretrained('roberta-large')) # Create the TweeterDataset object
    val_dataset = TweeterDataset(df_val,
                               RobertaTokenizer.from_pretrained('roberta-large')) # Create the TweeterDataset object

    train_loader = DataLoader(train_dataset, batch_size=batch_size,shuffle=True) # insert into a DataLoader
    val_loader = DataLoader(val_dataset, batch_size=batch_size,shuffle=False) # insert into a DataLoader


    # -------------------------
    # 3. Model initialisation
    # -------------------------
    model = RobertaForSequenceClassification.from_pretrained(
        "roberta-large",num_labels=5,).to(device)

    # Freeze everything …
    for param in model.roberta.parameters():
        param.requires_grad = False
    # … then un-freeze the last `num_layers` encoder blocks …
    for param in model.roberta.encoder.layer[-num_layers:].parameters():
        param.requires_grad = True
    # … and always train the classifier head
    for param in model.classifier.parameters():
        param.requires_grad = True

    # -------------------------
    # 4. Optimiser & loss
    # -------------------------
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                           weight_decay=weight_decay)

    # -------------------------
    # 5. Experiment tracking
    # -------------------------
    wandb.init(
        project="EX4_roberta-tweeter-EDA_fix_param_full_data",
        config={
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "patience": patience,
            "batch_size": batch_size,
            "num_layers": num_layers,
            "architecture": "RoBERTa",
            "dataset": "tweeter"},
            name=f"trial_{trial.number}") # The name that will be saved inthe W&B platform


    # -------------------------
    # 6. Training loop (your function should implement early-stopping & return best val acc)
    # -------------------------
    best_val_accuracy = train_model_with_hyperparams(
        model,
        train_loader,
        val_loader,
        optimizer,
        criterion,
        epochs=12,
        patience=patience,
        trial=trial)

    # -------------------------
    # 7. Finish logging & return metric to maximise
    # -------------------------
    wandb.finish()
    return best_val_accuracy

In [None]:
# Optuna Study
study = optuna.create_study(direction="maximize") # Specifies that the goal of the optimization is to maximize the objective function
study.optimize(objective, n_trials=8)

[I 2025-08-12 09:30:56,690] A new study created in memory with name: no-name-04288e58-454c-48d9-9e0c-480497fb5a1c


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▂▃▄▄▅▅▆▇▇█
Train Accuracy,▁▄▄▅▆▆▇▇▇███
Train Loss,█▆▅▄▃▃▂▂▂▁▁▁
Validation Accuracy,▁▄▅▇▇▆██▇███
Validation F1,▁▄▆▇▇▇██▇███
Validation Loss,▅▃▂▁▁▃▂▅█▆██
Validation Precision,▁▃▅▆▇▆████▆█
Validation Recall,▁▄▅▇▆▆▇▆▇▆█▇

0,1
Epoch,12.0
Train Accuracy,0.91645
Train Loss,0.23026
Validation Accuracy,0.6928
Validation F1,0.69992
Validation Loss,1.13954
Validation Precision,0.71886
Validation Recall,0.68921


[I 2025-08-12 09:52:37,697] Trial 0 finished with value: 0.6940275650842267 and parameters: {'learning_rate': 0.00015905115167279662, 'weight_decay': 0.0001391784747773659, 'patience': 4, 'batch_size': 128, 'num_layers': 3}. Best is trial 0 with value: 0.6940275650842267.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▂▃▄▄▅▅▆▇▇█
Train Accuracy,▁▃▄▅▆▆▇▇▇███
Train Loss,█▆▅▄▄▃▂▂▂▁▁▁
Validation Accuracy,▁▂▅▆█▆▇█▇▇██
Validation F1,▁▂▅▆█▆▇█▇▇██
Validation Loss,▄▃▁▁▁▂▄▅▅▆▇█
Validation Precision,▁▃▅▆▇▆██▇▇██
Validation Recall,▁▃▅▇█▇▆▇▆▇██

0,1
Epoch,12.0
Train Accuracy,0.93204
Train Loss,0.18929
Validation Accuracy,0.69005
Validation F1,0.69902
Validation Loss,1.25244
Validation Precision,0.70938
Validation Recall,0.69542


[I 2025-08-12 10:13:26,077] Trial 1 finished with value: 0.6915773353751914 and parameters: {'learning_rate': 0.0001426427217317309, 'weight_decay': 0.000142441682546486, 'patience': 5, 'batch_size': 256, 'num_layers': 3}. Best is trial 0 with value: 0.6940275650842267.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▂▃▄▄▅▅▆▇▇█
Train Accuracy,▁▄▅▅▆▆▆▇▇▇██
Train Loss,█▅▅▄▄▃▃▂▂▂▁▁
Validation Accuracy,▂▁▅▅▆▆▇▇▇▇██
Validation F1,▂▁▅▆▆▇▇▇████
Validation Loss,▇█▄▃▂▂▂▂▁▁▂▂
Validation Precision,▁▁▄▅▆▅▅▆▇▇▇█
Validation Recall,▁▁▅▅▆▇▇█▇███

0,1
Epoch,12.0
Train Accuracy,0.81514
Train Loss,0.51086
Validation Accuracy,0.66983
Validation F1,0.6786
Validation Loss,0.94533
Validation Precision,0.69416
Validation Recall,0.67013


[I 2025-08-12 10:37:25,810] Trial 2 finished with value: 0.6698315467075038 and parameters: {'learning_rate': 0.00015658833550671436, 'weight_decay': 0.00014088542216155703, 'patience': 4, 'batch_size': 32, 'num_layers': 3}. Best is trial 0 with value: 0.6940275650842267.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▂▃▄▄▅▅▆▇▇█
Train Accuracy,▁▃▄▅▆▆▇▇▇███
Train Loss,█▆▅▄▄▃▃▂▂▂▁▁
Validation Accuracy,▁▃▄▅▇▇▆█▇█▇█
Validation F1,▁▃▄▅▆▇▆▇▇█▆▇
Validation Loss,▅▃▂▃▁▃▅▅▆▆▇█
Validation Precision,▁▃▂▃▆▅▄█▆▇▆█
Validation Recall,▁▄▆█▇▇█▆▇█▇▇

0,1
Epoch,12.0
Train Accuracy,0.90866
Train Loss,0.25204
Validation Accuracy,0.6827
Validation F1,0.68848
Validation Loss,1.19015
Validation Precision,0.73043
Validation Recall,0.66884


[I 2025-08-12 11:00:08,506] Trial 3 finished with value: 0.6875957120980092 and parameters: {'learning_rate': 0.00015754199674523995, 'weight_decay': 0.0001465729806229313, 'patience': 4, 'batch_size': 64, 'num_layers': 3}. Best is trial 0 with value: 0.6940275650842267.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▂▃▄▄▅▅▆▇▇█
Train Accuracy,▁▃▄▅▆▆▇▇▇███
Train Loss,█▆▅▄▄▃▂▂▂▁▁▁
Validation Accuracy,▁▅▇▇█▇▇██▇██
Validation F1,▁▅▇▇█▇▇██▇██
Validation Loss,▆▂▁▂▁▃▃▄▄▆▆█
Validation Precision,▁▇▇▇█▇█▇█▇█▇
Validation Recall,▁▃▆▇▇▇▆▇██▇█

0,1
Epoch,12.0
Train Accuracy,0.91264
Train Loss,0.24468
Validation Accuracy,0.68484
Validation F1,0.69692
Validation Loss,1.24072
Validation Precision,0.69413
Validation Recall,0.70468


[I 2025-08-12 11:22:52,582] Trial 4 finished with value: 0.6977029096477795 and parameters: {'learning_rate': 0.000150074556786172, 'weight_decay': 0.00014199183916444673, 'patience': 5, 'batch_size': 64, 'num_layers': 3}. Best is trial 4 with value: 0.6977029096477795.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▂▃▄▄▅▅▆▇▇█
Train Accuracy,▁▃▄▅▆▆▇▇▇███
Train Loss,█▆▅▄▃▃▂▂▂▁▁▁
Validation Accuracy,▁▄▆▇▇▇▇█████
Validation F1,▁▅▆▇▇███████
Validation Loss,▄▂▁▁▂▂▂▅▆▇██
Validation Precision,▁▄▃▆▆▇▇██▇▆█
Validation Recall,▁▄▇▇▇███▇██▇

0,1
Epoch,12.0
Train Accuracy,0.92969
Train Loss,0.1948
Validation Accuracy,0.68943
Validation F1,0.69813
Validation Loss,1.19757
Validation Precision,0.71596
Validation Recall,0.68861


[I 2025-08-12 11:43:39,487] Trial 5 finished with value: 0.6949464012251149 and parameters: {'learning_rate': 0.00015667003834770346, 'weight_decay': 0.00013315835671413596, 'patience': 4, 'batch_size': 256, 'num_layers': 3}. Best is trial 4 with value: 0.6977029096477795.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▂▃▄▄▅▅▆▇▇█
Train Accuracy,▁▃▄▅▆▆▇▇▇███
Train Loss,█▆▅▄▄▃▃▂▂▁▁▁
Validation Accuracy,▁▅▆▆▇▇███▇██
Validation F1,▁▅▆▅▇▇███▇██
Validation Loss,▆▂▁▂▁▂▁▄▄█▅▇
Validation Precision,▁▄▅█▇▇▇█▇█▇█
Validation Recall,▁▅▆▄▇▇█▇█▇█▇

0,1
Epoch,12.0
Train Accuracy,0.89617
Train Loss,0.29439
Validation Accuracy,0.67381
Validation F1,0.68546
Validation Loss,1.17987
Validation Precision,0.70681
Validation Recall,0.6743


[I 2025-08-12 12:07:40,536] Trial 6 finished with value: 0.6796324655436448 and parameters: {'learning_rate': 0.00014431863914520472, 'weight_decay': 0.00015344462793167031, 'patience': 6, 'batch_size': 32, 'num_layers': 3}. Best is trial 4 with value: 0.6977029096477795.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▂▃▄▄▅▅▆▇▇█
Train Accuracy,▁▃▄▅▆▆▇▇▇███
Train Loss,█▆▅▄▃▃▂▂▂▁▁▁
Validation Accuracy,▁▃▆▇▇▇▇█████
Validation F1,▁▃▆▇▇▇▇█████
Validation Loss,▄▃▂▁▃▃▄▅▇▆▇█
Validation Precision,▁▂▅▇▇▇▆▇▇█▇▇
Validation Recall,▁▄▆▆▇▇▇█▇██▇

0,1
Epoch,12.0
Train Accuracy,0.92319
Train Loss,0.21272
Validation Accuracy,0.68484
Validation F1,0.69224
Validation Loss,1.21411
Validation Precision,0.70523
Validation Recall,0.68371


[I 2025-08-12 12:29:23,002] Trial 7 finished with value: 0.6860643185298622 and parameters: {'learning_rate': 0.0001509364419610655, 'weight_decay': 0.00013298033216587352, 'patience': 4, 'batch_size': 128, 'num_layers': 3}. Best is trial 4 with value: 0.6977029096477795.


## Test the model with checkpoint

In [9]:
# Function to evaluate the model
def evaluate_model(model_path, test_loader):
    # Load the model
    model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=5)
    model.load_state_dict(torch.load(model_path))  # loading the trained model
    model = model.to(device)
    model.eval()  # eval mode

    all_labels = []
    all_preds = []

    # same idea... just testing and getting results...
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = logits.argmax(dim=1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    # Calculate metrics
    accuracy  = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall    = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1        = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

    return {"Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1}


# Load the test data set
test_dataset = TweeterDataset(df_test, RobertaTokenizer.from_pretrained('roberta-large'))
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Test multiple models
model_paths = ["best_model_trial_0.pt"]  # Replace with actual model paths
for model_path in model_paths:
    metrics = evaluate_model(model_path, test_loader)
    print(f"Metrics for {model_path}:")
    for key, value in metrics.items():
        print(f"{key}: {value:.4f}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Metrics for best_model_trial_0.pt:
Accuracy: 0.6542
Precision: 0.6704
Recall: 0.6542
F1 Score: 0.6555


## Hyper-parameter search

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

print("Best trial:", study.best_trial.number)
print("Best validation accuracy:", study.best_value)
print("Best params:", study.best_params)
