## Roberta fine-tuning + hyperparameter tuning

#### Setup

In [1]:
!pip install -q optuna
!pip install -q evaluate
!pip install -q emoji==0.6.0

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for emoji (setup.py) ... [?25l[?25hdone


In [2]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from torch import nn
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import (
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainerCallback,
    EarlyStoppingCallback
)
import evaluate
import torch.nn.functional as F
import optuna
import wandb

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
wandb.login(key="<wandb key>")

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmayachn3[0m ([33mmayachn3-maya-bondar[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Data Preparation
### Load Dataset

In [6]:
train = pd.read_csv("OOT_train.csv", encoding='latin-1')
val = pd.read_csv("OOT_val.csv", encoding='latin-1')
test = pd.read_csv("OOT_test.csv", encoding='latin-1')


In [7]:
# train = train.head(1000)
# val = val.head(1000)
# test = test.head(1000)

### Preprocessing

In [8]:
#encoding the labels numerically from Sentiment

ordinal_mapping = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4
}

# map to ordinal labels
train["ordinal_label_id"] = train["Sentiment"].map(ordinal_mapping)
val["ordinal_label_id"] = val["Sentiment"].map(ordinal_mapping)
test["ordinal_label_id"] = test["Sentiment"].map(ordinal_mapping)



In [9]:
# Concat the relevant columns into one string with seperation.
# for example: "Tweet: my food stock is low | Location: Canada | Date: 2020-03-17 | URL: https://t.co/abcd"

# Function to build the input string from multiple columns
def build_augmented_input(row):
    parts = []

    if pd.notna(row.get('clean_tweet')):
        parts.append(f"{row['clean_tweet']}")

    if pd.notna(row.get('Location_standardized')) and row['Location_standardized'].lower() != 'unknown':
        parts.append(f"{row['Location_standardized']}")

    if pd.notna(row.get('TweetAt')):
        parts.append(f"{row['TweetAt']}")


    return " | ".join(parts)

# Apply to the DataFrames
train['model_input'] = train.apply(build_augmented_input, axis=1)
val['model_input'] = val.apply(build_augmented_input, axis=1)
test['model_input'] = test.apply(build_augmented_input, axis=1)

# Create  new DataFrames with only what's needed for modeling
formatted_train = train[['model_input', 'ordinal_label_id']].copy()
formatted_val = val[['model_input', 'ordinal_label_id']].copy()
formatted_test = test[['model_input', 'ordinal_label_id']].copy()



In [10]:
def balance_dataset(df, target_samples_per_class=5000):
    """Balance dataset by undersampling"""
    balanced_dfs = []

    print("Original class distribution:")
    print(df['ordinal_label_id'].value_counts().sort_index())

    for class_id in range(5):
        class_data = df[df['ordinal_label_id'] == class_id]

        if len(class_data) > target_samples_per_class:
            class_data = class_data.sample(n=target_samples_per_class, random_state=42)
            print(f"Class {class_id}: {len(class_data)} samples (undersampled)")
        else:
            print(f"Class {class_id}: {len(class_data)} samples (kept all)")

        balanced_dfs.append(class_data)

    balanced_df = pd.concat(balanced_dfs, ignore_index=True).sample(frac=1, random_state=42)

    print(f"Balanced dataset: {len(balanced_df)} total samples")
    print("New distribution:")
    print(balanced_df['ordinal_label_id'].value_counts().sort_index())

    return balanced_df

# Apply balancing to training data
formatted_train = balance_dataset(formatted_train, target_samples_per_class=5000)

Original class distribution:
ordinal_label_id
0     5175
1     9230
2     6784
3    10140
4     5845
Name: count, dtype: int64
Class 0: 5000 samples (undersampled)
Class 1: 5000 samples (undersampled)
Class 2: 5000 samples (undersampled)
Class 3: 5000 samples (undersampled)
Class 4: 5000 samples (undersampled)
Balanced dataset: 25000 total samples
New distribution:
ordinal_label_id
0    5000
1    5000
2    5000
3    5000
4    5000
Name: count, dtype: int64


### Tokenization

## Model
### Define Model

In [11]:
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_data(data, max_length=128):
    return tokenizer(
        data['model_input'].tolist(),
        truncation=True,
        padding=False,
        max_length=max_length,
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=False
    )

train_encodings = tokenize_data(formatted_train)
val_encodings = tokenize_data(formatted_val)
test_encodings = tokenize_data(formatted_test)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [12]:
## define a PyTorch Dataset
class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels  # Should be integers

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])  # For training
        return item

    def __len__(self):
        return len(self.labels)

# Convert labels to integers if not already
train_labels = formatted_train['ordinal_label_id'].tolist()
val_labels = formatted_val['ordinal_label_id'].tolist()
test_labels = formatted_test['ordinal_label_id'].tolist()


train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)
test_dataset = TweetDataset(test_encodings, test_labels)


In [13]:

# define mapping between label id and sentiment for later use and conveniency
ordinal_label2id = ordinal_mapping
ordinal_id2label = {v: k for k, v in ordinal_mapping.items()}


In [14]:
def compute_detailed_metrics(eval_pred):
    """Enhanced metrics using HuggingFace Evaluate library"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Load HuggingFace metrics (cached after first load)
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")

    # Compute standard classification metrics
    results = {}

    # Basic metrics
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average='macro'))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average='weighted'))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average='macro'))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average='macro'))

    # Per-class F1 scores (HF doesn't have this built-in, so keep custom)
    f1_per_class = f1_score(labels, predictions, average=None)
    for i, class_name in enumerate(['extremely_negative', 'negative', 'neutral', 'positive', 'extremely_positive']):
        results[f'f1_{class_name}'] = f1_per_class[i]

        # Per-class precision and recall
        precision_per_class = precision_score(labels, predictions, average=None, zero_division=0)
        recall_per_class = recall_score(labels, predictions, average=None, zero_division=0)
        results[f'precision_{class_name}'] = precision_per_class[i]
        results[f'recall_{class_name}'] = recall_per_class[i]

        # Per-class accuracy
        class_mask = (labels == i)
        if class_mask.sum() > 0:
            results[f'accuracy_{class_name}'] = accuracy_score(labels[class_mask], predictions[class_mask])
        else:
            results[f'accuracy_{class_name}'] = 0.0

    # Custom ordinal metrics (HF doesn't have these)
    results['mae'] = np.mean(np.abs(predictions - labels))
    results['adjacent_accuracy'] = np.sum(np.abs(predictions - labels) <= 1) / len(labels)

    # Quadratic Weighted Kappa (custom)
    from sklearn.metrics import cohen_kappa_score
    try:
        qwk = cohen_kappa_score(labels, predictions, weights='quadratic')
        results['quadratic_weighted_kappa'] = qwk
    except:
        results['quadratic_weighted_kappa'] = 0.0

    return results

In [15]:
def find_optimal_batch_size(base_batch_size):
    """Find the largest batch size that fits in GPU memory"""
    if device.type == "cpu":
        return base_batch_size

    # Try larger batch sizes for GPU
    for multiplier in [4, 3, 2, 1]:
        try_batch_size = base_batch_size * multiplier
        try:
            # Test if this batch size fits
            dummy_input = torch.randn(try_batch_size, 128, 768, device=device)
            dummy_output = torch.randn(try_batch_size, 5, device=device)
            del dummy_input, dummy_output
            torch.cuda.empty_cache() if device.type == "cuda" else None
            return try_batch_size
        except RuntimeError:  # Out of memory
            continue
    return base_batch_size

In [16]:
class SimpleMetricsLogger(TrainerCallback):
    """Simple callback to log detailed metrics every epoch"""

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is None or not wandb.run:
            return

        # Only log when we have evaluation metrics (after each epoch)
        if 'eval_loss' in logs:
            current_epoch = int(state.epoch)

            # Get current learning rate
            current_lr = args.learning_rate
            if 'learning_rate' in logs:
                current_lr = logs['learning_rate']

            #Get training loss from state history
            train_loss = 0
            if state.log_history:
                # Find the most recent training loss
                for log_entry in reversed(state.log_history):
                    if 'train_loss' in log_entry:
                        train_loss = log_entry['train_loss']
                        break

            detailed_metrics = {
                "Epoch": current_epoch,
                "Stage": 1,
                "Unfrozen_Layers": 12,
                "Train Loss": train_loss,
                "Train Accuracy": 0,  # Usually not computed during training
                "Validation Loss": logs.get('eval_loss', 0),
                "Validation Accuracy": logs.get('eval_accuracy', 0),
                "Validation Precision": logs.get('eval_precision_macro', 0),
                "Validation Recall": logs.get('eval_recall_macro', 0),
                "Validation F1": logs.get('eval_f1_macro', 0),
                "Validation MAE": logs.get('eval_mae', 0),
                "Validation Adjacent Accuracy": logs.get('eval_adjacent_accuracy', 0),
                "Validation QWK": logs.get('eval_quadratic_weighted_kappa', 0),
                "Learning_Rate": current_lr,
            }

            # Log to WandB
            wandb.log(detailed_metrics)

            #Print progress to console
            print(f"Epoch {current_epoch}: "
                  f"Train Loss: {train_loss:.4f}, "  # ← Now shows real values
                  f"Val Loss: {logs.get('eval_loss', 0):.4f}, "
                  f"Val F1: {logs.get('eval_f1_macro', 0):.4f}, "
                  f"QWK: {logs.get('eval_quadratic_weighted_kappa', 0):.4f}")

In [17]:
def save_training_checkpoint(model, optimizer, epoch, loss, trial_params, filepath, trial_number, current_score, trainer):
    """Save complete training checkpoint and handle best model updates"""
    global best_score, best_model_path

    # Get the trial directory from filepath
    trial_dir = os.path.dirname(filepath)

    # Save trial checkpoint
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        # 'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
        'trial_params': trial_params,
        'model_config': model.config.to_dict(),
        'tokenizer_name': model_name,
        'current_score': current_score,  # Add score to checkpoint
        'trial_number': trial_number,
    }

    # Save all trial files in the same directory
    torch.save(checkpoint, filepath)
    torch.save(model.state_dict(), os.path.join(trial_dir, 'model_roberta_weights.pt'))
    torch.save(model, os.path.join(trial_dir, 'model_roberta.pt'))

    print(f"✅ Trial checkpoint saved: {filepath}")
    print(f"✅ Model files saved in: {trial_dir}")

    # Update best model if needed
    if current_score > best_score:
        best_score = current_score

        # Save HuggingFace format to best model directory
        trainer.save_model(best_model_path)

        # Also save our custom format in best model directory
        os.makedirs(best_model_path, exist_ok=True)
        best_checkpoint_path = os.path.join(best_model_path, 'best_checkpoint.ckpt')
        best_weights_path = os.path.join(best_model_path, 'model_roberta_weights.pt')
        best_model_file_path = os.path.join(best_model_path, 'model_roberta.pt')

        torch.save(checkpoint, best_checkpoint_path)
        torch.save(model.state_dict(), best_weights_path)
        torch.save(model, best_model_file_path)

        print(f"🏆 New best model saved! Score: {current_score:.4f} (Trial {trial_number})")
        print(f"🏆 Best model files saved in: {best_model_path}")

        # Optional: Log to W&B
        # wandb.log({
        #     "best_score_so_far": current_score,
        #     "best_trial_number": trial_number,
        # })
    else:
        print(f"📊 Trial {trial_number} score: {current_score:.4f} (Best: {best_score:.4f})")

In [18]:
# Global variables to track best model
best_score = 0.0
best_model_path = "./best_roberta_model_so_far"

### Hyperparameter Tuning

In [19]:
def objective(trial):
    """Clean, organized objective function for Optuna hyperparameter optimization"""
    global best_score, best_model_path

    # === GPU MEMORY CLEANUP ===
    if device.type == "cuda":
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

    # === HYPERPARAMETER SAMPLING ===
    # Core training parameters
    learning_rate = trial.suggest_float("learning_rate", 3e-5, 5e-4, log=True)
    base_batch_size = trial.suggest_categorical("batch_size", [16,32,64])
    label_smoothing = trial.suggest_float("label_smoothing", 0.05, 0.15)
    num_epochs = trial.suggest_int("num_epochs", 10, 15)

    # Advanced optimization parameters
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.05, 0.15)
    weight_decay = trial.suggest_float("weight_decay", 0.05, 0.15)

    # Model architecture parameters
    attention_dropout = trial.suggest_float("attention_dropout", 0.3, 0.4)
    hidden_dropout = trial.suggest_float("hidden_dropout", 0.3, 0.4)

    #R-drop parameter
    # rdrop_alpha = trial.suggest_float("rdrop_alpha", 0.0, 1.0)

    # PRINT CHOSEN PARAMETERS
    print(f"TRIAL {trial.number} - TESTING THESE PARAMETERS:")
    print(f"Learning Rate:      {learning_rate:.2e}")
    print(f"Epochs:             {num_epochs}")
    print(f"Warmup Ratio:       {warmup_ratio:.3f}")
    print(f"Weight Decay:       {weight_decay:.3f}")
    print(f"Attention Dropout:  {attention_dropout:.3f}")
    print(f"Hidden Dropout:     {hidden_dropout:.3f}")


    # Optimize batch size for available hardware
    batch_size = find_optimal_batch_size(base_batch_size)


    # === EXPERIMENT TRACKING SETUP ===
    wandb.init(
        project="covid-tweet-sentiment-hf-roberta-regularloss",
        name=f"trial_{trial.number}",
        config={
            "learning_rate": learning_rate,
            "batch_size": batch_size,
            "num_epochs": num_epochs,
            "warmup_ratio": warmup_ratio,
            "weight_decay": weight_decay,
            "attention_dropout": attention_dropout,
            "hidden_dropout": hidden_dropout
            # "rdrop_alpha": rdrop_alpha ,
        },
        reinit=True
    )

    try:
        # === MODEL SETUP ===
        model = _setup_model(attention_dropout, hidden_dropout)

        # === TRAINING CONFIGURATION ===
        training_args = _create_training_args(
            trial_number=trial.number,
            learning_rate=learning_rate,
            batch_size=batch_size,
            num_epochs=num_epochs,
            warmup_ratio=warmup_ratio,
            weight_decay=weight_decay,
            label_smoothing_factor=label_smoothing
        )

        trial_params = {
            "learning_rate": learning_rate,
            "batch_size": batch_size,
            "num_epochs": num_epochs,
            "warmup_ratio": warmup_ratio,
            "weight_decay": weight_decay,
            "attention_dropout": attention_dropout,
            "hidden_dropout": hidden_dropout
        }

        trainer = _create_trainer(model, training_args, trial.number, trial_params)
        trainer.train()

        # Set Checkpoint per trial
        checkpoint_dir = f"./checkpoints_roberta/trial_{trial.number}"
        os.makedirs(checkpoint_dir, exist_ok=True)

        final_epoch = int(trainer.state.epoch)
        checkpoint_path = f"{checkpoint_dir}/final_epoch_{final_epoch}.ckpt"

        # Get final training loss
        final_loss = 0
        if trainer.state.log_history:
            for log_entry in reversed(trainer.state.log_history):
                if 'train_loss' in log_entry:
                    final_loss = log_entry['train_loss']
                    break

        eval_results = trainer.evaluate()
        current_score = eval_results["eval_quadratic_weighted_kappa"]

        save_training_checkpoint(
            model=trainer.model,
            optimizer=None,
            epoch=final_epoch,
            loss=final_loss,
            trial_params=trial_params,
            filepath=checkpoint_path,
            trial_number=trial.number,
            current_score=current_score,
            trainer=trainer
        )

        # Log GPU usage if available
        if device.type == "cuda":
            print(f"GPU Memory Used: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")

        return current_score

    except Exception as e:
        print(f"Trial {trial.number} failed: {e}")
        raise optuna.exceptions.TrialPruned()

    finally:
        # === CLEANUP ===
        if device.type == "cuda":
            torch.cuda.empty_cache()
        wandb.finish()


def _setup_model(attention_dropout, hidden_dropout):
    """Setup and configure the model with dropout and freezing"""
    model = AutoModelForSequenceClassification.from_pretrained(
        "cardiffnlp/twitter-roberta-base-sentiment",
        num_labels=5,
        id2label=ordinal_id2label,
        label2id=ordinal_label2id,
        ignore_mismatched_sizes=True
    )

    # Apply dropout configuration
    model.config.attention_probs_dropout_prob = attention_dropout
    model.config.hidden_dropout_prob = hidden_dropout

    # GPU optimizations
    if device.type == "cuda":
        model.gradient_checkpointing_enable()

        model.to(device)

    return model


def _create_training_args(trial_number, learning_rate, batch_size, num_epochs, warmup_ratio, weight_decay, label_smoothing_factor=0.1):
    """Create optimized training arguments"""
    return TrainingArguments(
        # output_dir=f"./results/trial_{trial_number}",

        # Core training parameters
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,  # Larger eval batch
        learning_rate=learning_rate,
        label_smoothing_factor=label_smoothing_factor,

        # Learning rate scheduling
        lr_scheduler_type="cosine",
        warmup_ratio=warmup_ratio,

        # Optimization
        optim="adamw_torch",
        weight_decay=weight_decay,
        max_grad_norm=1.0,

        # Evaluation and saving
        eval_strategy="epoch",
        save_strategy="no",
        save_total_limit=1,
        # load_best_model_at_end=True,
        metric_for_best_model="eval_quadratic_weighted_kappa",
        greater_is_better=True,

        # Performance optimizations
        fp16=device.type == "cuda",
        # tf32=device.type == "cuda",
        dataloader_pin_memory=True,
        dataloader_persistent_workers=device.type == "cuda",
        dataloader_num_workers=2 if device.type == "cuda" else 0,
        dataloader_drop_last=False,
        group_by_length=True,
        gradient_accumulation_steps=1,
        dataloader_prefetch_factor=2 if device.type == "cuda" else None,

        # Logging
        logging_steps=100,
        report_to="wandb",
        remove_unused_columns=False,

        # Evaluation optimizations
        eval_accumulation_steps=None,
        prediction_loss_only=False,

    )

def _create_trainer(model, training_args, trial_number, trial_params):
    """ordinal loss trainer"""

    callbacks = [
        EarlyStoppingCallback(early_stopping_patience=2),
        SimpleMetricsLogger(),
        ]

    return Trainer(  # Uses ordinal loss always
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_detailed_metrics,
        tokenizer=tokenizer,
        callbacks=callbacks,
    )

In [20]:
from datetime import datetime
import json

def save_best_hyperparameters(study, model_name="roberta"):
    """Save the best hyperparameters found by Optuna"""

    # Check if any trial completed successfully
    completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
    if not completed_trials:
        print("No completed trials found. Skipping saving best hyperparameters.")
        return None

    best_params = study.best_params
    best_score = study.best_value

    # Save to JSON file
    results = {
        "model_name": model_name,
        "best_score": best_score,
        "best_params": best_params,
        "timestamp": str(datetime.now()),
        "total_trials": len(study.trials),
        "completed_trials": len(completed_trials)
    }

    filename = f"best_params_{model_name}.json"
    with open(filename, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"Best hyperparameters saved to {filename}")
    print(f"Best score: {best_score:.4f}")
    print(f"Best params: {best_params}")
    print(f"Completed {len(completed_trials)}/{len(study.trials)} trials")

    return results

## Training

In [21]:
# Run optimization

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=8)

roberta_results = save_best_hyperparameters(study, "roberta")


[I 2025-08-01 12:44:54,324] A new study created in memory with name: no-name-003c7642-7dcd-46d2-b3e5-e4293adf27cb


TRIAL 0 - TESTING THESE PARAMETERS:
Learning Rate:      2.92e-04
Epochs:             14
Warmup Ratio:       0.073
Weight Decay:       0.066
Attention Dropout:  0.386
Hidden Dropout:     0.380




pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,No log,1.004826,0.671793,0.661955,0.671601,0.705139,0.675851,0.558981,0.854508,0.854508,0.52075,0.671848,0.425137,0.425137,0.805284,0.82277,0.788526,0.788526,0.607835,0.673664,0.553725,0.553725,0.742976,0.630742,0.903797,0.903797,0.417489,0.924719,0.828528
2,1.124800,0.821617,0.767501,0.766345,0.760432,0.790638,0.756432,0.650442,0.903689,0.903689,0.671858,0.684032,0.660109,0.660109,0.842105,0.85681,0.827897,0.827897,0.743986,0.822412,0.679216,0.679216,0.832736,0.788462,0.882278,0.882278,0.291026,0.94767,0.879699
3,0.853100,0.892344,0.756025,0.757975,0.764463,0.773499,0.730829,0.591139,0.956967,0.956967,0.634119,0.651672,0.617486,0.617486,0.852154,0.882992,0.823397,0.823397,0.76198,0.763179,0.760784,0.760784,0.805755,0.933333,0.708861,0.708861,0.313978,0.93826,0.863103
4,0.690600,0.796403,0.768648,0.769362,0.786432,0.769524,0.775079,0.799564,0.752049,0.752049,0.697517,0.72112,0.67541,0.67541,0.838226,0.942416,0.754781,0.754781,0.736725,0.699577,0.778039,0.778039,0.824221,0.769484,0.887342,0.887342,0.294469,0.940555,0.870426


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Epoch 1: Train Loss: 0.0000, Val Loss: 1.0048, Val F1: 0.0000, QWK: 0.8285


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 0.8216, Val F1: 0.0000, QWK: 0.8797


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 0.8923, Val F1: 0.0000, QWK: 0.8631


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.0000, Val Loss: 0.7964, Val F1: 0.0000, QWK: 0.8704


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.8308, Val Loss: 0.7964, Val F1: 0.0000, QWK: 0.8704
✅ Trial checkpoint saved: ./checkpoints_roberta/trial_0/final_epoch_4.ckpt
✅ Model files saved in: ./checkpoints_roberta/trial_0
🏆 New best model saved! Score: 0.8704 (Trial 0)
🏆 Best model files saved in: ./best_roberta_model_so_far
GPU Memory Used: 4.02 GB


0,1
Epoch,▁▃▆██
Learning_Rate,▁▁▁▁▁
Stage,▁▁▁▁▁
Train Accuracy,▁▁▁▁▁
Train Loss,▁▁▁▁█
Unfrozen_Layers,▁▁▁▁▁
Validation Accuracy,▁█▇██
Validation Adjacent Accuracy,▁█▅▆▆
Validation F1,▁▁▁▁▁
Validation Loss,█▂▄▁▁

0,1
Epoch,4.0
Learning_Rate,0.00029
Stage,1.0
Train Accuracy,0.0
Train Loss,0.83078
Unfrozen_Layers,12.0
Validation Accuracy,0.76865
Validation Adjacent Accuracy,0.94056
Validation F1,0.0
Validation Loss,0.7964


[I 2025-08-01 12:50:52,726] Trial 0 finished with value: 0.8704257608882765 and parameters: {'learning_rate': 0.00029176184737646536, 'batch_size': 64, 'label_smoothing': 0.06578337081435587, 'num_epochs': 14, 'warmup_ratio': 0.07333428729811922, 'weight_decay': 0.06627383179240057, 'attention_dropout': 0.38585144580527303, 'hidden_dropout': 0.37965759612471295}. Best is trial 0 with value: 0.8704257608882765.


TRIAL 1 - TESTING THESE PARAMETERS:
Learning Rate:      3.55e-05
Epochs:             15
Warmup Ratio:       0.124
Weight Decay:       0.065
Attention Dropout:  0.366
Hidden Dropout:     0.346


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,1.495,1.019028,0.614184,0.607446,0.615032,0.64976,0.677338,0.575107,0.82377,0.82377,0.525694,0.571979,0.486339,0.486339,0.685273,0.725786,0.649044,0.649044,0.523261,0.57397,0.480784,0.480784,0.70725,0.628319,0.808861,0.808861,0.50218,0.904062,0.782339
2,0.9186,0.93119,0.69658,0.690178,0.709403,0.730566,0.762906,0.715054,0.817623,0.817623,0.670991,0.728553,0.621858,0.621858,0.808858,0.839178,0.780652,0.780652,0.571429,0.67161,0.497255,0.497255,0.725577,0.592622,0.935443,0.935443,0.373422,0.939178,0.850627
3,0.723,0.745237,0.792288,0.792374,0.805106,0.79572,0.816369,0.836559,0.797131,0.797131,0.738339,0.76987,0.70929,0.70929,0.843731,0.894207,0.79865,0.79865,0.758308,0.731245,0.787451,0.787451,0.837321,0.793651,0.886076,0.886076,0.265779,0.947441,0.88002
4,0.5941,0.696527,0.811568,0.811191,0.811556,0.821788,0.816683,0.775322,0.862705,0.862705,0.762702,0.77968,0.746448,0.746448,0.857817,0.901985,0.817773,0.817773,0.785856,0.796296,0.775686,0.775686,0.852381,0.804494,0.906329,0.906329,0.229057,0.9619,0.906286
5,0.5191,0.7359,0.809961,0.810121,0.821044,0.809716,0.825616,0.795066,0.858607,0.858607,0.766741,0.783352,0.75082,0.75082,0.836139,0.834267,0.83802,0.83802,0.805041,0.763176,0.851765,0.851765,0.829713,0.929356,0.749367,0.749367,0.223089,0.968786,0.908327
6,0.49,0.735637,0.814322,0.813594,0.810248,0.830675,0.817029,0.732143,0.92418,0.92418,0.744476,0.772941,0.718033,0.718033,0.863095,0.916561,0.815523,0.815523,0.800487,0.828715,0.774118,0.774118,0.856975,0.80088,0.921519,0.921519,0.226532,0.963277,0.908134
7,0.4733,0.730182,0.826716,0.827163,0.831501,0.833607,0.849699,0.831373,0.868852,0.868852,0.788421,0.760406,0.818579,0.818579,0.838567,0.821814,0.856018,0.856018,0.811722,0.831414,0.792941,0.792941,0.870199,0.9125,0.831646,0.831646,0.208171,0.968327,0.912203
8,0.3941,0.753481,0.820289,0.819374,0.82598,0.826541,0.839532,0.871965,0.809426,0.809426,0.787198,0.809469,0.76612,0.76612,0.850084,0.848655,0.851519,0.851519,0.792895,0.816972,0.770196,0.770196,0.852364,0.782839,0.935443,0.935443,0.213909,0.969245,0.912207
9,0.3681,0.772264,0.818683,0.817069,0.816747,0.83317,0.841379,0.810247,0.875,0.875,0.783305,0.821343,0.748634,0.748634,0.839827,0.809176,0.872891,0.872891,0.793465,0.851619,0.742745,0.742745,0.853644,0.791351,0.926582,0.926582,0.207941,0.975671,0.920443
10,0.3526,0.753087,0.833601,0.833337,0.833568,0.842389,0.849751,0.825919,0.875,0.875,0.804573,0.801518,0.80765,0.80765,0.844492,0.812046,0.87964,0.87964,0.816476,0.850467,0.785098,0.785098,0.871173,0.877892,0.864557,0.864557,0.191646,0.97613,0.924915


Epoch 1: Train Loss: 0.0000, Val Loss: 1.0190, Val F1: 0.0000, QWK: 0.7823


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 0.9312, Val F1: 0.0000, QWK: 0.8506


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 0.7452, Val F1: 0.0000, QWK: 0.8800


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.0000, Val Loss: 0.6965, Val F1: 0.0000, QWK: 0.9063


  return forward_call(*args, **kwargs)


Epoch 5: Train Loss: 0.0000, Val Loss: 0.7359, Val F1: 0.0000, QWK: 0.9083


  return forward_call(*args, **kwargs)


Epoch 6: Train Loss: 0.0000, Val Loss: 0.7356, Val F1: 0.0000, QWK: 0.9081


  return forward_call(*args, **kwargs)


Epoch 7: Train Loss: 0.0000, Val Loss: 0.7302, Val F1: 0.0000, QWK: 0.9122


  return forward_call(*args, **kwargs)


Epoch 8: Train Loss: 0.0000, Val Loss: 0.7535, Val F1: 0.0000, QWK: 0.9122


  return forward_call(*args, **kwargs)


Epoch 9: Train Loss: 0.0000, Val Loss: 0.7723, Val F1: 0.0000, QWK: 0.9204


  return forward_call(*args, **kwargs)


Epoch 10: Train Loss: 0.0000, Val Loss: 0.7531, Val F1: 0.0000, QWK: 0.9249


  return forward_call(*args, **kwargs)


Epoch 11: Train Loss: 0.0000, Val Loss: 0.7964, Val F1: 0.0000, QWK: 0.9228


  return forward_call(*args, **kwargs)


Epoch 12: Train Loss: 0.0000, Val Loss: 0.8045, Val F1: 0.0000, QWK: 0.9249


  return forward_call(*args, **kwargs)


Epoch 13: Train Loss: 0.0000, Val Loss: 0.8293, Val F1: 0.0000, QWK: 0.9235


  return forward_call(*args, **kwargs)


Epoch 14: Train Loss: 0.0000, Val Loss: 0.8253, Val F1: 0.0000, QWK: 0.9252


  return forward_call(*args, **kwargs)


Epoch 15: Train Loss: 0.0000, Val Loss: 0.8226, Val F1: 0.0000, QWK: 0.9262


  return forward_call(*args, **kwargs)


Epoch 15: Train Loss: 0.5074, Val Loss: 0.8226, Val F1: 0.0000, QWK: 0.9262
✅ Trial checkpoint saved: ./checkpoints_roberta/trial_1/final_epoch_15.ckpt
✅ Model files saved in: ./checkpoints_roberta/trial_1
🏆 New best model saved! Score: 0.9262 (Trial 1)
🏆 Best model files saved in: ./best_roberta_model_so_far
GPU Memory Used: 4.02 GB


0,1
Epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇██
Learning_Rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Stage,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train Accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train Loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█
Unfrozen_Layers,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation Accuracy,▁▄▇▇▇▇██████████
Validation Adjacent Accuracy,▁▄▅▆▇▇▇▇████████
Validation F1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation Loss,█▆▂▁▂▂▂▂▃▂▃▃▄▄▄▄

0,1
Epoch,15.0
Learning_Rate,4e-05
Stage,1.0
Train Accuracy,0.0
Train Loss,0.50738
Unfrozen_Layers,12.0
Validation Accuracy,0.83085
Validation Adjacent Accuracy,0.9782
Validation F1,0.0
Validation Loss,0.82258


[I 2025-08-01 13:12:06,956] Trial 1 finished with value: 0.9261703976204811 and parameters: {'learning_rate': 3.551627588729587e-05, 'batch_size': 32, 'label_smoothing': 0.05996684663319153, 'num_epochs': 15, 'warmup_ratio': 0.12350792433050267, 'weight_decay': 0.0648880207916102, 'attention_dropout': 0.3659130679774672, 'hidden_dropout': 0.345793511961371}. Best is trial 1 with value: 0.9261703976204811.


TRIAL 2 - TESTING THESE PARAMETERS:
Learning Rate:      2.41e-04
Epochs:             11
Warmup Ratio:       0.144
Weight Decay:       0.085
Attention Dropout:  0.376
Hidden Dropout:     0.353


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,1.2458,0.910492,0.739041,0.739824,0.7429,0.750166,0.723906,0.614286,0.881148,0.881148,0.63803,0.670277,0.608743,0.608743,0.80479,0.860435,0.755906,0.755906,0.742597,0.719647,0.767059,0.767059,0.789973,0.849854,0.737975,0.737975,0.322699,0.944687,0.864594
2,0.8823,1.162163,0.568281,0.551918,0.649956,0.621346,0.701571,0.610942,0.82377,0.82377,0.440323,0.84,0.298361,0.298361,0.764398,0.913928,0.656918,0.656918,0.38485,0.432485,0.346667,0.346667,0.619257,0.452423,0.981013,0.981013,0.596052,0.873766,0.740156
3,0.7967,0.90068,0.751205,0.748243,0.75156,0.772085,0.690355,0.534231,0.97541,0.97541,0.580517,0.737374,0.478689,0.478689,0.839822,0.830583,0.849269,0.849269,0.77954,0.788292,0.77098,0.77098,0.824701,0.867318,0.786076,0.786076,0.299977,0.955015,0.882684
4,0.7108,0.89277,0.776681,0.776731,0.770487,0.796445,0.709579,0.566707,0.94877,0.94877,0.627737,0.707819,0.563934,0.563934,0.860849,0.904585,0.821147,0.821147,0.799022,0.831919,0.768627,0.768627,0.860149,0.841404,0.879747,0.879747,0.276566,0.952949,0.888856
5,0.6268,0.821409,0.788616,0.789264,0.78284,0.807738,0.729118,0.588903,0.956967,0.956967,0.664678,0.731932,0.608743,0.608743,0.871252,0.912562,0.833521,0.833521,0.803579,0.834459,0.774902,0.774902,0.855354,0.846344,0.864557,0.864557,0.262796,0.955933,0.892345
6,0.585,0.918941,0.76796,0.764998,0.775232,0.796078,0.798937,0.703588,0.92418,0.92418,0.719451,0.837446,0.630601,0.630601,0.851215,0.899749,0.807649,0.807649,0.707062,0.756708,0.663529,0.663529,0.793267,0.678668,0.95443,0.95443,0.275648,0.960064,0.895882
7,0.5286,0.853806,0.813633,0.812823,0.834094,0.804901,0.785024,0.955882,0.665984,0.665984,0.78446,0.764523,0.805464,0.805464,0.853075,0.863899,0.84252,0.84252,0.791503,0.794002,0.78902,0.78902,0.85196,0.792165,0.921519,0.921519,0.223319,0.966032,0.904974
8,0.4902,0.784292,0.834978,0.834643,0.834736,0.842217,0.84264,0.83501,0.85041,0.85041,0.801959,0.798483,0.805464,0.805464,0.844587,0.806551,0.886389,0.886389,0.821399,0.864069,0.782745,0.782745,0.877743,0.869565,0.886076,0.886076,0.190039,0.977048,0.924997
9,0.4395,0.818025,0.831535,0.831402,0.83259,0.840011,0.845098,0.81015,0.883197,0.883197,0.798191,0.826698,0.771585,0.771585,0.830208,0.773036,0.896513,0.896513,0.823005,0.850921,0.796863,0.796863,0.876302,0.902145,0.851899,0.851899,0.18958,0.981409,0.926923
10,0.4149,0.806962,0.837273,0.837007,0.839453,0.842531,0.850361,0.856549,0.844262,0.844262,0.793687,0.819558,0.769399,0.769399,0.825996,0.773307,0.886389,0.886389,0.836112,0.868243,0.806275,0.806275,0.892768,0.879607,0.906329,0.906329,0.183842,0.98095,0.928797


Epoch 1: Train Loss: 0.0000, Val Loss: 0.9105, Val F1: 0.0000, QWK: 0.8646


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 1.1622, Val F1: 0.0000, QWK: 0.7402


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 0.9007, Val F1: 0.0000, QWK: 0.8827


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.0000, Val Loss: 0.8928, Val F1: 0.0000, QWK: 0.8889


  return forward_call(*args, **kwargs)


Epoch 5: Train Loss: 0.0000, Val Loss: 0.8214, Val F1: 0.0000, QWK: 0.8923


  return forward_call(*args, **kwargs)


Epoch 6: Train Loss: 0.0000, Val Loss: 0.9189, Val F1: 0.0000, QWK: 0.8959


  return forward_call(*args, **kwargs)


Epoch 7: Train Loss: 0.0000, Val Loss: 0.8538, Val F1: 0.0000, QWK: 0.9050


  return forward_call(*args, **kwargs)


Epoch 8: Train Loss: 0.0000, Val Loss: 0.7843, Val F1: 0.0000, QWK: 0.9250


  return forward_call(*args, **kwargs)


Epoch 9: Train Loss: 0.0000, Val Loss: 0.8180, Val F1: 0.0000, QWK: 0.9269


  return forward_call(*args, **kwargs)


Epoch 10: Train Loss: 0.0000, Val Loss: 0.8070, Val F1: 0.0000, QWK: 0.9288


  return forward_call(*args, **kwargs)


Epoch 11: Train Loss: 0.0000, Val Loss: 0.8234, Val F1: 0.0000, QWK: 0.9298


  return forward_call(*args, **kwargs)


Epoch 11: Train Loss: 0.6301, Val Loss: 0.8234, Val F1: 0.0000, QWK: 0.9298
✅ Trial checkpoint saved: ./checkpoints_roberta/trial_2/final_epoch_11.ckpt
✅ Model files saved in: ./checkpoints_roberta/trial_2
🏆 New best model saved! Score: 0.9298 (Trial 2)
🏆 Best model files saved in: ./best_roberta_model_so_far
GPU Memory Used: 4.02 GB


0,1
Epoch,▁▂▂▃▄▅▅▆▇▇██
Learning_Rate,▁▁▁▁▁▁▁▁▁▁▁▁
Stage,▁▁▁▁▁▁▁▁▁▁▁▁
Train Accuracy,▁▁▁▁▁▁▁▁▁▁▁▁
Train Loss,▁▁▁▁▁▁▁▁▁▁▁█
Unfrozen_Layers,▁▁▁▁▁▁▁▁▁▁▁▁
Validation Accuracy,▅▁▆▆▇▆▇█████
Validation Adjacent Accuracy,▆▁▆▆▆▇▇█████
Validation F1,▁▁▁▁▁▁▁▁▁▁▁▁
Validation Loss,▃█▃▃▂▃▂▁▂▁▂▂

0,1
Epoch,11.0
Learning_Rate,0.00024
Stage,1.0
Train Accuracy,0.0
Train Loss,0.63012
Unfrozen_Layers,12.0
Validation Accuracy,0.83957
Validation Adjacent Accuracy,0.98072
Validation F1,0.0
Validation Loss,0.82341


[I 2025-08-01 13:27:32,271] Trial 2 finished with value: 0.9297840127669785 and parameters: {'learning_rate': 0.0002410038500292598, 'batch_size': 32, 'label_smoothing': 0.09079127901743848, 'num_epochs': 11, 'warmup_ratio': 0.14435680747215307, 'weight_decay': 0.0849441280799004, 'attention_dropout': 0.3758622805072069, 'hidden_dropout': 0.3533269312284606}. Best is trial 2 with value: 0.9297840127669785.


TRIAL 3 - TESTING THESE PARAMETERS:
Learning Rate:      4.96e-05
Epochs:             14
Warmup Ratio:       0.144
Weight Decay:       0.126
Attention Dropout:  0.314
Hidden Dropout:     0.381


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,1.4376,1.053591,0.631398,0.626026,0.632779,0.664631,0.673806,0.563361,0.838115,0.838115,0.528369,0.57529,0.488525,0.488525,0.703364,0.770777,0.646794,0.646794,0.560976,0.604714,0.523137,0.523137,0.727577,0.649751,0.826582,0.826582,0.476475,0.909112,0.798025
2,0.9444,1.029444,0.677989,0.671405,0.707834,0.716637,0.771964,0.744762,0.80123,0.80123,0.677204,0.763014,0.608743,0.608743,0.821788,0.8675,0.780652,0.780652,0.516937,0.632955,0.436863,0.436863,0.68264,0.530942,0.955696,0.955696,0.403489,0.932752,0.835351
3,0.7763,0.799111,0.799174,0.799632,0.799421,0.809862,0.808194,0.740614,0.889344,0.889344,0.734914,0.724761,0.745355,0.745355,0.842703,0.850917,0.834646,0.834646,0.786624,0.798707,0.774902,0.774902,0.841827,0.882108,0.805063,0.805063,0.246959,0.957769,0.89528
4,0.6617,0.763968,0.817994,0.817641,0.827168,0.817579,0.814159,0.884615,0.754098,0.754098,0.785408,0.771338,0.8,0.8,0.855688,0.861048,0.850394,0.850394,0.790143,0.800967,0.779608,0.779608,0.858689,0.817869,0.903797,0.903797,0.221712,0.963507,0.904919
5,0.5929,0.797461,0.822814,0.821742,0.828338,0.826413,0.841785,0.833333,0.85041,0.85041,0.76495,0.834625,0.706011,0.706011,0.84637,0.846847,0.845894,0.845894,0.809415,0.7844,0.836078,0.836078,0.867322,0.842482,0.893671,0.893671,0.213679,0.966491,0.910192
6,0.544,0.833591,0.809502,0.808412,0.811861,0.827771,0.836224,0.789091,0.889344,0.889344,0.780712,0.822249,0.743169,0.743169,0.873979,0.907879,0.84252,0.84252,0.763606,0.811837,0.720784,0.720784,0.821842,0.72825,0.943038,0.943038,0.226991,0.96672,0.91073
7,0.5211,0.771057,0.842782,0.843136,0.852011,0.839397,0.843011,0.886878,0.803279,0.803279,0.809829,0.792059,0.828415,0.828415,0.864928,0.892344,0.839145,0.839145,0.832246,0.815038,0.850196,0.850196,0.874842,0.873737,0.875949,0.875949,0.187973,0.97154,0.921069
8,0.4738,0.790825,0.827634,0.826527,0.827476,0.839082,0.84879,0.835317,0.862705,0.862705,0.802903,0.820776,0.785792,0.785792,0.851993,0.828025,0.87739,0.87739,0.796862,0.841325,0.756863,0.756863,0.859356,0.811937,0.912658,0.912658,0.200367,0.974294,0.921085
9,0.4496,0.829321,0.828093,0.827238,0.82666,0.839077,0.846229,0.810507,0.885246,0.885246,0.797267,0.832342,0.765027,0.765027,0.850613,0.843094,0.858268,0.858268,0.805477,0.827815,0.784314,0.784314,0.859036,0.81954,0.902532,0.902532,0.199449,0.974983,0.921875
10,0.4362,0.838869,0.830388,0.829763,0.831453,0.838791,0.852792,0.84507,0.860656,0.860656,0.804262,0.826037,0.783607,0.783607,0.837765,0.809224,0.868391,0.868391,0.811183,0.839061,0.785098,0.785098,0.866055,0.83787,0.896203,0.896203,0.193941,0.978196,0.924295


Epoch 1: Train Loss: 0.0000, Val Loss: 1.0536, Val F1: 0.0000, QWK: 0.7980


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 1.0294, Val F1: 0.0000, QWK: 0.8354


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 0.7991, Val F1: 0.0000, QWK: 0.8953


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.0000, Val Loss: 0.7640, Val F1: 0.0000, QWK: 0.9049


  return forward_call(*args, **kwargs)


Epoch 5: Train Loss: 0.0000, Val Loss: 0.7975, Val F1: 0.0000, QWK: 0.9102


  return forward_call(*args, **kwargs)


Epoch 6: Train Loss: 0.0000, Val Loss: 0.8336, Val F1: 0.0000, QWK: 0.9107


  return forward_call(*args, **kwargs)


Epoch 7: Train Loss: 0.0000, Val Loss: 0.7711, Val F1: 0.0000, QWK: 0.9211


  return forward_call(*args, **kwargs)


Epoch 8: Train Loss: 0.0000, Val Loss: 0.7908, Val F1: 0.0000, QWK: 0.9211


  return forward_call(*args, **kwargs)


Epoch 9: Train Loss: 0.0000, Val Loss: 0.8293, Val F1: 0.0000, QWK: 0.9219


  return forward_call(*args, **kwargs)


Epoch 10: Train Loss: 0.0000, Val Loss: 0.8389, Val F1: 0.0000, QWK: 0.9243


  return forward_call(*args, **kwargs)


Epoch 11: Train Loss: 0.0000, Val Loss: 0.8285, Val F1: 0.0000, QWK: 0.9251


  return forward_call(*args, **kwargs)


Epoch 12: Train Loss: 0.0000, Val Loss: 0.8581, Val F1: 0.0000, QWK: 0.9267


  return forward_call(*args, **kwargs)


Epoch 13: Train Loss: 0.0000, Val Loss: 0.8715, Val F1: 0.0000, QWK: 0.9261


  return forward_call(*args, **kwargs)


Epoch 14: Train Loss: 0.0000, Val Loss: 0.8720, Val F1: 0.0000, QWK: 0.9266


  return forward_call(*args, **kwargs)


Epoch 14: Train Loss: 0.5886, Val Loss: 0.8720, Val F1: 0.0000, QWK: 0.9266
✅ Trial checkpoint saved: ./checkpoints_roberta/trial_3/final_epoch_14.ckpt
✅ Model files saved in: ./checkpoints_roberta/trial_3
📊 Trial 3 score: 0.9266 (Best: 0.9298)
GPU Memory Used: 4.02 GB


0,1
Epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇██
Learning_Rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Stage,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train Accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train Loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁█
Unfrozen_Layers,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation Accuracy,▁▃▇▇▇▇█▇███████
Validation Adjacent Accuracy,▁▃▆▆▇▇▇████████
Validation F1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation Loss,█▇▂▁▂▃▁▂▃▃▃▃▄▄▄

0,1
Epoch,14.0
Learning_Rate,5e-05
Stage,1.0
Train Accuracy,0.0
Train Loss,0.58857
Unfrozen_Layers,12.0
Validation Accuracy,0.82924
Validation Adjacent Accuracy,0.97911
Validation F1,0.0
Validation Loss,0.872


[I 2025-08-01 13:46:31,411] Trial 3 finished with value: 0.9265959962592151 and parameters: {'learning_rate': 4.961108870268324e-05, 'batch_size': 32, 'label_smoothing': 0.0927589445524373, 'num_epochs': 14, 'warmup_ratio': 0.1435876918544712, 'weight_decay': 0.1262573646443622, 'attention_dropout': 0.3141810449276149, 'hidden_dropout': 0.3807897429227929}. Best is trial 2 with value: 0.9297840127669785.


TRIAL 4 - TESTING THESE PARAMETERS:
Learning Rate:      4.89e-05
Epochs:             15
Warmup Ratio:       0.112
Weight Decay:       0.105
Attention Dropout:  0.338
Hidden Dropout:     0.372


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,1.4768,1.01892,0.619463,0.61065,0.627085,0.660713,0.673718,0.558704,0.848361,0.848361,0.523838,0.584906,0.474317,0.474317,0.710775,0.808023,0.634421,0.634421,0.519699,0.596545,0.460392,0.460392,0.706357,0.587248,0.886076,0.886076,0.492541,0.907964,0.796601
2,0.8734,0.912207,0.70163,0.696382,0.722016,0.736449,0.775,0.730072,0.82582,0.82582,0.684652,0.7583,0.624044,0.624044,0.821216,0.873257,0.775028,0.775028,0.574516,0.674419,0.500392,0.500392,0.717608,0.574032,0.956962,0.956962,0.370209,0.939867,0.850139
3,0.6709,0.713282,0.802387,0.803202,0.805397,0.809808,0.810556,0.750436,0.881148,0.881148,0.733441,0.72293,0.744262,0.744262,0.837155,0.876847,0.8009,0.8009,0.797208,0.788344,0.806275,0.806275,0.850923,0.88843,0.816456,0.816456,0.242828,0.958458,0.897151
4,0.5422,0.807914,0.769566,0.767371,0.785304,0.789531,0.821918,0.845987,0.79918,0.79918,0.761305,0.799279,0.726776,0.726776,0.86024,0.872685,0.848144,0.848144,0.681309,0.755492,0.620392,0.620392,0.77509,0.653079,0.953165,0.953165,0.277485,0.955933,0.889681
5,0.489,0.706429,0.814781,0.813634,0.815086,0.82353,0.827447,0.815109,0.840164,0.840164,0.773563,0.815758,0.735519,0.735519,0.852058,0.842684,0.861642,0.861642,0.790024,0.8109,0.770196,0.770196,0.84638,0.790979,0.910127,0.910127,0.219417,0.968556,0.911455
6,0.4483,0.735655,0.814551,0.813257,0.809871,0.833208,0.828911,0.74876,0.928279,0.928279,0.765884,0.804087,0.731148,0.731148,0.856484,0.878251,0.835771,0.835771,0.791236,0.836538,0.750588,0.750588,0.845349,0.78172,0.920253,0.920253,0.220794,0.968556,0.912755
7,0.4375,0.72691,0.829699,0.82992,0.830199,0.837785,0.83965,0.798521,0.885246,0.885246,0.777598,0.766454,0.789071,0.789071,0.85564,0.890511,0.823397,0.823397,0.817247,0.824421,0.810196,0.810196,0.876023,0.871089,0.881013,0.881013,0.208171,0.964884,0.912705
8,0.3734,0.742309,0.822584,0.821532,0.818829,0.836319,0.831418,0.780576,0.889344,0.889344,0.774887,0.803048,0.748634,0.748634,0.860585,0.87822,0.843645,0.843645,0.802956,0.842377,0.767059,0.767059,0.855485,0.789925,0.932911,0.932911,0.2084,0.971311,0.91893
9,0.3368,0.771837,0.82465,0.823813,0.820685,0.837331,0.83877,0.769231,0.922131,0.922131,0.778608,0.821602,0.739891,0.739891,0.841111,0.830955,0.851519,0.851519,0.814726,0.831699,0.798431,0.798431,0.862133,0.849938,0.874684,0.874684,0.203351,0.974983,0.920485
10,0.3239,0.741448,0.83888,0.838583,0.841604,0.843106,0.850575,0.867804,0.834016,0.834016,0.804838,0.809735,0.8,0.8,0.843127,0.809524,0.87964,0.87964,0.827809,0.854045,0.803137,0.803137,0.882536,0.866911,0.898734,0.898734,0.185219,0.977507,0.926824


Epoch 1: Train Loss: 0.0000, Val Loss: 1.0189, Val F1: 0.0000, QWK: 0.7966


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 0.9122, Val F1: 0.0000, QWK: 0.8501


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 0.7133, Val F1: 0.0000, QWK: 0.8972


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.0000, Val Loss: 0.8079, Val F1: 0.0000, QWK: 0.8897


  return forward_call(*args, **kwargs)


Epoch 5: Train Loss: 0.0000, Val Loss: 0.7064, Val F1: 0.0000, QWK: 0.9115


  return forward_call(*args, **kwargs)


Epoch 6: Train Loss: 0.0000, Val Loss: 0.7357, Val F1: 0.0000, QWK: 0.9128


  return forward_call(*args, **kwargs)


Epoch 7: Train Loss: 0.0000, Val Loss: 0.7269, Val F1: 0.0000, QWK: 0.9127


  return forward_call(*args, **kwargs)


Epoch 8: Train Loss: 0.0000, Val Loss: 0.7423, Val F1: 0.0000, QWK: 0.9189


  return forward_call(*args, **kwargs)


Epoch 9: Train Loss: 0.0000, Val Loss: 0.7718, Val F1: 0.0000, QWK: 0.9205


  return forward_call(*args, **kwargs)


Epoch 10: Train Loss: 0.0000, Val Loss: 0.7414, Val F1: 0.0000, QWK: 0.9268


  return forward_call(*args, **kwargs)


Epoch 11: Train Loss: 0.0000, Val Loss: 0.7965, Val F1: 0.0000, QWK: 0.9251


  return forward_call(*args, **kwargs)


Epoch 12: Train Loss: 0.0000, Val Loss: 0.7832, Val F1: 0.0000, QWK: 0.9269


  return forward_call(*args, **kwargs)


Epoch 13: Train Loss: 0.0000, Val Loss: 0.8346, Val F1: 0.0000, QWK: 0.9261


  return forward_call(*args, **kwargs)


Epoch 14: Train Loss: 0.0000, Val Loss: 0.8367, Val F1: 0.0000, QWK: 0.9262


  return forward_call(*args, **kwargs)


Epoch 14: Train Loss: 0.4909, Val Loss: 0.8367, Val F1: 0.0000, QWK: 0.9262
✅ Trial checkpoint saved: ./checkpoints_roberta/trial_4/final_epoch_14.ckpt
✅ Model files saved in: ./checkpoints_roberta/trial_4
📊 Trial 4 score: 0.9262 (Best: 0.9298)
GPU Memory Used: 4.02 GB


0,1
Epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇██
Learning_Rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Stage,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train Accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train Loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁█
Unfrozen_Layers,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation Accuracy,▁▄▇▆▇▇█▇███████
Validation Adjacent Accuracy,▁▄▆▆▇▇▇▇███████
Validation F1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation Loss,█▆▁▃▁▂▁▂▂▂▃▃▄▄▄

0,1
Epoch,14.0
Learning_Rate,5e-05
Stage,1.0
Train Accuracy,0.0
Train Loss,0.49091
Unfrozen_Layers,12.0
Validation Accuracy,0.82717
Validation Adjacent Accuracy,0.97866
Validation F1,0.0
Validation Loss,0.83673


[I 2025-08-01 14:05:13,435] Trial 4 finished with value: 0.9261825007773399 and parameters: {'learning_rate': 4.8907197924823515e-05, 'batch_size': 32, 'label_smoothing': 0.0582946831507724, 'num_epochs': 15, 'warmup_ratio': 0.11185140937752469, 'weight_decay': 0.10507500742800172, 'attention_dropout': 0.33779944126404315, 'hidden_dropout': 0.37190455241829984}. Best is trial 2 with value: 0.9297840127669785.


TRIAL 5 - TESTING THESE PARAMETERS:
Learning Rate:      4.31e-05
Epochs:             11
Warmup Ratio:       0.145
Weight Decay:       0.053
Attention Dropout:  0.351
Hidden Dropout:     0.302


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,No log,1.164158,0.603626,0.59463,0.611178,0.634909,0.698094,0.683694,0.713115,0.713115,0.555968,0.540206,0.572678,0.572678,0.67002,0.606005,0.749156,0.749156,0.471028,0.582659,0.395294,0.395294,0.690141,0.643326,0.744304,0.744304,0.512509,0.903144,0.764208
2,1.423000,0.968847,0.738352,0.738255,0.759181,0.739717,0.741339,0.849206,0.657787,0.657787,0.69828,0.667331,0.73224,0.73224,0.802152,0.855867,0.754781,0.754781,0.68551,0.696039,0.675294,0.675294,0.795872,0.727463,0.878481,0.878481,0.333027,0.936424,0.84956
3,1.038200,0.916609,0.78196,0.78117,0.77659,0.800121,0.779964,0.692063,0.893443,0.893443,0.703064,0.717045,0.689617,0.689617,0.838184,0.868516,0.809899,0.809899,0.760331,0.803493,0.721569,0.721569,0.841852,0.801833,0.886076,0.886076,0.27014,0.95272,0.88825
4,0.901000,0.863984,0.818453,0.819094,0.826925,0.820684,0.826667,0.827515,0.82582,0.82582,0.772801,0.763326,0.782514,0.782514,0.856629,0.917738,0.80315,0.80315,0.797246,0.778193,0.817255,0.817255,0.861059,0.847853,0.874684,0.874684,0.227909,0.956851,0.900344
5,0.785500,0.887449,0.811109,0.810791,0.814096,0.820875,0.826004,0.774194,0.885246,0.885246,0.760046,0.800484,0.723497,0.723497,0.854935,0.906683,0.808774,0.808774,0.786292,0.772315,0.800784,0.800784,0.85003,0.816803,0.886076,0.886076,0.233417,0.958917,0.901759
6,0.726300,0.895308,0.816387,0.816191,0.813086,0.828971,0.81659,0.742044,0.907787,0.907787,0.746252,0.758465,0.734426,0.734426,0.855781,0.910013,0.807649,0.807649,0.80863,0.824104,0.793725,0.793725,0.864602,0.830805,0.901266,0.901266,0.224007,0.962589,0.90869
7,0.694800,0.884183,0.821896,0.821501,0.819183,0.831924,0.826794,0.775583,0.885246,0.885246,0.763333,0.776271,0.75082,0.75082,0.853842,0.877672,0.831271,0.831271,0.809087,0.822528,0.796078,0.796078,0.869245,0.843862,0.896203,0.896203,0.215515,0.965343,0.911581
8,0.651500,0.909856,0.815469,0.813985,0.811213,0.831251,0.830332,0.772487,0.897541,0.897541,0.768091,0.802381,0.736612,0.736612,0.84637,0.846847,0.845894,0.845894,0.790928,0.833913,0.752157,0.752157,0.857814,0.800439,0.924051,0.924051,0.217581,0.969245,0.914801
9,0.627000,0.877915,0.827634,0.827244,0.832696,0.830999,0.846316,0.87013,0.82377,0.82377,0.786315,0.807604,0.76612,0.76612,0.828479,0.795855,0.863892,0.863892,0.816845,0.827697,0.806275,0.806275,0.878261,0.862195,0.894937,0.894937,0.200597,0.973606,0.919048
10,0.616000,0.916439,0.817305,0.81591,0.814112,0.832534,0.837607,0.780531,0.903689,0.903689,0.776498,0.82095,0.736612,0.736612,0.8434,0.83871,0.848144,0.848144,0.79233,0.82568,0.761569,0.761569,0.855279,0.804688,0.912658,0.912658,0.214138,0.970622,0.916491


Epoch 1: Train Loss: 0.0000, Val Loss: 1.1642, Val F1: 0.0000, QWK: 0.7642


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 0.9688, Val F1: 0.0000, QWK: 0.8496


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 0.9166, Val F1: 0.0000, QWK: 0.8882


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.0000, Val Loss: 0.8640, Val F1: 0.0000, QWK: 0.9003


  return forward_call(*args, **kwargs)


Epoch 5: Train Loss: 0.0000, Val Loss: 0.8874, Val F1: 0.0000, QWK: 0.9018


  return forward_call(*args, **kwargs)


Epoch 6: Train Loss: 0.0000, Val Loss: 0.8953, Val F1: 0.0000, QWK: 0.9087


  return forward_call(*args, **kwargs)


Epoch 7: Train Loss: 0.0000, Val Loss: 0.8842, Val F1: 0.0000, QWK: 0.9116


  return forward_call(*args, **kwargs)


Epoch 8: Train Loss: 0.0000, Val Loss: 0.9099, Val F1: 0.0000, QWK: 0.9148


  return forward_call(*args, **kwargs)


Epoch 9: Train Loss: 0.0000, Val Loss: 0.8779, Val F1: 0.0000, QWK: 0.9190


  return forward_call(*args, **kwargs)


Epoch 10: Train Loss: 0.0000, Val Loss: 0.9164, Val F1: 0.0000, QWK: 0.9165


  return forward_call(*args, **kwargs)


Epoch 11: Train Loss: 0.0000, Val Loss: 0.9180, Val F1: 0.0000, QWK: 0.9171


  return forward_call(*args, **kwargs)


Epoch 11: Train Loss: 0.7908, Val Loss: 0.9180, Val F1: 0.0000, QWK: 0.9171
✅ Trial checkpoint saved: ./checkpoints_roberta/trial_5/final_epoch_11.ckpt
✅ Model files saved in: ./checkpoints_roberta/trial_5
📊 Trial 5 score: 0.9171 (Best: 0.9298)
GPU Memory Used: 4.02 GB


0,1
Epoch,▁▂▂▃▄▅▅▆▇▇██
Learning_Rate,▁▁▁▁▁▁▁▁▁▁▁▁
Stage,▁▁▁▁▁▁▁▁▁▁▁▁
Train Accuracy,▁▁▁▁▁▁▁▁▁▁▁▁
Train Loss,▁▁▁▁▁▁▁▁▁▁▁█
Unfrozen_Layers,▁▁▁▁▁▁▁▁▁▁▁▁
Validation Accuracy,▁▅▇█▇███████
Validation Adjacent Accuracy,▁▄▆▆▇▇▇█████
Validation F1,▁▁▁▁▁▁▁▁▁▁▁▁
Validation Loss,█▃▂▁▂▂▁▂▁▂▂▂

0,1
Epoch,11.0
Learning_Rate,4e-05
Stage,1.0
Train Accuracy,0.0
Train Loss,0.79084
Unfrozen_Layers,12.0
Validation Accuracy,0.81662
Validation Adjacent Accuracy,0.97154
Validation F1,0.0
Validation Loss,0.91796


[I 2025-08-01 14:19:08,303] Trial 5 finished with value: 0.9170690001665863 and parameters: {'learning_rate': 4.310399711368251e-05, 'batch_size': 64, 'label_smoothing': 0.1443310085655225, 'num_epochs': 11, 'warmup_ratio': 0.1447948125423108, 'weight_decay': 0.05311444734390475, 'attention_dropout': 0.35114853217580355, 'hidden_dropout': 0.3018459114267167}. Best is trial 2 with value: 0.9297840127669785.


TRIAL 6 - TESTING THESE PARAMETERS:
Learning Rate:      4.74e-04
Epochs:             14
Warmup Ratio:       0.095
Weight Decay:       0.116
Attention Dropout:  0.364
Hidden Dropout:     0.365


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,1.2138,1.136578,0.591462,0.569042,0.627535,0.610007,0.576517,0.425097,0.895492,0.895492,0.354009,0.574939,0.255738,0.255738,0.759667,0.672444,0.872891,0.872891,0.619013,0.583739,0.658824,0.658824,0.51832,0.881459,0.367089,0.367089,0.527427,0.912325,0.75329
2,1.0969,1.109839,0.59697,0.577825,0.680736,0.579299,0.641473,0.608456,0.678279,0.678279,0.541958,0.580524,0.508197,0.508197,0.739742,0.772338,0.709786,0.709786,0.612443,0.499505,0.791373,0.791373,0.341969,0.942857,0.208861,0.208861,0.507918,0.908653,0.74621
3,1.3257,1.64997,0.242598,0.119674,0.130366,0.273474,0.412393,0.430804,0.395492,0.395492,0.0,0.0,0.0,0.0,0.36015,0.221028,0.971879,0.971879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.04751,0.731926,0.22849


Epoch 1: Train Loss: 0.0000, Val Loss: 1.1366, Val F1: 0.0000, QWK: 0.7533


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 1.1098, Val F1: 0.0000, QWK: 0.7462


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 1.6500, Val F1: 0.0000, QWK: 0.2285


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return forward_call(*args, **kwargs)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3: Train Loss: 1.2353, Val Loss: 1.6500, Val F1: 0.0000, QWK: 0.2285
✅ Trial checkpoint saved: ./checkpoints_roberta/trial_6/final_epoch_3.ckpt
✅ Model files saved in: ./checkpoints_roberta/trial_6
📊 Trial 6 score: 0.2285 (Best: 0.9298)
GPU Memory Used: 4.02 GB


0,1
Epoch,▁▅██
Learning_Rate,▁▁▁▁
Stage,▁▁▁▁
Train Accuracy,▁▁▁▁
Train Loss,▁▁▁█
Unfrozen_Layers,▁▁▁▁
Validation Accuracy,██▁▁
Validation Adjacent Accuracy,██▁▁
Validation F1,▁▁▁▁
Validation Loss,▁▁██

0,1
Epoch,3.0
Learning_Rate,0.00047
Stage,1.0
Train Accuracy,0.0
Train Loss,1.23531
Unfrozen_Layers,12.0
Validation Accuracy,0.2426
Validation Adjacent Accuracy,0.73193
Validation F1,0.0
Validation Loss,1.64997


[I 2025-08-01 14:23:35,085] Trial 6 finished with value: 0.22849048342050216 and parameters: {'learning_rate': 0.0004739120765367586, 'batch_size': 32, 'label_smoothing': 0.09736710601504883, 'num_epochs': 14, 'warmup_ratio': 0.09519916627929559, 'weight_decay': 0.11582745742381222, 'attention_dropout': 0.36400909462332215, 'hidden_dropout': 0.36514181786858063}. Best is trial 2 with value: 0.9297840127669785.


TRIAL 7 - TESTING THESE PARAMETERS:
Learning Rate:      5.91e-05
Epochs:             11
Warmup Ratio:       0.135
Weight Decay:       0.080
Attention Dropout:  0.353
Hidden Dropout:     0.325


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,No log,1.129369,0.615791,0.604197,0.631996,0.6479,0.724919,0.765376,0.688525,0.688525,0.58992,0.619735,0.562842,0.562842,0.702095,0.643258,0.772778,0.772778,0.452303,0.556064,0.381176,0.381176,0.681137,0.575546,0.834177,0.834177,0.482672,0.915768,0.789444
2,1.317600,0.991077,0.720909,0.720939,0.745613,0.727265,0.737778,0.805825,0.680328,0.680328,0.689802,0.674322,0.706011,0.706011,0.783387,0.906805,0.689539,0.689539,0.663221,0.672581,0.654118,0.654118,0.769479,0.668534,0.906329,0.906329,0.352077,0.935965,0.846916
3,0.996600,0.878306,0.794124,0.793935,0.791605,0.808704,0.790323,0.702229,0.903689,0.903689,0.714286,0.735805,0.693989,0.693989,0.846523,0.90629,0.794151,0.794151,0.78043,0.791768,0.769412,0.769412,0.851038,0.821934,0.882278,0.882278,0.257976,0.95272,0.891349
4,0.843700,0.840612,0.825109,0.825901,0.837022,0.822083,0.831557,0.866667,0.79918,0.79918,0.787783,0.760163,0.817486,0.817486,0.856814,0.876471,0.83802,0.83802,0.810997,0.790179,0.832941,0.832941,0.855826,0.891632,0.822785,0.822785,0.211613,0.965343,0.909468
5,0.746500,0.869154,0.816158,0.815879,0.820301,0.824423,0.841901,0.799263,0.889344,0.889344,0.77187,0.809353,0.737705,0.737705,0.847239,0.886839,0.811024,0.811024,0.795107,0.775541,0.815686,0.815686,0.84901,0.830508,0.868354,0.868354,0.225155,0.962359,0.90503
6,0.683400,0.867117,0.827404,0.827024,0.826466,0.837683,0.831418,0.780576,0.889344,0.889344,0.774554,0.791334,0.75847,0.75847,0.862605,0.919745,0.812148,0.812148,0.812054,0.821171,0.803137,0.803137,0.869203,0.819507,0.925316,0.925316,0.210695,0.965343,0.91299
7,0.635500,0.841963,0.837732,0.837375,0.842103,0.841086,0.85291,0.881838,0.82582,0.82582,0.803954,0.807947,0.8,0.8,0.855388,0.849224,0.861642,0.861642,0.822355,0.837398,0.807843,0.807843,0.87046,0.834107,0.910127,0.910127,0.190498,0.974294,0.922602
8,0.602200,0.869107,0.835437,0.834571,0.833042,0.846687,0.856589,0.8125,0.905738,0.905738,0.79226,0.826603,0.760656,0.760656,0.851811,0.831726,0.872891,0.872891,0.821774,0.845643,0.799216,0.799216,0.871226,0.848739,0.894937,0.894937,0.192564,0.974524,0.923341
9,0.572000,0.875462,0.832683,0.832048,0.831174,0.841831,0.850643,0.82218,0.881148,0.881148,0.787913,0.807339,0.769399,0.769399,0.842802,0.82684,0.859393,0.859393,0.821443,0.844942,0.799216,0.799216,0.876695,0.854567,0.9,0.9,0.195547,0.974065,0.922107
10,0.560200,0.898477,0.826945,0.825724,0.822717,0.84068,0.84542,0.791071,0.907787,0.907787,0.77937,0.819277,0.743169,0.743169,0.844199,0.829533,0.859393,0.859393,0.813573,0.849701,0.780392,0.780392,0.866066,0.824,0.912658,0.912658,0.201744,0.973835,0.921099


Epoch 1: Train Loss: 0.0000, Val Loss: 1.1294, Val F1: 0.0000, QWK: 0.7894


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 0.9911, Val F1: 0.0000, QWK: 0.8469


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 0.8783, Val F1: 0.0000, QWK: 0.8913


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.0000, Val Loss: 0.8406, Val F1: 0.0000, QWK: 0.9095


  return forward_call(*args, **kwargs)


Epoch 5: Train Loss: 0.0000, Val Loss: 0.8692, Val F1: 0.0000, QWK: 0.9050


  return forward_call(*args, **kwargs)


Epoch 6: Train Loss: 0.0000, Val Loss: 0.8671, Val F1: 0.0000, QWK: 0.9130


  return forward_call(*args, **kwargs)


Epoch 7: Train Loss: 0.0000, Val Loss: 0.8420, Val F1: 0.0000, QWK: 0.9226


  return forward_call(*args, **kwargs)


Epoch 8: Train Loss: 0.0000, Val Loss: 0.8691, Val F1: 0.0000, QWK: 0.9233


  return forward_call(*args, **kwargs)


Epoch 9: Train Loss: 0.0000, Val Loss: 0.8755, Val F1: 0.0000, QWK: 0.9221


  return forward_call(*args, **kwargs)


Epoch 10: Train Loss: 0.0000, Val Loss: 0.8985, Val F1: 0.0000, QWK: 0.9211


  return forward_call(*args, **kwargs)


Epoch 10: Train Loss: 0.7548, Val Loss: 0.8985, Val F1: 0.0000, QWK: 0.9211
✅ Trial checkpoint saved: ./checkpoints_roberta/trial_7/final_epoch_10.ckpt
✅ Model files saved in: ./checkpoints_roberta/trial_7
📊 Trial 7 score: 0.9211 (Best: 0.9298)
GPU Memory Used: 4.02 GB


0,1
Epoch,▁▂▃▃▄▅▆▆▇██
Learning_Rate,▁▁▁▁▁▁▁▁▁▁▁
Stage,▁▁▁▁▁▁▁▁▁▁▁
Train Accuracy,▁▁▁▁▁▁▁▁▁▁▁
Train Loss,▁▁▁▁▁▁▁▁▁▁█
Unfrozen_Layers,▁▁▁▁▁▁▁▁▁▁▁
Validation Accuracy,▁▄▇█▇██████
Validation Adjacent Accuracy,▁▃▅▇▇▇█████
Validation F1,▁▁▁▁▁▁▁▁▁▁▁
Validation Loss,█▅▂▁▂▂▁▂▂▂▂

0,1
Epoch,10.0
Learning_Rate,6e-05
Stage,1.0
Train Accuracy,0.0
Train Loss,0.75478
Unfrozen_Layers,12.0
Validation Accuracy,0.82695
Validation Adjacent Accuracy,0.97384
Validation F1,0.0
Validation Loss,0.89848


[I 2025-08-01 14:36:47,626] Trial 7 finished with value: 0.9210994149655075 and parameters: {'learning_rate': 5.912385786218203e-05, 'batch_size': 64, 'label_smoothing': 0.1340152351552511, 'num_epochs': 11, 'warmup_ratio': 0.13457532845956638, 'weight_decay': 0.07983415554786308, 'attention_dropout': 0.3533393048077859, 'hidden_dropout': 0.32510017157630366}. Best is trial 2 with value: 0.9297840127669785.


Best hyperparameters saved to best_params_roberta.json
Best score: 0.9298
Best params: {'learning_rate': 0.0002410038500292598, 'batch_size': 32, 'label_smoothing': 0.09079127901743848, 'num_epochs': 11, 'warmup_ratio': 0.14435680747215307, 'weight_decay': 0.0849441280799004, 'attention_dropout': 0.3758622805072069, 'hidden_dropout': 0.3533269312284606}
Completed 8/8 trials


In [22]:
import time
import os

def calculate_model_metrics(model_path="./best_model_so_far"):
    """Calculate inference time and model size"""

    # 1. Calculate Model Size
    def get_model_size_mb(path):
        total_size = 0
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                total_size += os.path.getsize(filepath)
        return total_size / (1024 * 1024)  # Convert to MB

    model_size_mb = get_model_size_mb(model_path)

    # 2. Calculate Inference Time
    def measure_inference_time():
        # Load model for timing
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        tokenizer_for_timing = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base")

        if device.type == "cuda":
            model = model.to(device)
        model.eval()

        # Sample text for timing
        sample_text = "COVID vaccines are helping to reduce hospitalizations significantly."

        # Warm-up runs (don't count these)
        for _ in range(3):
            inputs = tokenizer_for_timing(sample_text, return_tensors="pt", truncation=True, padding=True)
            if device.type == "cuda":
                inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                _ = model(**inputs)

        # Actual timing runs
        times = []
        num_runs = 10

        for _ in range(num_runs):
            inputs = tokenizer_for_timing(sample_text, return_tensors="pt", truncation=True, padding=True)
            if device.type == "cuda":
                inputs = {k: v.to(device) for k, v in inputs.items()}

            start_time = time.time()
            with torch.no_grad():
                _ = model(**inputs)
            end_time = time.time()

            times.append(end_time - start_time)

        # Return average inference time
        avg_inference_time = sum(times) / len(times)
        return avg_inference_time

    inference_time = measure_inference_time()

    return model_size_mb, inference_time

# Load the best model
best_model = AutoModelForSequenceClassification.from_pretrained(best_model_path)

# Create trainer for test evaluation
test_trainer = Trainer(
    model=best_model,
    args=TrainingArguments(
        output_dir="./temp",
        per_device_eval_batch_size=16,
        remove_unused_columns=False,
    ),
    tokenizer=tokenizer,
    compute_metrics=compute_detailed_metrics,  # Use the enhanced function!
)

# Evaluate on test set
test_results = test_trainer.evaluate(test_dataset)

# Calculate performance metrics
model_size_mb, inference_time_sec = calculate_model_metrics(best_model_path)

print("FINAL TEST SET EVALUATION - COMPREHENSIVE ANALYSIS")

# Standard Classification Metrics
print(test_results)
print("\nOVERALL CLASSIFICATION METRICS:")
print(f"Accuracy:           {test_results['eval_accuracy']:.4f}")
print(f"Validation QWK:     {test_results['eval_quadratic_weighted_kappa']:.4f}")
print(f"F1:                 {test_results['eval_f1']:.4f}")
print(f"Precision-Macro:    {test_results['eval_precision']:.4f}")
print(f"Recall-Macro:       {test_results['eval_recall']:.4f}")

# Ordinal-Aware Metrics
print("\nORDINAL-AWARE METRICS:")
print(f"Mean Absolute Error:        {test_results['eval_mae']:.4f}")
print(f"Adjacent Accuracy:          {test_results['eval_adjacent_accuracy']:.4f}")
print(f"Quadratic Weighted Kappa:   {test_results['eval_quadratic_weighted_kappa']:.4f}")

# Performance Metrics
print("\nPERFORMANCE METRICS:")
print(f"Inference Time:             {inference_time_sec:.4f} sec")
print(f"Model Size:                 {model_size_mb:.1f} MB")

# Per-Class Detailed Analysis (PRESERVED)
print("\nPER-CLASS PERFORMANCE ANALYSIS:")
sentiment_classes = ["extremely_negative", "negative", "neutral", "positive", "extremely_positive"]
class_display_names = ["Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive"]

for i, (class_key, class_name) in enumerate(zip(sentiment_classes, class_display_names)):
    print(f"\n{class_name}:")
    print(f"  F1-Score:   {test_results.get(f'eval_f1_{class_key}', 0):.4f}")
    print(f"  Precision:  {test_results.get(f'eval_precision_{class_key}', 0):.4f}")
    print(f"  Recall:     {test_results.get(f'eval_accuracy_{class_key}', 0):.4f}")
    print(f"  Accuracy:   {test_results.get(f'eval_accuracy_{class_key}', 0):.4f}")

# Performance Analysis (PRESERVED + ENHANCED)
print("\nPERFORMANCE INSIGHTS:")
print(f"• MAE {test_results['eval_mae']:.2f}: On average off by {test_results['eval_mae']:.2f} sentiment levels")
print(f"• Adjacent Accuracy {test_results['eval_adjacent_accuracy']:.1%}: Predictions within 1 sentiment level")
print(f"• QWK {test_results['eval_quadratic_weighted_kappa']:.3f}: {'Excellent' if test_results['eval_quadratic_weighted_kappa'] > 0.8 else 'Good' if test_results['eval_quadratic_weighted_kappa'] > 0.6 else 'Moderate'} ordinal agreement")
# Added inference speed and efficiency insights
print(f"• Inference Speed: {1/inference_time_sec:.1f} predictions per second")
print(f"• Model Efficiency: {model_size_mb:.1f} MB storage required")

# Class Performance Analysis (PRESERVED)
f1_scores = [test_results.get(f'eval_f1_{class_key}', 0) for class_key in sentiment_classes]
best_class_idx = f1_scores.index(max(f1_scores))
worst_class_idx = f1_scores.index(min(f1_scores))

print(f"\nCLASS-SPECIFIC INSIGHTS:")
print(f"• Best performing class: {class_display_names[best_class_idx]} (F1: {f1_scores[best_class_idx]:.4f})")
print(f"• Most challenging class: {class_display_names[worst_class_idx]} (F1: {f1_scores[worst_class_idx]:.4f})")

# COVID-specific insights (PRESERVED)
extreme_avg = (f1_scores[0] + f1_scores[4]) / 2  # extremely negative + extremely positive
moderate_avg = (f1_scores[1] + f1_scores[3]) / 2  # negative + positive
neutral_score = f1_scores[2]

print(f"\nCOVID SENTIMENT INSIGHTS:")
print(f"• Extreme emotions (avg F1: {extreme_avg:.3f}): {'Challenging' if extreme_avg < 0.7 else 'Well-handled'}")
print(f"• Moderate emotions (avg F1: {moderate_avg:.3f}): {'Needs work' if moderate_avg < 0.75 else 'Good performance'}")
print(f"• Neutral sentiment (F1: {neutral_score:.3f}): {'Difficult to detect' if neutral_score < 0.8 else 'Well-identified'}")

# Final Summary with Key Metrics
print(f"\n📊 FINAL SUMMARY:")
print(f"F1: {test_results['eval_f1']:.4f} | QWK: {test_results['eval_quadratic_weighted_kappa']:.4f} | Inference: {inference_time_sec:.4f}s | Size: {model_size_mb:.1f}MB")

  test_trainer = Trainer(
  return forward_call(*args, **kwargs)


config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  return forward_call(*args, **kwargs)


FINAL TEST SET EVALUATION - COMPREHENSIVE ANALYSIS
{'eval_loss': 0.5976839065551758, 'eval_model_preparation_time': 0.0031, 'eval_accuracy': 0.833820093457944, 'eval_f1': 0.8329810902269634, 'eval_precision': 0.8310268173965134, 'eval_recall': 0.8477128360447125, 'eval_f1_extremely_negative': 0.8459770114942529, 'eval_precision_extremely_negative': 0.8, 'eval_recall_extremely_negative': 0.8975609756097561, 'eval_accuracy_extremely_negative': 0.8975609756097561, 'eval_f1_negative': 0.8036410923276983, 'eval_precision_negative': 0.8524137931034482, 'eval_recall_negative': 0.7601476014760148, 'eval_accuracy_negative': 0.7601476014760148, 'eval_f1_neutral': 0.8392101551480959, 'eval_precision_neutral': 0.7839262187088274, 'eval_recall_neutral': 0.9028831562974203, 'eval_accuracy_neutral': 0.9028831562974203, 'eval_f1_positive': 0.8244358833241607, 'eval_precision_positive': 0.8679026651216686, 'eval_recall_positive': 0.7851153039832285, 'eval_accuracy_positive': 0.7851153039832285, 'eval_f