## BERT fine-tuning + hyperparameter tuning

#### Setup

In [1]:
!pip install -q optuna
!pip install -q evaluate
!pip install -q emoji==0.6.0

In [2]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from torch import nn
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import (
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainerCallback,
    EarlyStoppingCallback
)
import evaluate
import torch.nn.functional as F
import optuna
import wandb

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
wandb.login(key="<wandb key>")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmayachn3[0m ([33mmayachn3-maya-bondar[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Data Preparation
### Load Dataset

In [6]:
train = pd.read_csv("OOT_train.csv", encoding='latin-1')
val = pd.read_csv("OOT_val.csv", encoding='latin-1')
test = pd.read_csv("OOT_test.csv", encoding='latin-1')


In [7]:
# train = train.head(1000)
# val = val.head(1000)
# test = test.head(1000)

### Preprocessing

In [8]:
#encoding the labels numerically from Sentiment

ordinal_mapping = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4
}

# map to ordinal labels
train["ordinal_label_id"] = train["Sentiment"].map(ordinal_mapping)
val["ordinal_label_id"] = val["Sentiment"].map(ordinal_mapping)
test["ordinal_label_id"] = test["Sentiment"].map(ordinal_mapping)



In [9]:
# Concat the relevant columns into one string with seperation.
# for example: "Tweet: my food stock is low | Location: Canada | Date: 2020-03-17 | URL: https://t.co/abcd"

# Function to build the input string from multiple columns
def build_augmented_input(row):
    parts = []

    if pd.notna(row.get('clean_tweet')):
        parts.append(f"{row['clean_tweet']}")

    if pd.notna(row.get('Location_standardized')) and row['Location_standardized'].lower() != 'unknown':
        parts.append(f"{row['Location_standardized']}")

    if pd.notna(row.get('TweetAt')):
        parts.append(f"{row['TweetAt']}")


    return " | ".join(parts)

# Apply to the DataFrames
train['model_input'] = train.apply(build_augmented_input, axis=1)
val['model_input'] = val.apply(build_augmented_input, axis=1)
test['model_input'] = test.apply(build_augmented_input, axis=1)

# Create  new DataFrames with only what's needed for modeling
formatted_train = train[['model_input', 'ordinal_label_id']].copy()
formatted_val = val[['model_input', 'ordinal_label_id']].copy()
formatted_test = test[['model_input', 'ordinal_label_id']].copy()



In [10]:
def balance_dataset(df, target_samples_per_class=5000):
    """Balance dataset by undersampling"""
    balanced_dfs = []

    print("Original class distribution:")
    print(df['ordinal_label_id'].value_counts().sort_index())

    for class_id in range(5):
        class_data = df[df['ordinal_label_id'] == class_id]

        if len(class_data) > target_samples_per_class:
            class_data = class_data.sample(n=target_samples_per_class, random_state=42)
            print(f"Class {class_id}: {len(class_data)} samples (undersampled)")
        else:
            print(f"Class {class_id}: {len(class_data)} samples (kept all)")

        balanced_dfs.append(class_data)

    balanced_df = pd.concat(balanced_dfs, ignore_index=True).sample(frac=1, random_state=42)

    print(f"Balanced dataset: {len(balanced_df)} total samples")
    print("New distribution:")
    print(balanced_df['ordinal_label_id'].value_counts().sort_index())

    return balanced_df

# Apply balancing to training data
formatted_train = balance_dataset(formatted_train, target_samples_per_class=5000)

Original class distribution:
ordinal_label_id
0     5175
1     9230
2     6784
3    10140
4     5845
Name: count, dtype: int64
Class 0: 5000 samples (undersampled)
Class 1: 5000 samples (undersampled)
Class 2: 5000 samples (undersampled)
Class 3: 5000 samples (undersampled)
Class 4: 5000 samples (undersampled)
Balanced dataset: 25000 total samples
New distribution:
ordinal_label_id
0    5000
1    5000
2    5000
3    5000
4    5000
Name: count, dtype: int64


### Tokenization

## Model
### Define Model

In [11]:
model_name = "finiteautomata/bertweet-base-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_data(data, max_length=128):
    return tokenizer(
        data['model_input'].tolist(),
        truncation=True,
        padding=False,
        max_length=max_length,
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=False
    )

train_encodings = tokenize_data(formatted_train)
val_encodings = tokenize_data(formatted_val)
test_encodings = tokenize_data(formatted_test)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
## define a PyTorch Dataset
class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels  # Should be integers

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])  # For training
        return item

    def __len__(self):
        return len(self.labels)

# Convert labels to integers if not already
train_labels = formatted_train['ordinal_label_id'].tolist()
val_labels = formatted_val['ordinal_label_id'].tolist()
test_labels = formatted_test['ordinal_label_id'].tolist()


train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)
test_dataset = TweetDataset(test_encodings, test_labels)


In [13]:
# define mapping between label id and sentiment for later use and conveniency
ordinal_label2id = ordinal_mapping
ordinal_id2label = {v: k for k, v in ordinal_mapping.items()}

In [14]:
def compute_detailed_metrics(eval_pred):
    """Enhanced metrics using HuggingFace Evaluate library"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Load HuggingFace metrics (cached after first load)
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")

    # Compute standard classification metrics
    results = {}

    # Basic metrics
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average='macro'))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average='weighted'))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average='macro'))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average='macro'))

    # Per-class F1 scores (HF doesn't have this built-in, so keep custom)
    f1_per_class = f1_score(labels, predictions, average=None)
    for i, class_name in enumerate(['extremely_negative', 'negative', 'neutral', 'positive', 'extremely_positive']):
        results[f'f1_{class_name}'] = f1_per_class[i]

        # Per-class precision and recall
        precision_per_class = precision_score(labels, predictions, average=None, zero_division=0)
        recall_per_class = recall_score(labels, predictions, average=None, zero_division=0)
        results[f'precision_{class_name}'] = precision_per_class[i]
        results[f'recall_{class_name}'] = recall_per_class[i]

        # Per-class accuracy
        class_mask = (labels == i)
        if class_mask.sum() > 0:
            results[f'accuracy_{class_name}'] = accuracy_score(labels[class_mask], predictions[class_mask])
        else:
            results[f'accuracy_{class_name}'] = 0.0

    # Custom ordinal metrics (HF doesn't have these)
    results['mae'] = np.mean(np.abs(predictions - labels))
    results['adjacent_accuracy'] = np.sum(np.abs(predictions - labels) <= 1) / len(labels)

    # Quadratic Weighted Kappa (custom)
    from sklearn.metrics import cohen_kappa_score
    try:
        qwk = cohen_kappa_score(labels, predictions, weights='quadratic')
        results['quadratic_weighted_kappa'] = qwk
    except:
        results['quadratic_weighted_kappa'] = 0.0

    return results

In [15]:
def find_optimal_batch_size(base_batch_size):
    """Find the largest batch size that fits in GPU memory"""
    if device.type == "cpu":
        return base_batch_size

    # Try larger batch sizes for GPU
    for multiplier in [4, 3, 2, 1]:
        try_batch_size = base_batch_size * multiplier
        try:
            # Test if this batch size fits
            dummy_input = torch.randn(try_batch_size, 128, 768, device=device)
            dummy_output = torch.randn(try_batch_size, 5, device=device)
            del dummy_input, dummy_output
            torch.cuda.empty_cache() if device.type == "cuda" else None
            return try_batch_size
        except RuntimeError:  # Out of memory
            continue
    return base_batch_size

In [16]:
class SimpleMetricsLogger(TrainerCallback):
    """Simple callback to log detailed metrics every epoch"""

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is None or not wandb.run:
            return

        # Only log when we have evaluation metrics (after each epoch)
        if 'eval_loss' in logs:
            current_epoch = int(state.epoch)

            # Get current learning rate
            current_lr = args.learning_rate
            if 'learning_rate' in logs:
                current_lr = logs['learning_rate']

            #Get training loss from state history
            train_loss = 0
            if state.log_history:
                # Find the most recent training loss
                for log_entry in reversed(state.log_history):
                    if 'train_loss' in log_entry:
                        train_loss = log_entry['train_loss']
                        break

            detailed_metrics = {
                "Epoch": current_epoch,
                "Stage": 1,
                "Unfrozen_Layers": 12,
                "Train Loss": train_loss,
                "Train Accuracy": 0,  # Usually not computed during training
                "Validation Loss": logs.get('eval_loss', 0),
                "Validation Accuracy": logs.get('eval_accuracy', 0),
                "Validation Precision": logs.get('eval_precision_macro', 0),
                "Validation Recall": logs.get('eval_recall_macro', 0),
                "Validation F1": logs.get('eval_f1_macro', 0),
                "Validation MAE": logs.get('eval_mae', 0),
                "Validation Adjacent Accuracy": logs.get('eval_adjacent_accuracy', 0),
                "Validation QWK": logs.get('eval_quadratic_weighted_kappa', 0),
                "Learning_Rate": current_lr,
            }

            # Log to WandB
            wandb.log(detailed_metrics)

            #Print progress to console
            print(f"Epoch {current_epoch}: "
                  f"Train Loss: {train_loss:.4f}, "  # ← Now shows real values
                  f"Val Loss: {logs.get('eval_loss', 0):.4f}, "
                  f"Val F1: {logs.get('eval_f1_macro', 0):.4f}, "
                  f"QWK: {logs.get('eval_quadratic_weighted_kappa', 0):.4f}")

In [17]:
def save_training_checkpoint(model, optimizer, epoch, loss, trial_params, filepath, trial_number, current_score, trainer):
    """Save complete training checkpoint and handle best model updates"""
    global best_score, best_model_path

    # Get the trial directory from filepath
    trial_dir = os.path.dirname(filepath)

    # Save trial checkpoint
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        # 'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
        'trial_params': trial_params,
        'model_config': model.config.to_dict(),
        'tokenizer_name': model_name,
        'current_score': current_score,  # Add score to checkpoint
        'trial_number': trial_number,
    }

    # Save all trial files in the same directory
    torch.save(checkpoint, filepath)
    torch.save(model.state_dict(), os.path.join(trial_dir, 'model_bert_weights.pt'))
    torch.save(model, os.path.join(trial_dir, 'model_bert.pt'))

    print(f"✅ Trial checkpoint saved: {filepath}")
    print(f"✅ Model files saved in: {trial_dir}")

    # Update best model if needed
    if current_score > best_score:
        best_score = current_score

        # Save HuggingFace format to best model directory
        trainer.save_model(best_model_path)

        # Also save our custom format in best model directory
        os.makedirs(best_model_path, exist_ok=True)
        best_checkpoint_path = os.path.join(best_model_path, 'best_checkpoint.ckpt')
        best_weights_path = os.path.join(best_model_path, 'model_bert_weights.pt')
        best_model_file_path = os.path.join(best_model_path, 'model_bert.pt')

        torch.save(checkpoint, best_checkpoint_path)
        torch.save(model.state_dict(), best_weights_path)
        torch.save(model, best_model_file_path)

        print(f"🏆 New best model saved! Score: {current_score:.4f} (Trial {trial_number})")
        print(f"🏆 Best model files saved in: {best_model_path}")

        # Optional: Log to W&B
        # wandb.log({
        #     "best_score_so_far": current_score,
        #     "best_trial_number": trial_number,
        # })
    else:
        print(f"📊 Trial {trial_number} score: {current_score:.4f} (Best: {best_score:.4f})")

In [18]:
# Global variables to track best model
best_score = 0.0
best_model_path = "./best_bert_model_so_far"

### Hyperparameter Tuning

In [19]:
def objective(trial):
    """Clean, organized objective function for Optuna hyperparameter optimization"""
    global best_score, best_model_path

    # === GPU MEMORY CLEANUP ===
    if device.type == "cuda":
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

    # === HYPERPARAMETER SAMPLING ===
    # Core training parameters
    learning_rate = trial.suggest_float("learning_rate", 3e-5, 5e-4, log=True)
    base_batch_size = trial.suggest_categorical("batch_size", [16,32,64])
    label_smoothing = trial.suggest_float("label_smoothing", 0.05, 0.15)
    num_epochs = trial.suggest_int("num_epochs", 10, 15)

    # Advanced optimization parameters
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.05, 0.15)
    weight_decay = trial.suggest_float("weight_decay", 0.05, 0.15)

    # Model architecture parameters
    attention_dropout = trial.suggest_float("attention_dropout", 0.3, 0.4)
    hidden_dropout = trial.suggest_float("hidden_dropout", 0.3, 0.4)

    #R-drop parameter
    # rdrop_alpha = trial.suggest_float("rdrop_alpha", 0.0, 1.0)

    # PRINT CHOSEN PARAMETERS
    print(f"TRIAL {trial.number} - TESTING THESE PARAMETERS:")
    print(f"Learning Rate:      {learning_rate:.2e}")
    print(f"Epochs:             {num_epochs}")
    print(f"Warmup Ratio:       {warmup_ratio:.3f}")
    print(f"Weight Decay:       {weight_decay:.3f}")
    print(f"Attention Dropout:  {attention_dropout:.3f}")
    print(f"Hidden Dropout:     {hidden_dropout:.3f}")


    # Optimize batch size for available hardware
    batch_size = find_optimal_batch_size(base_batch_size)


    # === EXPERIMENT TRACKING SETUP ===
    wandb.init(
        project="covid-tweet-sentiment-hf-bert-regularloss",
        name=f"trial_{trial.number}",
        config={
            "learning_rate": learning_rate,
            "batch_size": batch_size,
            "num_epochs": num_epochs,
            "warmup_ratio": warmup_ratio,
            "weight_decay": weight_decay,
            "attention_dropout": attention_dropout,
            "hidden_dropout": hidden_dropout
            # "rdrop_alpha": rdrop_alpha ,
        },
        reinit=True
    )

    try:
        # === MODEL SETUP ===
        model = _setup_model(attention_dropout, hidden_dropout)

        # === TRAINING CONFIGURATION ===
        training_args = _create_training_args(
            trial_number=trial.number,
            learning_rate=learning_rate,
            batch_size=batch_size,
            num_epochs=num_epochs,
            warmup_ratio=warmup_ratio,
            weight_decay=weight_decay,
            label_smoothing_factor=label_smoothing
        )

        trial_params = {
            "learning_rate": learning_rate,
            "batch_size": batch_size,
            "num_epochs": num_epochs,
            "warmup_ratio": warmup_ratio,
            "weight_decay": weight_decay,
            "attention_dropout": attention_dropout,
            "hidden_dropout": hidden_dropout
        }

        trainer = _create_trainer(model, training_args, trial.number, trial_params)
        trainer.train()

        # Set Checkpoint per trial
        checkpoint_dir = f"./checkpoints_bert/trial_{trial.number}"
        os.makedirs(checkpoint_dir, exist_ok=True)

        final_epoch = int(trainer.state.epoch)
        checkpoint_path = f"{checkpoint_dir}/final_epoch_{final_epoch}.ckpt"

        # Get final training loss
        final_loss = 0
        if trainer.state.log_history:
            for log_entry in reversed(trainer.state.log_history):
                if 'train_loss' in log_entry:
                    final_loss = log_entry['train_loss']
                    break

        eval_results = trainer.evaluate()
        current_score = eval_results["eval_quadratic_weighted_kappa"]

        save_training_checkpoint(
            model=trainer.model,
            optimizer=None,
            epoch=final_epoch,
            loss=final_loss,
            trial_params=trial_params,
            filepath=checkpoint_path,
            trial_number=trial.number,
            current_score=current_score,
            trainer=trainer
        )

        # Log GPU usage if available
        if device.type == "cuda":
            print(f"GPU Memory Used: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")

        return current_score

    except Exception as e:
        print(f"Trial {trial.number} failed: {e}")
        raise optuna.exceptions.TrialPruned()

    finally:
        # === CLEANUP ===
        if device.type == "cuda":
            torch.cuda.empty_cache()
        wandb.finish()


def _setup_model(attention_dropout, hidden_dropout):
    """Setup and configure the model with dropout and freezing"""
    model = AutoModelForSequenceClassification.from_pretrained(
        "finiteautomata/bertweet-base-sentiment-analysis",
        num_labels=5,
        id2label=ordinal_id2label,
        label2id=ordinal_label2id,
        ignore_mismatched_sizes=True
    )

    # Apply dropout configuration
    model.config.attention_probs_dropout_prob = attention_dropout
    model.config.hidden_dropout_prob = hidden_dropout

    # GPU optimizations
    if device.type == "cuda":
        model.gradient_checkpointing_enable()

        model.to(device)

    return model


def _create_training_args(trial_number, learning_rate, batch_size, num_epochs, warmup_ratio, weight_decay, label_smoothing_factor=0.1):
    """Create optimized training arguments"""
    return TrainingArguments(
        # output_dir=f"./results/trial_{trial_number}",

        # Core training parameters
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,  # Larger eval batch
        learning_rate=learning_rate,
        label_smoothing_factor=label_smoothing_factor,

        # Learning rate scheduling
        lr_scheduler_type="cosine",
        warmup_ratio=warmup_ratio,

        # Optimization
        optim="adamw_torch",
        weight_decay=weight_decay,
        max_grad_norm=1.0,

        # Evaluation and saving
        eval_strategy="epoch",
        save_strategy="no",
        save_total_limit=1,
        # load_best_model_at_end=True,
        metric_for_best_model="eval_quadratic_weighted_kappa",
        greater_is_better=True,

        # Performance optimizations
        fp16=device.type == "cuda",
        # tf32=device.type == "cuda",
        dataloader_pin_memory=True,
        dataloader_persistent_workers=device.type == "cuda",
        dataloader_num_workers=2 if device.type == "cuda" else 0,
        dataloader_drop_last=False,
        group_by_length=True,
        gradient_accumulation_steps=1,
        dataloader_prefetch_factor=2 if device.type == "cuda" else None,

        # Logging
        logging_steps=100,
        report_to="wandb",
        remove_unused_columns=False,

        # Evaluation optimizations
        eval_accumulation_steps=None,
        prediction_loss_only=False,

    )

def _create_trainer(model, training_args, trial_number, trial_params):
    """ordinal loss trainer"""

    callbacks = [
        EarlyStoppingCallback(early_stopping_patience=2),
        SimpleMetricsLogger(),
        ]

    return Trainer(  # Uses ordinal loss always
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_detailed_metrics,
        tokenizer=tokenizer,
        callbacks=callbacks,
    )

In [20]:
from datetime import datetime
import json

def save_best_hyperparameters(study, model_name="bertweet_base"):
    """Save the best hyperparameters found by Optuna"""

    # Check if any trial completed successfully
    completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
    if not completed_trials:
        print("No completed trials found. Skipping saving best hyperparameters.")
        return None

    best_params = study.best_params
    best_score = study.best_value

    # Save to JSON file
    results = {
        "model_name": model_name,
        "best_score": best_score,
        "best_params": best_params,
        "timestamp": str(datetime.now()),
        "total_trials": len(study.trials),
        "completed_trials": len(completed_trials)
    }

    filename = f"best_params_{model_name}.json"
    with open(filename, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"Best hyperparameters saved to {filename}")
    print(f"Best score: {best_score:.4f}")
    print(f"Best params: {best_params}")
    print(f"Completed {len(completed_trials)}/{len(study.trials)} trials")

    return results

## Training

In [21]:
# Run optimization

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=8)

bert_results = save_best_hyperparameters(study, "bertweet_base")


[I 2025-08-01 07:39:41,030] A new study created in memory with name: no-name-13c69f66-8008-465f-bdf4-85ddf2eaf009


TRIAL 0 - TESTING THESE PARAMETERS:
Learning Rate:      3.36e-05
Epochs:             12
Warmup Ratio:       0.079
Weight Decay:       0.077
Attention Dropout:  0.301
Hidden Dropout:     0.327




Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,1.1285,1.017624,0.711269,0.71139,0.743204,0.712159,0.760246,0.760246,0.760246,0.760246,0.666316,0.6417,0.692896,0.692896,0.771612,0.73401,0.813273,0.813273,0.683327,0.651494,0.718431,0.718431,0.710938,0.928571,0.575949,0.575949,0.359422,0.934588,0.833697
2,0.9114,0.890793,0.798715,0.798142,0.795248,0.815284,0.79516,0.687593,0.942623,0.942623,0.719491,0.764128,0.679781,0.679781,0.851088,0.866978,0.835771,0.835771,0.785091,0.793905,0.776471,0.776471,0.852564,0.863636,0.841772,0.841772,0.249943,0.956162,0.894932
3,0.7783,0.852766,0.817535,0.81817,0.839895,0.803928,0.788372,0.91129,0.694672,0.694672,0.772679,0.754953,0.791257,0.791257,0.867991,0.902795,0.835771,0.835771,0.802651,0.756419,0.854902,0.854902,0.858247,0.874016,0.843038,0.843038,0.226303,0.958228,0.898681
4,0.7252,0.895241,0.810879,0.810341,0.816922,0.824504,0.833333,0.807692,0.860656,0.860656,0.775185,0.808789,0.744262,0.744262,0.872226,0.934447,0.817773,0.817773,0.769604,0.793995,0.746667,0.746667,0.832965,0.739686,0.953165,0.953165,0.230663,0.960982,0.906755
5,0.6825,0.938338,0.79642,0.795675,0.805892,0.815249,0.836257,0.797398,0.879098,0.879098,0.775652,0.825926,0.731148,0.731148,0.856624,0.926702,0.7964,0.7964,0.742834,0.777207,0.711373,0.711373,0.810493,0.702226,0.958228,0.958228,0.243516,0.963277,0.904482
6,0.6401,0.867103,0.835437,0.835753,0.833765,0.842702,0.83871,0.780919,0.905738,0.905738,0.776087,0.771892,0.780328,0.780328,0.854848,0.872365,0.83802,0.83802,0.837998,0.842314,0.833725,0.833725,0.877922,0.901333,0.855696,0.855696,0.193023,0.972688,0.923897
7,0.6109,0.917472,0.823502,0.822853,0.815695,0.837953,0.804233,0.705882,0.934426,0.934426,0.745958,0.790698,0.706011,0.706011,0.861853,0.871264,0.852643,0.852643,0.822437,0.847049,0.799216,0.799216,0.880199,0.863581,0.897468,0.897468,0.2084,0.970163,0.918732
8,0.5929,0.904674,0.826027,0.825156,0.830274,0.827575,0.830472,0.871622,0.793033,0.793033,0.781232,0.809133,0.755191,0.755191,0.840737,0.810867,0.872891,0.872891,0.816294,0.83157,0.801569,0.801569,0.869513,0.828179,0.91519,0.91519,0.200367,0.975442,0.920538


Epoch 1: Train Loss: 0.0000, Val Loss: 1.0176, Val F1: 0.0000, QWK: 0.8337


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 0.8908, Val F1: 0.0000, QWK: 0.8949


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 0.8528, Val F1: 0.0000, QWK: 0.8987


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.0000, Val Loss: 0.8952, Val F1: 0.0000, QWK: 0.9068


  return forward_call(*args, **kwargs)


Epoch 5: Train Loss: 0.0000, Val Loss: 0.9383, Val F1: 0.0000, QWK: 0.9045


  return forward_call(*args, **kwargs)


Epoch 6: Train Loss: 0.0000, Val Loss: 0.8671, Val F1: 0.0000, QWK: 0.9239


  return forward_call(*args, **kwargs)


Epoch 7: Train Loss: 0.0000, Val Loss: 0.9175, Val F1: 0.0000, QWK: 0.9187


  return forward_call(*args, **kwargs)


Epoch 8: Train Loss: 0.0000, Val Loss: 0.9047, Val F1: 0.0000, QWK: 0.9205


  return forward_call(*args, **kwargs)


Epoch 8: Train Loss: 0.7764, Val Loss: 0.9047, Val F1: 0.0000, QWK: 0.9205
✅ Trial checkpoint saved: ./checkpoints_bert/trial_0/final_epoch_8.ckpt
✅ Model files saved in: ./checkpoints_bert/trial_0
🏆 New best model saved! Score: 0.9205 (Trial 0)
🏆 Best model files saved in: ./best_bert_model_so_far
GPU Memory Used: 2.57 GB


0,1
Epoch,▁▂▃▄▅▆▇██
Learning_Rate,▁▁▁▁▁▁▁▁▁
Stage,▁▁▁▁▁▁▁▁▁
Train Accuracy,▁▁▁▁▁▁▁▁▁
Train Loss,▁▁▁▁▁▁▁▁█
Unfrozen_Layers,▁▁▁▁▁▁▁▁▁
Validation Accuracy,▁▆▇▇▆█▇▇▇
Validation Adjacent Accuracy,▁▅▅▆▆█▇██
Validation F1,▁▁▁▁▁▁▁▁▁
Validation Loss,█▃▁▃▅▂▄▃▃

0,1
Epoch,8.0
Learning_Rate,3e-05
Stage,1.0
Train Accuracy,0.0
Train Loss,0.77637
Unfrozen_Layers,12.0
Validation Accuracy,0.82603
Validation Adjacent Accuracy,0.97544
Validation F1,0.0
Validation Loss,0.90467


[I 2025-08-01 07:52:49,980] Trial 0 finished with value: 0.9205380993626433 and parameters: {'learning_rate': 3.36347924961925e-05, 'batch_size': 16, 'label_smoothing': 0.1428269059035347, 'num_epochs': 12, 'warmup_ratio': 0.07863577327467691, 'weight_decay': 0.07710265336920982, 'attention_dropout': 0.301254181394481, 'hidden_dropout': 0.3268135201604385}. Best is trial 0 with value: 0.9205380993626433.


TRIAL 1 - TESTING THESE PARAMETERS:
Learning Rate:      2.53e-04
Epochs:             12
Warmup Ratio:       0.090
Weight Decay:       0.080
Attention Dropout:  0.398
Hidden Dropout:     0.310


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,1.0517,1.452927,0.522607,0.466102,0.694353,0.462909,0.147727,0.975,0.079918,0.079918,0.528808,0.44713,0.646995,0.646995,0.694457,0.585781,0.852643,0.852643,0.558976,0.5,0.633725,0.633725,0.183276,0.963855,0.101266,0.101266,0.570576,0.915538,0.680696
2,0.9919,1.035872,0.677989,0.67162,0.754896,0.667787,0.737557,0.823232,0.668033,0.668033,0.65815,0.551292,0.816393,0.816393,0.808631,0.816514,0.8009,0.8009,0.651694,0.614157,0.694118,0.694118,0.524469,0.969283,0.359494,0.359494,0.413128,0.916456,0.793912
3,0.8657,1.033858,0.72022,0.718611,0.757345,0.731358,0.76555,0.91954,0.655738,0.655738,0.694396,0.73652,0.656831,0.656831,0.84053,0.860849,0.821147,0.821147,0.619601,0.658429,0.585098,0.585098,0.74026,0.611386,0.937975,0.937975,0.363782,0.928391,0.832923
4,0.7972,0.92649,0.757861,0.757056,0.789624,0.755557,0.804284,0.766234,0.846311,0.846311,0.722133,0.705208,0.739891,0.739891,0.836578,0.879653,0.797525,0.797525,0.742877,0.673469,0.828235,0.828235,0.701727,0.923554,0.565823,0.565823,0.301125,0.946523,0.863228
5,0.7332,0.889307,0.781501,0.782069,0.812466,0.76539,0.77412,0.867684,0.69877,0.69877,0.739342,0.730277,0.748634,0.748634,0.849456,0.864802,0.834646,0.834646,0.770855,0.699234,0.858824,0.858824,0.778736,0.900332,0.686076,0.686076,0.269451,0.952031,0.876606
6,0.6715,0.899815,0.794354,0.793604,0.800174,0.801333,0.807339,0.803245,0.811475,0.811475,0.734694,0.763251,0.708197,0.708197,0.839667,0.889308,0.795276,0.795276,0.7645,0.758887,0.770196,0.770196,0.848485,0.786177,0.921519,0.921519,0.262336,0.948818,0.883845
7,0.5761,0.898636,0.790682,0.790145,0.789533,0.803444,0.802734,0.766791,0.842213,0.842213,0.738527,0.721585,0.756284,0.756284,0.830595,0.836758,0.824522,0.824522,0.760132,0.804024,0.720784,0.720784,0.84507,0.818505,0.873418,0.873418,0.25614,0.956162,0.893594
8,0.5387,0.95303,0.78655,0.784954,0.781477,0.803222,0.791394,0.728055,0.866803,0.866803,0.718925,0.737084,0.701639,0.701639,0.820432,0.771287,0.876265,0.876265,0.76393,0.834572,0.704314,0.704314,0.851461,0.836386,0.867089,0.867089,0.253615,0.962818,0.898197
9,0.4923,0.955846,0.788846,0.787406,0.800293,0.789315,0.791334,0.892031,0.711066,0.711066,0.742535,0.766279,0.720219,0.720219,0.801245,0.743738,0.868391,0.868391,0.774486,0.814719,0.738039,0.738039,0.842229,0.784699,0.908861,0.908861,0.243516,0.971081,0.901198
10,0.4622,0.966085,0.803076,0.801759,0.810721,0.802958,0.80442,0.872902,0.745902,0.745902,0.7444,0.784504,0.708197,0.708197,0.81289,0.755556,0.87964,0.87964,0.797586,0.819008,0.777255,0.777255,0.860759,0.821634,0.903797,0.903797,0.228139,0.971311,0.907275


Epoch 1: Train Loss: 0.0000, Val Loss: 1.4529, Val F1: 0.0000, QWK: 0.6807


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 1.0359, Val F1: 0.0000, QWK: 0.7939


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 1.0339, Val F1: 0.0000, QWK: 0.8329


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.0000, Val Loss: 0.9265, Val F1: 0.0000, QWK: 0.8632


  return forward_call(*args, **kwargs)


Epoch 5: Train Loss: 0.0000, Val Loss: 0.8893, Val F1: 0.0000, QWK: 0.8766


  return forward_call(*args, **kwargs)


Epoch 6: Train Loss: 0.0000, Val Loss: 0.8998, Val F1: 0.0000, QWK: 0.8838


  return forward_call(*args, **kwargs)


Epoch 7: Train Loss: 0.0000, Val Loss: 0.8986, Val F1: 0.0000, QWK: 0.8936


  return forward_call(*args, **kwargs)


Epoch 8: Train Loss: 0.0000, Val Loss: 0.9530, Val F1: 0.0000, QWK: 0.8982


  return forward_call(*args, **kwargs)


Epoch 9: Train Loss: 0.0000, Val Loss: 0.9558, Val F1: 0.0000, QWK: 0.9012


  return forward_call(*args, **kwargs)


Epoch 10: Train Loss: 0.0000, Val Loss: 0.9661, Val F1: 0.0000, QWK: 0.9073


  return forward_call(*args, **kwargs)


Epoch 11: Train Loss: 0.0000, Val Loss: 0.9802, Val F1: 0.0000, QWK: 0.9122


  return forward_call(*args, **kwargs)


Epoch 12: Train Loss: 0.0000, Val Loss: 0.9773, Val F1: 0.0000, QWK: 0.9147


  return forward_call(*args, **kwargs)


Epoch 12: Train Loss: 0.6798, Val Loss: 0.9773, Val F1: 0.0000, QWK: 0.9147
✅ Trial checkpoint saved: ./checkpoints_bert/trial_1/final_epoch_12.ckpt
✅ Model files saved in: ./checkpoints_bert/trial_1
📊 Trial 1 score: 0.9147 (Best: 0.9205)
GPU Memory Used: 2.57 GB


0,1
Epoch,▁▂▂▃▄▄▅▅▆▇▇██
Learning_Rate,▁▁▁▁▁▁▁▁▁▁▁▁▁
Stage,▁▁▁▁▁▁▁▁▁▁▁▁▁
Train Accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁
Train Loss,▁▁▁▁▁▁▁▁▁▁▁▁█
Unfrozen_Layers,▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation Accuracy,▁▅▆▇▇████████
Validation Adjacent Accuracy,▁▁▂▅▅▅▆▆▇▇███
Validation F1,▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation Loss,█▃▃▁▁▁▁▂▂▂▂▂▂

0,1
Epoch,12.0
Learning_Rate,0.00025
Stage,1.0
Train Accuracy,0.0
Train Loss,0.67985
Unfrozen_Layers,12.0
Validation Accuracy,0.80514
Validation Adjacent Accuracy,0.97682
Validation F1,0.0
Validation Loss,0.97727


[I 2025-08-01 08:11:00,793] Trial 1 finished with value: 0.9147465517083717 and parameters: {'learning_rate': 0.00025347182708649116, 'batch_size': 16, 'label_smoothing': 0.1065678658979583, 'num_epochs': 12, 'warmup_ratio': 0.09017385634999035, 'weight_decay': 0.08017493106208261, 'attention_dropout': 0.3982050054406836, 'hidden_dropout': 0.31001996429156703}. Best is trial 0 with value: 0.9205380993626433.


TRIAL 2 - TESTING THESE PARAMETERS:
Learning Rate:      1.23e-04
Epochs:             15
Warmup Ratio:       0.060
Weight Decay:       0.061
Attention Dropout:  0.302
Hidden Dropout:     0.313


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,No log,1.020771,0.65389,0.649736,0.660863,0.695859,0.672023,0.522134,0.942623,0.942623,0.522727,0.544379,0.502732,0.502732,0.728792,0.850075,0.637795,0.637795,0.609669,0.710115,0.534118,0.534118,0.758774,0.677612,0.862025,0.862025,0.437457,0.921965,0.828379
2,1.195800,0.792641,0.793895,0.793587,0.802849,0.806438,0.824742,0.829876,0.819672,0.819672,0.766756,0.752632,0.781421,0.781421,0.853541,0.915058,0.799775,0.799775,0.744856,0.78355,0.709804,0.709804,0.816601,0.733132,0.921519,0.921519,0.250861,0.958228,0.897792
3,0.831600,0.788417,0.793895,0.793599,0.798298,0.803616,0.770903,0.65113,0.944672,0.944672,0.694825,0.762402,0.638251,0.638251,0.868164,0.889151,0.848144,0.848144,0.8006,0.766691,0.837647,0.837647,0.826816,0.922118,0.749367,0.749367,0.249484,0.959146,0.898169
4,0.670000,0.741727,0.831765,0.830998,0.829397,0.841585,0.838894,0.784314,0.901639,0.901639,0.782959,0.827251,0.743169,0.743169,0.862876,0.855249,0.870641,0.870641,0.815956,0.821798,0.810196,0.810196,0.870162,0.858374,0.882278,0.882278,0.199449,0.970852,0.919701
5,0.623400,0.818119,0.808354,0.807451,0.805865,0.818158,0.794667,0.701727,0.915984,0.915984,0.727814,0.799738,0.66776,0.66776,0.8379,0.850521,0.825647,0.825647,0.810749,0.793985,0.828235,0.828235,0.867997,0.883355,0.853165,0.853165,0.224466,0.969474,0.912123
6,0.560000,0.790186,0.821896,0.820529,0.816509,0.834186,0.822773,0.745424,0.918033,0.918033,0.750293,0.809102,0.699454,0.699454,0.844699,0.860981,0.829021,0.829021,0.820127,0.828663,0.811765,0.811765,0.873939,0.838372,0.912658,0.912658,0.2084,0.972458,0.918802


Epoch 1: Train Loss: 0.0000, Val Loss: 1.0208, Val F1: 0.0000, QWK: 0.8284


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 0.7926, Val F1: 0.0000, QWK: 0.8978


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 0.7884, Val F1: 0.0000, QWK: 0.8982


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.0000, Val Loss: 0.7417, Val F1: 0.0000, QWK: 0.9197


  return forward_call(*args, **kwargs)


Epoch 5: Train Loss: 0.0000, Val Loss: 0.8181, Val F1: 0.0000, QWK: 0.9121


  return forward_call(*args, **kwargs)


Epoch 6: Train Loss: 0.0000, Val Loss: 0.7902, Val F1: 0.0000, QWK: 0.9188


  return forward_call(*args, **kwargs)


Epoch 6: Train Loss: 0.7382, Val Loss: 0.7902, Val F1: 0.0000, QWK: 0.9188
✅ Trial checkpoint saved: ./checkpoints_bert/trial_2/final_epoch_6.ckpt
✅ Model files saved in: ./checkpoints_bert/trial_2
📊 Trial 2 score: 0.9188 (Best: 0.9205)
GPU Memory Used: 4.13 GB


0,1
Epoch,▁▂▄▅▇██
Learning_Rate,▁▁▁▁▁▁▁
Stage,▁▁▁▁▁▁▁
Train Accuracy,▁▁▁▁▁▁▁
Train Loss,▁▁▁▁▁▁█
Unfrozen_Layers,▁▁▁▁▁▁▁
Validation Accuracy,▁▇▇█▇██
Validation Adjacent Accuracy,▁▆▆████
Validation F1,▁▁▁▁▁▁▁
Validation Loss,█▂▂▁▃▂▂

0,1
Epoch,6.0
Learning_Rate,0.00012
Stage,1.0
Train Accuracy,0.0
Train Loss,0.73816
Unfrozen_Layers,12.0
Validation Accuracy,0.8219
Validation Adjacent Accuracy,0.97246
Validation F1,0.0
Validation Loss,0.79019


[I 2025-08-01 08:19:30,841] Trial 2 finished with value: 0.9188023074101209 and parameters: {'learning_rate': 0.00012345780336645804, 'batch_size': 64, 'label_smoothing': 0.09171342217547615, 'num_epochs': 15, 'warmup_ratio': 0.06015041835053567, 'weight_decay': 0.06133991608882185, 'attention_dropout': 0.3019927202194576, 'hidden_dropout': 0.31261729684360023}. Best is trial 0 with value: 0.9205380993626433.


TRIAL 3 - TESTING THESE PARAMETERS:
Learning Rate:      4.74e-04
Epochs:             10
Warmup Ratio:       0.098
Weight Decay:       0.134
Attention Dropout:  0.331
Hidden Dropout:     0.387


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,1.4214,1.742437,0.204039,0.069154,0.040808,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.338925,0.204039,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.089282,0.706679,0.0
2,1.66,1.599605,0.204039,0.069154,0.040808,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.338925,0.204039,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.089282,0.706679,0.0
3,1.611,1.611399,0.210007,0.072897,0.042001,0.2,0.0,0.0,0.0,0.0,0.347117,0.210007,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.445261,0.52605,0.0


Epoch 1: Train Loss: 0.0000, Val Loss: 1.7424, Val F1: 0.0000, QWK: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 1.5996, Val F1: 0.0000, QWK: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 1.6114, Val F1: 0.0000, QWK: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return forward_call(*args, **kwargs)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3: Train Loss: 1.5479, Val Loss: 1.6114, Val F1: 0.0000, QWK: 0.0000
✅ Trial checkpoint saved: ./checkpoints_bert/trial_3/final_epoch_3.ckpt
✅ Model files saved in: ./checkpoints_bert/trial_3
📊 Trial 3 score: 0.0000 (Best: 0.9205)
GPU Memory Used: 4.13 GB


0,1
Epoch,▁▅██
Learning_Rate,▁▁▁▁
Stage,▁▁▁▁
Train Accuracy,▁▁▁▁
Train Loss,▁▁▁█
Unfrozen_Layers,▁▁▁▁
Validation Accuracy,▁▁██
Validation Adjacent Accuracy,██▁▁
Validation F1,▁▁▁▁
Validation Loss,█▁▂▂

0,1
Epoch,3.0
Learning_Rate,0.00047
Stage,1.0
Train Accuracy,0.0
Train Loss,1.54792
Unfrozen_Layers,12.0
Validation Accuracy,0.21001
Validation Adjacent Accuracy,0.52605
Validation F1,0.0
Validation Loss,1.6114


[I 2025-08-01 08:24:52,114] Trial 3 finished with value: 0.0 and parameters: {'learning_rate': 0.000473801851608495, 'batch_size': 16, 'label_smoothing': 0.10161124967206855, 'num_epochs': 10, 'warmup_ratio': 0.09817653648196495, 'weight_decay': 0.13418021102228506, 'attention_dropout': 0.33100183170276276, 'hidden_dropout': 0.38740468538963047}. Best is trial 0 with value: 0.9205380993626433.


TRIAL 4 - TESTING THESE PARAMETERS:
Learning Rate:      2.12e-04
Epochs:             11
Warmup Ratio:       0.133
Weight Decay:       0.112
Attention Dropout:  0.331
Hidden Dropout:     0.389


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,1.3052,0.950134,0.722515,0.721148,0.718319,0.749896,0.732303,0.607287,0.922131,0.922131,0.613739,0.632985,0.595628,0.595628,0.811527,0.832151,0.791901,0.791901,0.694362,0.755535,0.642353,0.642353,0.780186,0.763636,0.797468,0.797468,0.35552,0.933211,0.847764
2,0.8906,1.173132,0.659169,0.651481,0.690783,0.704905,0.771041,0.690438,0.872951,0.872951,0.622871,0.702332,0.559563,0.559563,0.777563,0.910876,0.67829,0.67829,0.513686,0.613959,0.441569,0.441569,0.691269,0.536313,0.972152,0.972152,0.425063,0.925866,0.836103
3,0.786,0.937596,0.766583,0.764217,0.775084,0.7876,0.805247,0.793241,0.817623,0.817623,0.734671,0.749716,0.720219,0.720219,0.849156,0.880435,0.820022,0.820022,0.697128,0.782991,0.628235,0.628235,0.785789,0.669039,0.951899,0.951899,0.286436,0.952261,0.883301
4,0.7168,0.885185,0.790452,0.788794,0.785261,0.808644,0.787133,0.731107,0.852459,0.852459,0.730997,0.721277,0.740984,0.740984,0.859218,0.853496,0.865017,0.865017,0.753484,0.847209,0.678431,0.678431,0.834499,0.773218,0.906329,0.906329,0.260959,0.953867,0.890472
5,0.6584,0.879063,0.809961,0.808755,0.816861,0.812305,0.816594,0.873832,0.766393,0.766393,0.759551,0.781503,0.738798,0.738798,0.838816,0.818182,0.860517,0.860517,0.7872,0.803265,0.771765,0.771765,0.861865,0.807522,0.924051,0.924051,0.225843,0.966032,0.90721
6,0.6095,0.824388,0.820748,0.820562,0.82403,0.827383,0.843177,0.838057,0.848361,0.848361,0.779493,0.786429,0.772678,0.772678,0.828877,0.79001,0.871766,0.871766,0.806114,0.827415,0.785882,0.785882,0.868118,0.878238,0.858228,0.858228,0.207482,0.973835,0.917116
7,0.5671,0.829073,0.824191,0.824108,0.826446,0.827517,0.830421,0.83299,0.827869,0.827869,0.782276,0.783133,0.781421,0.781421,0.837644,0.820043,0.856018,0.856018,0.813492,0.823293,0.803922,0.803922,0.870558,0.872774,0.868354,0.868354,0.204269,0.973835,0.917909
8,0.5401,0.867521,0.817535,0.816192,0.816559,0.82959,0.831143,0.820359,0.842213,0.842213,0.78714,0.79865,0.775956,0.775956,0.842397,0.831325,0.853768,0.853768,0.790424,0.850814,0.738039,0.738039,0.852704,0.781646,0.937975,0.937975,0.210007,0.974524,0.919528
9,0.4968,0.848681,0.829011,0.828453,0.82736,0.837582,0.83871,0.801869,0.879098,0.879098,0.779932,0.815256,0.747541,0.747541,0.834654,0.786853,0.888639,0.888639,0.825101,0.85272,0.799216,0.799216,0.876747,0.880102,0.873418,0.873418,0.191875,0.980491,0.927639
10,0.4703,0.913533,0.81983,0.818122,0.820078,0.832386,0.85,0.830078,0.870902,0.870902,0.767606,0.828897,0.714754,0.714754,0.821918,0.773043,0.87739,0.87739,0.808793,0.858275,0.764706,0.764706,0.867725,0.810099,0.934177,0.934177,0.201744,0.980262,0.924717


Epoch 1: Train Loss: 0.0000, Val Loss: 0.9501, Val F1: 0.0000, QWK: 0.8478


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 1.1731, Val F1: 0.0000, QWK: 0.8361


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 0.9376, Val F1: 0.0000, QWK: 0.8833


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.0000, Val Loss: 0.8852, Val F1: 0.0000, QWK: 0.8905


  return forward_call(*args, **kwargs)


Epoch 5: Train Loss: 0.0000, Val Loss: 0.8791, Val F1: 0.0000, QWK: 0.9072


  return forward_call(*args, **kwargs)


Epoch 6: Train Loss: 0.0000, Val Loss: 0.8244, Val F1: 0.0000, QWK: 0.9171


  return forward_call(*args, **kwargs)


Epoch 7: Train Loss: 0.0000, Val Loss: 0.8291, Val F1: 0.0000, QWK: 0.9179


  return forward_call(*args, **kwargs)


Epoch 8: Train Loss: 0.0000, Val Loss: 0.8675, Val F1: 0.0000, QWK: 0.9195


  return forward_call(*args, **kwargs)


Epoch 9: Train Loss: 0.0000, Val Loss: 0.8487, Val F1: 0.0000, QWK: 0.9276


  return forward_call(*args, **kwargs)


Epoch 10: Train Loss: 0.0000, Val Loss: 0.9135, Val F1: 0.0000, QWK: 0.9247


  return forward_call(*args, **kwargs)


Epoch 11: Train Loss: 0.0000, Val Loss: 0.8980, Val F1: 0.0000, QWK: 0.9284


  return forward_call(*args, **kwargs)


Epoch 11: Train Loss: 0.6679, Val Loss: 0.8980, Val F1: 0.0000, QWK: 0.9284
✅ Trial checkpoint saved: ./checkpoints_bert/trial_4/final_epoch_11.ckpt
✅ Model files saved in: ./checkpoints_bert/trial_4
🏆 New best model saved! Score: 0.9284 (Trial 4)
🏆 Best model files saved in: ./best_bert_model_so_far
GPU Memory Used: 4.13 GB


0,1
Epoch,▁▂▂▃▄▅▅▆▇▇██
Learning_Rate,▁▁▁▁▁▁▁▁▁▁▁▁
Stage,▁▁▁▁▁▁▁▁▁▁▁▁
Train Accuracy,▁▁▁▁▁▁▁▁▁▁▁▁
Train Loss,▁▁▁▁▁▁▁▁▁▁▁█
Unfrozen_Layers,▁▁▁▁▁▁▁▁▁▁▁▁
Validation Accuracy,▄▁▅▆▇███████
Validation Adjacent Accuracy,▂▁▄▄▆▇▇▇████
Validation F1,▁▁▁▁▁▁▁▁▁▁▁▁
Validation Loss,▄█▃▂▂▁▁▂▁▃▂▂

0,1
Epoch,11.0
Learning_Rate,0.00021
Stage,1.0
Train Accuracy,0.0
Train Loss,0.66787
Unfrozen_Layers,12.0
Validation Accuracy,0.82855
Validation Adjacent Accuracy,0.98233
Validation F1,0.0
Validation Loss,0.89796


[I 2025-08-01 08:39:40,696] Trial 4 finished with value: 0.9284241099301461 and parameters: {'learning_rate': 0.00021229985961337253, 'batch_size': 32, 'label_smoothing': 0.11524364144454653, 'num_epochs': 11, 'warmup_ratio': 0.13273796418047804, 'weight_decay': 0.1121797776236029, 'attention_dropout': 0.33099760772482156, 'hidden_dropout': 0.38867822735790303}. Best is trial 4 with value: 0.9284241099301461.


TRIAL 5 - TESTING THESE PARAMETERS:
Learning Rate:      1.34e-04
Epochs:             15
Warmup Ratio:       0.105
Weight Decay:       0.057
Attention Dropout:  0.313
Hidden Dropout:     0.371


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,0.9978,0.860762,0.744779,0.744574,0.770725,0.729283,0.681704,0.877419,0.557377,0.557377,0.695565,0.645463,0.754098,0.754098,0.808774,0.808774,0.808774,0.808774,0.726291,0.714177,0.738824,0.738824,0.797436,0.807792,0.787342,0.787342,0.31765,0.943769,0.852656
2,0.8424,0.843558,0.768878,0.764716,0.809417,0.731901,0.604716,0.935622,0.446721,0.446721,0.714961,0.687879,0.744262,0.744262,0.861453,0.855716,0.867267,0.867267,0.771843,0.702964,0.855686,0.855686,0.800816,0.864905,0.74557,0.74557,0.290567,0.946293,0.856863
3,0.7098,0.810686,0.795731,0.795864,0.823558,0.78635,0.812236,0.836957,0.788934,0.788934,0.767613,0.739615,0.797814,0.797814,0.86424,0.888361,0.841395,0.841395,0.782267,0.718791,0.858039,0.858039,0.763473,0.934066,0.64557,0.64557,0.249943,0.95731,0.888109
4,0.647,0.82292,0.795042,0.794728,0.835013,0.779743,0.815135,0.8627,0.772541,0.772541,0.756912,0.800244,0.718033,0.718033,0.866286,0.880372,0.852643,0.852643,0.782404,0.684087,0.913725,0.913725,0.765283,0.947664,0.641772,0.641772,0.252008,0.955703,0.884113
5,0.5979,0.761421,0.826027,0.826103,0.838531,0.817752,0.809365,0.887531,0.743852,0.743852,0.77444,0.774017,0.774863,0.774863,0.867689,0.876147,0.859393,0.859393,0.816296,0.786337,0.848627,0.848627,0.865311,0.868622,0.862025,0.862025,0.215745,0.961441,0.903402
6,0.5322,0.794335,0.82442,0.824136,0.822623,0.83275,0.836435,0.801126,0.875,0.875,0.784573,0.791111,0.778142,0.778142,0.840336,0.837054,0.843645,0.843645,0.813301,0.831286,0.796078,0.796078,0.861616,0.85254,0.870886,0.870886,0.206335,0.972688,0.916727
7,0.4868,0.816855,0.827634,0.828675,0.830043,0.830636,0.818792,0.769369,0.875,0.875,0.75921,0.742171,0.777049,0.777049,0.8681,0.897837,0.84027,0.84027,0.831721,0.820611,0.843137,0.843137,0.865952,0.920228,0.817722,0.817722,0.211613,0.963737,0.909727
8,0.4579,0.869482,0.808813,0.807356,0.799792,0.827781,0.798593,0.699538,0.930328,0.930328,0.730214,0.77451,0.69071,0.69071,0.8472,0.82,0.876265,0.876265,0.802022,0.866242,0.746667,0.746667,0.865891,0.838671,0.894937,0.894937,0.222401,0.97154,0.915033


Epoch 1: Train Loss: 0.0000, Val Loss: 0.8608, Val F1: 0.0000, QWK: 0.8527


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 0.8436, Val F1: 0.0000, QWK: 0.8569


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 0.8107, Val F1: 0.0000, QWK: 0.8881


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.0000, Val Loss: 0.8229, Val F1: 0.0000, QWK: 0.8841


  return forward_call(*args, **kwargs)


Epoch 5: Train Loss: 0.0000, Val Loss: 0.7614, Val F1: 0.0000, QWK: 0.9034


  return forward_call(*args, **kwargs)


Epoch 6: Train Loss: 0.0000, Val Loss: 0.7943, Val F1: 0.0000, QWK: 0.9167


  return forward_call(*args, **kwargs)


Epoch 7: Train Loss: 0.0000, Val Loss: 0.8169, Val F1: 0.0000, QWK: 0.9097


  return forward_call(*args, **kwargs)


Epoch 8: Train Loss: 0.0000, Val Loss: 0.8695, Val F1: 0.0000, QWK: 0.9150


  return forward_call(*args, **kwargs)


Epoch 8: Train Loss: 0.6814, Val Loss: 0.8695, Val F1: 0.0000, QWK: 0.9150
✅ Trial checkpoint saved: ./checkpoints_bert/trial_5/final_epoch_8.ckpt
✅ Model files saved in: ./checkpoints_bert/trial_5
📊 Trial 5 score: 0.9150 (Best: 0.9284)
GPU Memory Used: 4.13 GB


0,1
Epoch,▁▂▃▄▅▆▇██
Learning_Rate,▁▁▁▁▁▁▁▁▁
Stage,▁▁▁▁▁▁▁▁▁
Train Accuracy,▁▁▁▁▁▁▁▁▁
Train Loss,▁▁▁▁▁▁▁▁█
Unfrozen_Layers,▁▁▁▁▁▁▁▁▁
Validation Accuracy,▁▃▅▅███▆▆
Validation Adjacent Accuracy,▁▂▄▄▅█▆██
Validation F1,▁▁▁▁▁▁▁▁▁
Validation Loss,▇▆▄▅▁▃▅██

0,1
Epoch,8.0
Learning_Rate,0.00013
Stage,1.0
Train Accuracy,0.0
Train Loss,0.68145
Unfrozen_Layers,12.0
Validation Accuracy,0.80881
Validation Adjacent Accuracy,0.97154
Validation F1,0.0
Validation Loss,0.86948


[I 2025-08-01 08:52:26,345] Trial 5 finished with value: 0.9150326873962199 and parameters: {'learning_rate': 0.00013353508229757926, 'batch_size': 16, 'label_smoothing': 0.08395801184157484, 'num_epochs': 15, 'warmup_ratio': 0.1053887370783656, 'weight_decay': 0.057452381309670064, 'attention_dropout': 0.31314073094885464, 'hidden_dropout': 0.37098355380967046}. Best is trial 4 with value: 0.9284241099301461.


TRIAL 6 - TESTING THESE PARAMETERS:
Learning Rate:      5.89e-05
Epochs:             13
Warmup Ratio:       0.127
Weight Decay:       0.127
Attention Dropout:  0.369
Hidden Dropout:     0.354


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,No log,1.084726,0.636218,0.631181,0.6456,0.661586,0.72148,0.723711,0.719262,0.719262,0.605435,0.602162,0.608743,0.608743,0.705372,0.681342,0.731159,0.731159,0.517749,0.577778,0.46902,0.46902,0.704805,0.643006,0.779747,0.779747,0.473261,0.908653,0.781145
2,1.369800,0.900804,0.753959,0.753529,0.787928,0.732571,0.707046,0.890966,0.586066,0.586066,0.673626,0.677348,0.669945,0.669945,0.803688,0.775916,0.833521,0.833521,0.75527,0.69357,0.82902,0.82902,0.815534,0.90184,0.744304,0.744304,0.305944,0.943998,0.855164
3,0.952100,0.831594,0.795961,0.795375,0.799752,0.80093,0.804642,0.761905,0.852459,0.852459,0.718841,0.765432,0.677596,0.677596,0.840693,0.835556,0.845894,0.845894,0.783037,0.756955,0.81098,0.81098,0.847213,0.878912,0.817722,0.817722,0.250172,0.955933,0.893436
4,0.787000,0.807022,0.826027,0.82654,0.849037,0.8112,0.794842,0.928767,0.694672,0.694672,0.782105,0.754315,0.812022,0.812022,0.864524,0.882767,0.847019,0.847019,0.817204,0.774965,0.864314,0.864314,0.869908,0.904372,0.837975,0.837975,0.211384,0.963737,0.907429
5,0.711100,0.843612,0.806059,0.805081,0.80497,0.814852,0.793767,0.718076,0.887295,0.887295,0.727273,0.796875,0.668852,0.668852,0.852071,0.898876,0.809899,0.809899,0.79893,0.779269,0.819608,0.819608,0.859241,0.831754,0.888608,0.888608,0.233417,0.962589,0.906395
6,0.632800,0.819498,0.814322,0.81525,0.822282,0.810267,0.797872,0.829646,0.768443,0.768443,0.764858,0.72549,0.808743,0.808743,0.843931,0.868014,0.821147,0.821147,0.811189,0.803695,0.818824,0.818824,0.858632,0.884564,0.834177,0.834177,0.221942,0.965573,0.907388


Epoch 1: Train Loss: 0.0000, Val Loss: 1.0847, Val F1: 0.0000, QWK: 0.7811


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 0.9008, Val F1: 0.0000, QWK: 0.8552


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 0.8316, Val F1: 0.0000, QWK: 0.8934


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.0000, Val Loss: 0.8070, Val F1: 0.0000, QWK: 0.9074


  return forward_call(*args, **kwargs)


Epoch 5: Train Loss: 0.0000, Val Loss: 0.8436, Val F1: 0.0000, QWK: 0.9064


  return forward_call(*args, **kwargs)


Epoch 6: Train Loss: 0.0000, Val Loss: 0.8195, Val F1: 0.0000, QWK: 0.9074


  return forward_call(*args, **kwargs)


Epoch 6: Train Loss: 0.8528, Val Loss: 0.8195, Val F1: 0.0000, QWK: 0.9074
✅ Trial checkpoint saved: ./checkpoints_bert/trial_6/final_epoch_6.ckpt
✅ Model files saved in: ./checkpoints_bert/trial_6
📊 Trial 6 score: 0.9074 (Best: 0.9284)
GPU Memory Used: 4.13 GB


0,1
Epoch,▁▂▄▅▇██
Learning_Rate,▁▁▁▁▁▁▁
Stage,▁▁▁▁▁▁▁
Train Accuracy,▁▁▁▁▁▁▁
Train Loss,▁▁▁▁▁▁█
Unfrozen_Layers,▁▁▁▁▁▁▁
Validation Accuracy,▁▅▇█▇██
Validation Adjacent Accuracy,▁▅▇████
Validation F1,▁▁▁▁▁▁▁
Validation Loss,█▃▂▁▂▁▁

0,1
Epoch,6.0
Learning_Rate,6e-05
Stage,1.0
Train Accuracy,0.0
Train Loss,0.85279
Unfrozen_Layers,12.0
Validation Accuracy,0.81432
Validation Adjacent Accuracy,0.96557
Validation F1,0.0
Validation Loss,0.8195


[I 2025-08-01 09:00:07,564] Trial 6 finished with value: 0.9073878147560263 and parameters: {'learning_rate': 5.889698182923651e-05, 'batch_size': 64, 'label_smoothing': 0.11649037836869071, 'num_epochs': 13, 'warmup_ratio': 0.12737001868114928, 'weight_decay': 0.12661182033842114, 'attention_dropout': 0.36941769572138095, 'hidden_dropout': 0.35417641175158643}. Best is trial 4 with value: 0.9284241099301461.


TRIAL 7 - TESTING THESE PARAMETERS:
Learning Rate:      3.90e-04
Epochs:             14
Warmup Ratio:       0.060
Weight Decay:       0.146
Attention Dropout:  0.372
Hidden Dropout:     0.367


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(  # Uses ordinal loss always
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Extremely Negative,Precision Extremely Negative,Recall Extremely Negative,Accuracy Extremely Negative,F1 Negative,Precision Negative,Recall Negative,Accuracy Negative,F1 Neutral,Precision Neutral,Recall Neutral,Accuracy Neutral,F1 Positive,Precision Positive,Recall Positive,Accuracy Positive,F1 Extremely Positive,Precision Extremely Positive,Recall Extremely Positive,Accuracy Extremely Positive,Mae,Adjacent Accuracy,Quadratic Weighted Kappa
1,1.2435,1.131034,0.669038,0.665516,0.706453,0.638387,0.526316,0.811966,0.389344,0.389344,0.590832,0.612676,0.570492,0.570492,0.79954,0.816901,0.782902,0.782902,0.654071,0.581907,0.746667,0.746667,0.705658,0.708812,0.702532,0.702532,0.470966,0.892128,0.724534
2,1.0931,1.054831,0.676842,0.673687,0.712414,0.656531,0.534266,0.84141,0.391393,0.391393,0.636961,0.557929,0.742077,0.742077,0.761578,0.774419,0.749156,0.749156,0.647413,0.662562,0.632941,0.632941,0.745846,0.725749,0.767089,0.767089,0.413817,0.920358,0.799576
3,1.4312,1.659967,0.366537,0.301458,0.435534,0.412117,0.504418,0.414795,0.643443,0.643443,0.060362,0.379747,0.032787,0.032787,0.455829,0.302351,0.925759,0.925759,0.211527,0.436275,0.139608,0.139608,0.426757,0.644501,0.318987,0.318987,0.883865,0.816846,0.460083
4,1.632,1.807292,0.204039,0.069154,0.040808,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.338925,0.204039,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.089282,0.706679,0.0


Epoch 1: Train Loss: 0.0000, Val Loss: 1.1310, Val F1: 0.0000, QWK: 0.7245


  return forward_call(*args, **kwargs)


Epoch 2: Train Loss: 0.0000, Val Loss: 1.0548, Val F1: 0.0000, QWK: 0.7996


  return forward_call(*args, **kwargs)


Epoch 3: Train Loss: 0.0000, Val Loss: 1.6600, Val F1: 0.0000, QWK: 0.4601


  return forward_call(*args, **kwargs)


Epoch 4: Train Loss: 0.0000, Val Loss: 1.8073, Val F1: 0.0000, QWK: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return forward_call(*args, **kwargs)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4: Train Loss: 1.2908, Val Loss: 1.8073, Val F1: 0.0000, QWK: 0.0000
✅ Trial checkpoint saved: ./checkpoints_bert/trial_7/final_epoch_4.ckpt
✅ Model files saved in: ./checkpoints_bert/trial_7
📊 Trial 7 score: 0.0000 (Best: 0.9284)
GPU Memory Used: 4.13 GB


0,1
Epoch,▁▃▆██
Learning_Rate,▁▁▁▁▁
Stage,▁▁▁▁▁
Train Accuracy,▁▁▁▁▁
Train Loss,▁▁▁▁█
Unfrozen_Layers,▁▁▁▁▁
Validation Accuracy,██▃▁▁
Validation Adjacent Accuracy,▇█▅▁▁
Validation F1,▁▁▁▁▁
Validation Loss,▂▁▇██

0,1
Epoch,4.0
Learning_Rate,0.00039
Stage,1.0
Train Accuracy,0.0
Train Loss,1.29082
Unfrozen_Layers,12.0
Validation Accuracy,0.20404
Validation Adjacent Accuracy,0.70668
Validation F1,0.0
Validation Loss,1.80729


[I 2025-08-01 09:05:49,368] Trial 7 finished with value: 0.0 and parameters: {'learning_rate': 0.0003897773320704888, 'batch_size': 32, 'label_smoothing': 0.08451958274914737, 'num_epochs': 14, 'warmup_ratio': 0.06030116339682273, 'weight_decay': 0.14619808960138364, 'attention_dropout': 0.37166988585343874, 'hidden_dropout': 0.3669526106323674}. Best is trial 4 with value: 0.9284241099301461.


Best hyperparameters saved to best_params_bertweet_base.json
Best score: 0.9284
Best params: {'learning_rate': 0.00021229985961337253, 'batch_size': 32, 'label_smoothing': 0.11524364144454653, 'num_epochs': 11, 'warmup_ratio': 0.13273796418047804, 'weight_decay': 0.1121797776236029, 'attention_dropout': 0.33099760772482156, 'hidden_dropout': 0.38867822735790303}
Completed 8/8 trials


In [22]:
import time
import os

def calculate_model_metrics(model_path="./best_model_so_far"):
    """Calculate inference time and model size"""

    # 1. Calculate Model Size
    def get_model_size_mb(path):
        total_size = 0
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                total_size += os.path.getsize(filepath)
        return total_size / (1024 * 1024)  # Convert to MB

    model_size_mb = get_model_size_mb(model_path)

    # 2. Calculate Inference Time
    def measure_inference_time():
        # Load model for timing
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        tokenizer_for_timing = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base")

        if device.type == "cuda":
            model = model.to(device)
        model.eval()

        # Sample text for timing
        sample_text = "COVID vaccines are helping to reduce hospitalizations significantly."

        # Warm-up runs (don't count these)
        for _ in range(3):
            inputs = tokenizer_for_timing(sample_text, return_tensors="pt", truncation=True, padding=True)
            if device.type == "cuda":
                inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                _ = model(**inputs)

        # Actual timing runs
        times = []
        num_runs = 10

        for _ in range(num_runs):
            inputs = tokenizer_for_timing(sample_text, return_tensors="pt", truncation=True, padding=True)
            if device.type == "cuda":
                inputs = {k: v.to(device) for k, v in inputs.items()}

            start_time = time.time()
            with torch.no_grad():
                _ = model(**inputs)
            end_time = time.time()

            times.append(end_time - start_time)

        # Return average inference time
        avg_inference_time = sum(times) / len(times)
        return avg_inference_time

    inference_time = measure_inference_time()

    return model_size_mb, inference_time

# Load the best model
best_model = AutoModelForSequenceClassification.from_pretrained(best_model_path)

# Create trainer for test evaluation
test_trainer = Trainer(
    model=best_model,
    args=TrainingArguments(
        output_dir="./temp",
        per_device_eval_batch_size=16,
        remove_unused_columns=False,
    ),
    tokenizer=tokenizer,
    compute_metrics=compute_detailed_metrics,  # Use the enhanced function!
)

# Evaluate on test set
test_results = test_trainer.evaluate(test_dataset)

# Calculate performance metrics
model_size_mb, inference_time_sec = calculate_model_metrics(best_model_path)

print("FINAL TEST SET EVALUATION - COMPREHENSIVE ANALYSIS")

# Standard Classification Metrics
print(test_results)
print("\nOVERALL CLASSIFICATION METRICS:")
print(f"Accuracy:           {test_results['eval_accuracy']:.4f}")
print(f"Validation QWK:     {test_results['eval_quadratic_weighted_kappa']:.4f}")
print(f"F1:                 {test_results['eval_f1']:.4f}")
print(f"Precision-Macro:    {test_results['eval_precision']:.4f}")
print(f"Recall-Macro:       {test_results['eval_recall']:.4f}")

# Ordinal-Aware Metrics
print("\nORDINAL-AWARE METRICS:")
print(f"Mean Absolute Error:        {test_results['eval_mae']:.4f}")
print(f"Adjacent Accuracy:          {test_results['eval_adjacent_accuracy']:.4f}")
print(f"Quadratic Weighted Kappa:   {test_results['eval_quadratic_weighted_kappa']:.4f}")

# Performance Metrics
print("\nPERFORMANCE METRICS:")
print(f"Inference Time:             {inference_time_sec:.4f} sec")
print(f"Model Size:                 {model_size_mb:.1f} MB")

# Per-Class Detailed Analysis (PRESERVED)
print("\nPER-CLASS PERFORMANCE ANALYSIS:")
sentiment_classes = ["extremely_negative", "negative", "neutral", "positive", "extremely_positive"]
class_display_names = ["Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive"]

for i, (class_key, class_name) in enumerate(zip(sentiment_classes, class_display_names)):
    print(f"\n{class_name}:")
    print(f"  F1-Score:   {test_results.get(f'eval_f1_{class_key}', 0):.4f}")
    print(f"  Precision:  {test_results.get(f'eval_precision_{class_key}', 0):.4f}")
    print(f"  Recall:     {test_results.get(f'eval_accuracy_{class_key}', 0):.4f}")
    print(f"  Accuracy:   {test_results.get(f'eval_accuracy_{class_key}', 0):.4f}")

# Performance Analysis (PRESERVED + ENHANCED)
print("\nPERFORMANCE INSIGHTS:")
print(f"• MAE {test_results['eval_mae']:.2f}: On average off by {test_results['eval_mae']:.2f} sentiment levels")
print(f"• Adjacent Accuracy {test_results['eval_adjacent_accuracy']:.1%}: Predictions within 1 sentiment level")
print(f"• QWK {test_results['eval_quadratic_weighted_kappa']:.3f}: {'Excellent' if test_results['eval_quadratic_weighted_kappa'] > 0.8 else 'Good' if test_results['eval_quadratic_weighted_kappa'] > 0.6 else 'Moderate'} ordinal agreement")
# Added inference speed and efficiency insights
print(f"• Inference Speed: {1/inference_time_sec:.1f} predictions per second")
print(f"• Model Efficiency: {model_size_mb:.1f} MB storage required")

# Class Performance Analysis (PRESERVED)
f1_scores = [test_results.get(f'eval_f1_{class_key}', 0) for class_key in sentiment_classes]
best_class_idx = f1_scores.index(max(f1_scores))
worst_class_idx = f1_scores.index(min(f1_scores))

print(f"\nCLASS-SPECIFIC INSIGHTS:")
print(f"• Best performing class: {class_display_names[best_class_idx]} (F1: {f1_scores[best_class_idx]:.4f})")
print(f"• Most challenging class: {class_display_names[worst_class_idx]} (F1: {f1_scores[worst_class_idx]:.4f})")

# COVID-specific insights (PRESERVED)
extreme_avg = (f1_scores[0] + f1_scores[4]) / 2  # extremely negative + extremely positive
moderate_avg = (f1_scores[1] + f1_scores[3]) / 2  # negative + positive
neutral_score = f1_scores[2]

print(f"\nCOVID SENTIMENT INSIGHTS:")
print(f"• Extreme emotions (avg F1: {extreme_avg:.3f}): {'Challenging' if extreme_avg < 0.7 else 'Well-handled'}")
print(f"• Moderate emotions (avg F1: {moderate_avg:.3f}): {'Needs work' if moderate_avg < 0.75 else 'Good performance'}")
print(f"• Neutral sentiment (F1: {neutral_score:.3f}): {'Difficult to detect' if neutral_score < 0.8 else 'Well-identified'}")

# Final Summary with Key Metrics
print(f"\n📊 FINAL SUMMARY:")
print(f"F1: {test_results['eval_f1']:.4f} | QWK: {test_results['eval_quadratic_weighted_kappa']:.4f} | Inference: {inference_time_sec:.4f}s | Size: {model_size_mb:.1f}MB")

  test_trainer = Trainer(
  return forward_call(*args, **kwargs)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  return forward_call(*args, **kwargs)


FINAL TEST SET EVALUATION - COMPREHENSIVE ANALYSIS
{'eval_loss': 0.6327186822891235, 'eval_model_preparation_time': 0.0037, 'eval_accuracy': 0.8233060747663551, 'eval_f1': 0.8223617028680927, 'eval_precision': 0.8209153033370565, 'eval_recall': 0.8357680469803682, 'eval_f1_extremely_negative': 0.8286384976525821, 'eval_precision_extremely_negative': 0.7986425339366516, 'eval_recall_extremely_negative': 0.8609756097560975, 'eval_accuracy_extremely_negative': 0.8609756097560975, 'eval_f1_negative': 0.7915869980879541, 'eval_precision_negative': 0.8214285714285714, 'eval_recall_negative': 0.7638376383763837, 'eval_accuracy_negative': 0.7638376383763837, 'eval_f1_neutral': 0.8396624472573839, 'eval_precision_neutral': 0.7824377457404981, 'eval_recall_neutral': 0.9059180576631259, 'eval_accuracy_neutral': 0.9059180576631259, 'eval_f1_positive': 0.8100278551532033, 'eval_precision_positive': 0.8644470868014269, 'eval_recall_positive': 0.7620545073375262, 'eval_accuracy_positive': 0.762054507