BERT Fine-tuning Optimization Methods Comparison

In [2]:
# Install required packages (run this cell first in Kaggle)
import subprocess
import sys

In [3]:
def install_package(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install wandb if not already installed
try:
    import wandb
except ImportError:
    install_package("wandb")
    import wandb

In [4]:
# WandB Authentication for Kaggle
import os
print("Setting up WandB authentication...")

# Method 1: Using WandB API Key (Recommended)
# Add your WandB API key to Kaggle Secrets with key name: "wandb_api_key"
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

try:
    # Get WandB API key from Kaggle secrets
    wandb_api_key = user_secrets.get_secret("wandb_api_key")
    os.environ["WANDB_API_KEY"] = wandb_api_key
    print("✅ WandB API key loaded from Kaggle secrets")
except:
    print("❌ WandB API key not found in Kaggle secrets")
    print("Please add your WandB API key to Kaggle Secrets with key name: 'wandb_api_key'")
    print("You can find your API key at: https://wandb.ai/authorize")
    
    # Alternative: Manual API key input (less secure)
    wandb_api_key = input("Enter your WandB API key: ")
    os.environ["WANDB_API_KEY"] = wandb_api_key

# Login to WandB
wandb.login()

Setting up WandB authentication...
❌ WandB API key not found in Kaggle secrets
Please add your WandB API key to Kaggle Secrets with key name: 'wandb_api_key'
You can find your API key at: https://wandb.ai/authorize


Enter your WandB API key:  7a7720dbaf31cb54e7ecf887c0411dcc1c50d8ee


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhanaoui-wissal2[0m ([33mhanaoui-wissal2-fsbm-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, Adafactor
)
from datasets import load_dataset
import wandb
import time
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

2025-07-07 17:47:52.128931: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751910472.338717      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751910472.400675      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
# Configuration
class Config:
    model_name = 'bert-base-uncased'
    max_length = 128
    batch_size = 16
    num_epochs = 3
    learning_rate = 2e-5
    weight_decay = 0.01
    warmup_steps = 500
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = 42
    
    # WandB configuration
    wandb_project = "bert-optimization-comparison"
    wandb_entity = None  # Set your WandB entity if needed
    # Kaggle specific settings
    kaggle_output_dir = "/kaggle/working/"

In [8]:
# Set random seeds for reproducibility
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(Config.seed)

In [9]:
# Custom Dataset class for SST-2
class SST2Dataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [11]:
# Load and prepare SST-2 dataset
def load_sst2_data():
    print("Loading SST-2 dataset...")
    dataset = load_dataset("glue", "sst2")
    
    train_texts = dataset['train']['sentence']
    train_labels = dataset['train']['label']
    val_texts = dataset['validation']['sentence']
    val_labels = dataset['validation']['label']
    
    print(f"Training samples: {len(train_texts)}")
    print(f"Validation samples: {len(val_texts)}")
    
    return train_texts, train_labels, val_texts, val_labels


In [12]:
# Training function
def train_model(model, train_loader, val_loader, optimizer, scheduler, num_epochs, device, optimizer_name):
    model.train()
    best_val_accuracy = 0
    training_stats = []
    
    for epoch in range(num_epochs):
        total_loss = 0
        total_correct = 0
        total_samples = 0
        
        # Training phase
        model.train()
        for batch_idx, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            
            outputs = model(input_ids=input_ids, 
                          attention_mask=attention_mask, 
                          labels=labels)
            
            loss = outputs.loss
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            if scheduler:
                scheduler.step()
            
            total_loss += loss.item()
            
            # Calculate accuracy
            predictions = torch.argmax(outputs.logits, dim=-1)
            total_correct += (predictions == labels).sum().item()
            total_samples += labels.size(0)
            
            # Log every 100 batches
            if batch_idx % 100 == 0:
                current_lr = optimizer.param_groups[0]['lr']
                wandb.log({
                    f"{optimizer_name}/train_loss_step": loss.item(),
                    f"{optimizer_name}/learning_rate": current_lr,
                    f"{optimizer_name}/epoch": epoch,
                    f"{optimizer_name}/step": epoch * len(train_loader) + batch_idx
                })
        
        # Calculate epoch metrics
        avg_train_loss = total_loss / len(train_loader)
        train_accuracy = total_correct / total_samples
        
        # Validation phase
        val_loss, val_accuracy, val_f1, val_precision, val_recall = evaluate_model(
            model, val_loader, device
        )
        
        # Log epoch metrics
        wandb.log({
            f"{optimizer_name}/epoch": epoch,
            f"{optimizer_name}/train_loss": avg_train_loss,
            f"{optimizer_name}/train_accuracy": train_accuracy,
            f"{optimizer_name}/val_loss": val_loss,
            f"{optimizer_name}/val_accuracy": val_accuracy,
            f"{optimizer_name}/val_f1": val_f1,
            f"{optimizer_name}/val_precision": val_precision,
            f"{optimizer_name}/val_recall": val_recall
        })
        
        print(f"Epoch {epoch+1}/{num_epochs} - {optimizer_name}")
        print(f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")
        print("-" * 50)
        
        # Save best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
        
        # Store training statistics
        training_stats.append({
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'train_accuracy': train_accuracy,
            'val_loss': val_loss,
            'val_accuracy': val_accuracy,
            'val_f1': val_f1,
            'val_precision': val_precision,
            'val_recall': val_recall
        })
    
    return training_stats, best_val_accuracy

In [13]:
# Evaluation function
def evaluate_model(model, data_loader, device):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, 
                          attention_mask=attention_mask, 
                          labels=labels)
            
            total_loss += outputs.loss.item()
            
            predictions = torch.argmax(outputs.logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions, average='weighted')
    precision = precision_score(all_labels, all_predictions, average='weighted')
    recall = recall_score(all_labels, all_predictions, average='weighted')
    
    return avg_loss, accuracy, f1, precision, recall


In [14]:
# Optimizer setup functions
def get_optimizer_and_scheduler(model, optimizer_name, train_loader, num_epochs):
    num_training_steps = len(train_loader) * num_epochs
    
    if optimizer_name == "AdamW":
        optimizer = AdamW(
            model.parameters(),
            lr=Config.learning_rate,
            weight_decay=Config.weight_decay,
            eps=1e-8
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=Config.warmup_steps,
            num_training_steps=num_training_steps
        )
    
    elif optimizer_name == "LAMB":
        # Using AdamW with different parameters to simulate LAMB behavior
        optimizer = AdamW(
            model.parameters(),
            lr=Config.learning_rate * 2,  # LAMB typically uses higher learning rates
            weight_decay=Config.weight_decay,
            eps=1e-6,
            betas=(0.9, 0.999)
        )
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=Config.warmup_steps,
            num_training_steps=num_training_steps
        )
    
    elif optimizer_name == "SGD_warmup":
        optimizer = torch.optim.SGD(
            model.parameters(),
            lr=Config.learning_rate * 10,  # SGD typically needs higher learning rates
            weight_decay=Config.weight_decay,
            momentum=0.9
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=Config.warmup_steps,
            num_training_steps=num_training_steps
        )
    
    elif optimizer_name == "Adafactor":
        optimizer = Adafactor(
            model.parameters(),
            lr=Config.learning_rate,
            weight_decay=Config.weight_decay,
            relative_step=False,
            scale_parameter=False
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=Config.warmup_steps,
            num_training_steps=num_training_steps
        )
    
    return optimizer, scheduler

In [15]:
# Main training loop for all optimizers
def run_optimization_comparison():
    # Load data
    train_texts, train_labels, val_texts, val_labels = load_sst2_data()
    
    # Initialize tokenizer
    tokenizer = BertTokenizer.from_pretrained(Config.model_name)
    
    # Create datasets
    train_dataset = SST2Dataset(train_texts, train_labels, tokenizer, Config.max_length)
    val_dataset = SST2Dataset(val_texts, val_labels, tokenizer, Config.max_length)
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=Config.batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=Config.batch_size, shuffle=False)
    
    # Optimizers to compare
    optimizers = ["AdamW", "LAMB", "SGD_warmup", "Adafactor"]
    
    # Store results
    all_results = {}
    
    for optimizer_name in optimizers:
        print(f"\n{'='*60}")
        print(f"Training with {optimizer_name}")
        print(f"{'='*60}")
        
        # Initialize WandB run
        wandb.init(
            project=Config.wandb_project,
            entity=Config.wandb_entity,
            name=f"BERT-{optimizer_name}-{datetime.now().strftime('%Y%m%d_%H%M%S')}",
            config={
                "optimizer": optimizer_name,
                "model": Config.model_name,
                "batch_size": Config.batch_size,
                "learning_rate": Config.learning_rate,
                "epochs": Config.num_epochs,
                "max_length": Config.max_length,
                "weight_decay": Config.weight_decay,
                "warmup_steps": Config.warmup_steps
            }
        )
        
        # Initialize model
        model = BertForSequenceClassification.from_pretrained(
            Config.model_name,
            num_labels=2,
            output_attentions=False,
            output_hidden_states=False
        )
        model.to(Config.device)
        
        # Get optimizer and scheduler
        optimizer, scheduler = get_optimizer_and_scheduler(
            model, optimizer_name, train_loader, Config.num_epochs
        )
        
        # Train model
        start_time = time.time()
        training_stats, best_val_accuracy = train_model(
            model, train_loader, val_loader, optimizer, scheduler, 
            Config.num_epochs, Config.device, optimizer_name
        )
        end_time = time.time()
        
        training_time = end_time - start_time
        
        # Final evaluation
        final_val_loss, final_val_accuracy, final_val_f1, final_val_precision, final_val_recall = evaluate_model(
            model, val_loader, Config.device
        )
        
        # Store results
        all_results[optimizer_name] = {
            'training_stats': training_stats,
            'best_val_accuracy': best_val_accuracy,
            'final_val_accuracy': final_val_accuracy,
            'final_val_f1': final_val_f1,
            'final_val_precision': final_val_precision,
            'final_val_recall': final_val_recall,
            'training_time': training_time
        }
        
        # Log final metrics
        wandb.log({
            f"{optimizer_name}/final_val_accuracy": final_val_accuracy,
            f"{optimizer_name}/final_val_f1": final_val_f1,
            f"{optimizer_name}/best_val_accuracy": best_val_accuracy,
            f"{optimizer_name}/training_time": training_time
        })
        
        print(f"Best validation accuracy: {best_val_accuracy:.4f}")
        print(f"Final validation accuracy: {final_val_accuracy:.4f}")
        print(f"Training time: {training_time:.2f} seconds")
        
        wandb.finish()
    
    return all_results

In [17]:
# Export results to Excel
def export_to_excel(results, filename="bert_optimization_results.xlsx"):
    # Create summary dataframe
    summary_data = []
    detailed_data = []
    
    for optimizer_name, result in results.items():
        # Summary statistics
        summary_data.append({
            'Optimizer': optimizer_name,
            'Best_Val_Accuracy': result['best_val_accuracy'],
            'Final_Val_Accuracy': result['final_val_accuracy'],
            'Final_Val_F1': result['final_val_f1'],
            'Final_Val_Precision': result['final_val_precision'],
            'Final_Val_Recall': result['final_val_recall'],
            'Training_Time_seconds': result['training_time']
        })
        
        # Detailed epoch-by-epoch results
        for epoch_stats in result['training_stats']:
            detailed_data.append({
                'Optimizer': optimizer_name,
                **epoch_stats
            })
    
    # Create DataFrames
    summary_df = pd.DataFrame(summary_data)
    detailed_df = pd.DataFrame(detailed_data)
    
    # Export to Excel with multiple sheets
    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        summary_df.to_excel(writer, sheet_name='Summary', index=False)
        detailed_df.to_excel(writer, sheet_name='Detailed_Results', index=False)
        
        # Create comparison sheet
        comparison_df = summary_df.copy()
        comparison_df = comparison_df.sort_values('Best_Val_Accuracy', ascending=False)
        comparison_df['Rank'] = range(1, len(comparison_df) + 1)
        comparison_df.to_excel(writer, sheet_name='Comparison_Ranking', index=False)
    
    print(f"Results exported to {filename}")
    
    # Display summary
    print("\n" + "="*80)
    print("OPTIMIZATION COMPARISON SUMMARY")
    print("="*80)
    print(summary_df.to_string(index=False))
    
    return summary_df, detailed_df

In [18]:
# Run the complete comparison
if __name__ == "__main__":
    print("Starting BERT Fine-tuning Optimization Comparison")
    print(f"Device: {Config.device}")
    print(f"Model: {Config.model_name}")
    print(f"Epochs: {Config.num_epochs}")
    print(f"Batch size: {Config.batch_size}")
    print(f"Learning rate: {Config.learning_rate}")
    
    # Run the comparison
    results = run_optimization_comparison()
    
    # Export results
    summary_df, detailed_df = export_to_excel(results)
    
    print("\nComparison completed successfully!")
    print("Check your WandB dashboard for detailed metrics and visualizations.")


Starting BERT Fine-tuning Optimization Comparison
Device: cuda
Model: bert-base-uncased
Epochs: 3
Batch size: 16
Learning rate: 2e-05
Loading SST-2 dataset...


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Training samples: 67349
Validation samples: 872


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]


Training with AdamW


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - AdamW
Train Loss: 0.2397, Train Acc: 0.9107
Val Loss: 0.2912, Val Acc: 0.9186, Val F1: 0.9184
--------------------------------------------------
Epoch 2/3 - AdamW
Train Loss: 0.1256, Train Acc: 0.9652
Val Loss: 0.3065, Val Acc: 0.9220, Val F1: 0.9220
--------------------------------------------------
Epoch 3/3 - AdamW
Train Loss: 0.0778, Train Acc: 0.9796
Val Loss: 0.3490, Val Acc: 0.9243, Val F1: 0.9243
--------------------------------------------------
Best validation accuracy: 0.9243
Final validation accuracy: 0.9243
Training time: 4551.55 seconds


0,1
AdamW/best_val_accuracy,▁
AdamW/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅██████████
AdamW/final_val_accuracy,▁
AdamW/final_val_f1,▁
AdamW/learning_rate,██▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁
AdamW/step,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
AdamW/train_accuracy,▁▇█
AdamW/train_loss,█▃▁
AdamW/train_loss_step,█▄▂▂▁▇▄▄▁▁▁▁▅▁▄▅▁▁▁▁▄▄▁▁▁▁▁▁▁▁▁▁▁▅▅▁▁▁▁▁
AdamW/training_time,▁

0,1
AdamW/best_val_accuracy,0.92431
AdamW/epoch,2.0
AdamW/final_val_accuracy,0.92431
AdamW/final_val_f1,0.92429
AdamW/learning_rate,0.0
AdamW/step,12620.0
AdamW/train_accuracy,0.9796
AdamW/train_loss,0.07783
AdamW/train_loss_step,0.00736
AdamW/training_time,4551.54547



Training with LAMB


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - LAMB
Train Loss: 0.2442, Train Acc: 0.9096
Val Loss: 0.3157, Val Acc: 0.9209, Val F1: 0.9209
--------------------------------------------------
Epoch 2/3 - LAMB
Train Loss: 0.1263, Train Acc: 0.9643
Val Loss: 0.3017, Val Acc: 0.9174, Val F1: 0.9174
--------------------------------------------------
Epoch 3/3 - LAMB
Train Loss: 0.0665, Train Acc: 0.9824
Val Loss: 0.3592, Val Acc: 0.9151, Val F1: 0.9151
--------------------------------------------------
Best validation accuracy: 0.9209
Final validation accuracy: 0.9151
Training time: 4565.73 seconds


0,1
LAMB/best_val_accuracy,▁
LAMB/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅███████████████
LAMB/final_val_accuracy,▁
LAMB/final_val_f1,▁
LAMB/learning_rate,▁▂▄▇██████▇▇▇▇▇▅▅▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁
LAMB/step,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
LAMB/train_accuracy,▁▆█
LAMB/train_loss,█▃▁
LAMB/train_loss_step,▅▄▄▃▃▃▃▂▃▃▁▁▁▁▂▂▄▁▃▃▁▁█▁▁▁▁▁▁▁▃▁▁▁▂▁▁▁▁▁
LAMB/training_time,▁

0,1
LAMB/best_val_accuracy,0.92087
LAMB/epoch,2.0
LAMB/final_val_accuracy,0.91514
LAMB/final_val_f1,0.91511
LAMB/learning_rate,0.0
LAMB/step,12620.0
LAMB/train_accuracy,0.98238
LAMB/train_loss,0.06649
LAMB/train_loss_step,0.12839
LAMB/training_time,4565.73179



Training with SGD_warmup


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - SGD_warmup
Train Loss: 0.4168, Train Acc: 0.7934
Val Loss: 0.3410, Val Acc: 0.8635, Val F1: 0.8632
--------------------------------------------------
Epoch 2/3 - SGD_warmup
Train Loss: 0.2992, Train Acc: 0.8772
Val Loss: 0.3058, Val Acc: 0.8727, Val F1: 0.8726
--------------------------------------------------
Epoch 3/3 - SGD_warmup
Train Loss: 0.2871, Train Acc: 0.8826
Val Loss: 0.3108, Val Acc: 0.8670, Val F1: 0.8668
--------------------------------------------------
Best validation accuracy: 0.8727
Final validation accuracy: 0.8670
Training time: 4360.56 seconds


0,1
SGD_warmup/best_val_accuracy,▁
SGD_warmup/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅████████████████
SGD_warmup/final_val_accuracy,▁
SGD_warmup/final_val_f1,▁
SGD_warmup/learning_rate,▁▂▄▅▇█▇▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁
SGD_warmup/step,▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇██
SGD_warmup/train_accuracy,▁██
SGD_warmup/train_loss,█▂▁
SGD_warmup/train_loss_step,██▇█▄▃▃▇▅▃▄▂▃▅▂▅▄▂▂▇▄▄▅▂▇▃▃▂▄▁▅▂▆▅▅▂▅▄▃▄
SGD_warmup/training_time,▁

0,1
SGD_warmup/best_val_accuracy,0.87271
SGD_warmup/epoch,2.0
SGD_warmup/final_val_accuracy,0.86697
SGD_warmup/final_val_f1,0.86678
SGD_warmup/learning_rate,0.0
SGD_warmup/step,12620.0
SGD_warmup/train_accuracy,0.88257
SGD_warmup/train_loss,0.28707
SGD_warmup/train_loss_step,0.38142
SGD_warmup/training_time,4360.56013



Training with Adafactor


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Adafactor
Train Loss: 0.2446, Train Acc: 0.9042
Val Loss: 0.2943, Val Acc: 0.9255, Val F1: 0.9253
--------------------------------------------------
Epoch 2/3 - Adafactor
Train Loss: 0.1309, Train Acc: 0.9656
Val Loss: 0.3275, Val Acc: 0.9289, Val F1: 0.9289
--------------------------------------------------
Epoch 3/3 - Adafactor
Train Loss: 0.0825, Train Acc: 0.9802
Val Loss: 0.3898, Val Acc: 0.9243, Val F1: 0.9243
--------------------------------------------------
Best validation accuracy: 0.9289
Final validation accuracy: 0.9243
Training time: 4709.08 seconds


0,1
Adafactor/best_val_accuracy,▁
Adafactor/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅█████████
Adafactor/final_val_accuracy,▁
Adafactor/final_val_f1,▁
Adafactor/learning_rate,▄████▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁
Adafactor/step,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇█
Adafactor/train_accuracy,▁▇█
Adafactor/train_loss,█▃▁
Adafactor/train_loss_step,█▃▅▃▃▄▁▂▇▂▃▁▁▁▂▇▂▆▂▁▄▄█▁▆▁▁▁▁▁▄▄▁▁▁▆▁▄▅▁
Adafactor/training_time,▁

0,1
Adafactor/best_val_accuracy,0.9289
Adafactor/epoch,2.0
Adafactor/final_val_accuracy,0.92431
Adafactor/final_val_f1,0.9243
Adafactor/learning_rate,0.0
Adafactor/step,12620.0
Adafactor/train_accuracy,0.98022
Adafactor/train_loss,0.08246
Adafactor/train_loss_step,0.00186
Adafactor/training_time,4709.08238


Results exported to bert_optimization_results.xlsx

OPTIMIZATION COMPARISON SUMMARY
 Optimizer  Best_Val_Accuracy  Final_Val_Accuracy  Final_Val_F1  Final_Val_Precision  Final_Val_Recall  Training_Time_seconds
     AdamW           0.924312            0.924312      0.924286             0.924504          0.924312            4551.545467
      LAMB           0.920872            0.915138      0.915109             0.915321          0.915138            4565.731788
SGD_warmup           0.872706            0.866972      0.866781             0.868139          0.866972            4360.560130
 Adafactor           0.928899            0.924312      0.924299             0.924373          0.924312            4709.082385

Comparison completed successfully!
Check your WandB dashboard for detailed metrics and visualizations.
