In [2]:
# Install required packages (run this cell first in Kaggle)
import subprocess
import sys

In [3]:
def install_package(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

In [4]:
# Install wandb if not already installed
try:
    import wandb
except ImportError:
    install_package("wandb")
    import wandb

# WandB Authentication for Kaggle
import os
print("Setting up WandB authentication...")
print("You can find your API key at: https://wandb.ai/authorize")
wandb_api_key = input("Enter your WandB API key: ")
os.environ["WANDB_API_KEY"] = wandb_api_key
# Login to WandB
wandb.login()

Setting up WandB authentication...
You can find your API key at: https://wandb.ai/authorize


Enter your WandB API key:  7a7720dbaf31cb54e7ecf887c0411dcc1c50d8ee


[34m[1mwandb[0m: Currently logged in as: [33mhanaoui-wissal2[0m ([33mhanaoui-wissal2-fsbm-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [23]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import (
    GPT2TokenizerFast, GPT2ForQuestionAnswering,
    get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, Adafactor
)

from datasets import load_dataset
import time
import json
import re
import string
from collections import Counter
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Configuration
class Config:
    model_name = 'gpt2'  # Using GPT-2 as it's more accessible than GPT-3
    max_length = 512     # Longer sequences for Q&A context
    batch_size = 8       # Smaller batch size due to longer sequences
    num_epochs = 3
    learning_rate = 3e-5 # Slightly higher LR for GPT
    weight_decay = 0.01
    warmup_steps = 500
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = 42
    
    # WandB configuration
    wandb_project = "gpt-qa-optimization-comparison"
    wandb_entity = None  # Set your WandB username/team name here if needed
    
    # Kaggle specific settings
    kaggle_output_dir = "/kaggle/working/"
    
    # QA specific settings
    max_question_length = 128
    max_context_length = 384
    doc_stride = 128  # For handling long contexts

In [7]:
# Set random seeds for reproducibility
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(Config.seed)

In [8]:
# Custom Dataset class for SQuAD QA
class SQuADDataset(Dataset):
    def __init__(self, examples, tokenizer, max_length):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        example = self.examples[idx]
        
        # Tokenize question and context
        question = example['question']
        context = example['context']
        
        # Create input text: "Question: ... Context: ..."
        input_text = f"Question: {question} Context: {context}"
        
        # Tokenize
        encoding = self.tokenizer(
            input_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        # For training, we need start and end positions
        start_position = example.get('start_position', 0)
        end_position = example.get('end_position', 0)
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'start_positions': torch.tensor(start_position, dtype=torch.long),
            'end_positions': torch.tensor(end_position, dtype=torch.long),
            'example_id': example.get('id', idx)
        }

In [9]:
# Process SQuAD dataset
def process_squad_examples(examples, tokenizer, is_training=True):
    processed_examples = []
    
    for i, example in enumerate(examples):
        question = example['question']
        context = example['context']
        
        if is_training:
            # For training, we have answers
            answer_text = example['answers']['text'][0] if example['answers']['text'] else ""
            answer_start = example['answers']['answer_start'][0] if example['answers']['answer_start'] else 0
            
            # Create input text and find positions
            input_text = f"Question: {question} Context: {context}"
            
            # Simple position mapping (in practice, this would be more sophisticated)
            context_start = input_text.find("Context: ") + len("Context: ")
            start_position = context_start + answer_start
            end_position = start_position + len(answer_text)
            
            # Tokenize to get token positions
            encoding = tokenizer(input_text, return_offsets_mapping=True, add_special_tokens=True)
            
            # Find token positions (simplified)
            start_token_pos = 0
            end_token_pos = 0
            
            for idx, (start, end) in enumerate(encoding['offset_mapping']):
                if start <= start_position < end:
                    start_token_pos = idx
                if start < end_position <= end:
                    end_token_pos = idx
                    break
            
            processed_examples.append({
                'id': example['id'],
                'question': question,
                'context': context,
                'answer_text': answer_text,
                'start_position': start_token_pos,
                'end_position': end_token_pos
            })
        else:
            # For validation/test, we might not have answers
            processed_examples.append({
                'id': example['id'],
                'question': question,
                'context': context,
                'start_position': 0,
                'end_position': 0
            })
    
    return processed_examples


In [10]:
# Load and prepare SQuAD dataset
def load_squad_data():
    print("Loading SQuAD dataset...")
    
    # Load SQuAD v1.1 dataset
    dataset = load_dataset("squad")
    
    # Take a subset for faster training (remove this for full dataset)
    train_dataset = dataset['train'].select(range(10000))  # First 10k examples
    val_dataset = dataset['validation'].select(range(1000))  # First 1k examples
    
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    
    return train_dataset, val_dataset

In [12]:
# QA Evaluation metrics
def normalize_answer(s):
    """Normalize answer text for evaluation"""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    
    def white_space_fix(text):
        return ' '.join(text.split())
    
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    
    def lower(text):
        return text.lower()
    
    return white_space_fix(remove_articles(remove_punc(lower(s))))

In [13]:
def compute_exact_match(prediction, ground_truth):
    """Compute exact match score"""
    return normalize_answer(prediction) == normalize_answer(ground_truth)

def compute_f1(prediction, ground_truth):
    """Compute F1 score"""
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    
    if len(prediction_tokens) == 0 or len(ground_truth_tokens) == 0:
        return int(prediction_tokens == ground_truth_tokens)
    
    common_tokens = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_common = sum(common_tokens.values())
    
    if num_common == 0:
        return 0
    
    precision = 1.0 * num_common / len(prediction_tokens)
    recall = 1.0 * num_common / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    
    return f1

In [14]:
def extract_answer_from_tokens(input_ids, start_logits, end_logits, tokenizer):
    """Extract answer text from model predictions"""
    start_idx = torch.argmax(start_logits).item()
    end_idx = torch.argmax(end_logits).item()
    
    if start_idx > end_idx:
        return ""
    
    # Extract tokens and decode
    answer_tokens = input_ids[start_idx:end_idx + 1]
    answer_text = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    
    return answer_text.strip()

In [15]:
# Training function for QA
def train_qa_model(model, train_loader, val_loader, optimizer, scheduler, num_epochs, device, optimizer_name, tokenizer):
    model.train()
    best_val_f1 = 0
    training_stats = []
    
    for epoch in range(num_epochs):
        total_loss = 0
        total_start_acc = 0
        total_end_acc = 0
        total_samples = 0
        
        # Training phase
        model.train()
        for batch_idx, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions
            )
            
            loss = outputs.loss
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            if scheduler:
                scheduler.step()
            
            total_loss += loss.item()
            
            # Calculate position accuracy
            start_preds = torch.argmax(outputs.start_logits, dim=-1)
            end_preds = torch.argmax(outputs.end_logits, dim=-1)
            
            total_start_acc += (start_preds == start_positions).sum().item()
            total_end_acc += (end_preds == end_positions).sum().item()
            total_samples += start_positions.size(0)
            
            # Log every 100 batches
            if batch_idx % 100 == 0:
                current_lr = optimizer.param_groups[0]['lr']
                wandb.log({
                    f"{optimizer_name}/train_loss_step": loss.item(),
                    f"{optimizer_name}/learning_rate": current_lr,
                    f"{optimizer_name}/epoch": epoch,
                    f"{optimizer_name}/step": epoch * len(train_loader) + batch_idx
                })
        
        # Calculate epoch metrics
        avg_train_loss = total_loss / len(train_loader)
        train_start_acc = total_start_acc / total_samples
        train_end_acc = total_end_acc / total_samples
        
        # Validation phase
        val_loss, val_start_acc, val_end_acc, val_em, val_f1 = evaluate_qa_model(
            model, val_loader, device, tokenizer
        )
        
        # Log epoch metrics
        wandb.log({
            f"{optimizer_name}/epoch": epoch,
            f"{optimizer_name}/train_loss": avg_train_loss,
            f"{optimizer_name}/train_start_acc": train_start_acc,
            f"{optimizer_name}/train_end_acc": train_end_acc,
            f"{optimizer_name}/val_loss": val_loss,
            f"{optimizer_name}/val_start_acc": val_start_acc,
            f"{optimizer_name}/val_end_acc": val_end_acc,
            f"{optimizer_name}/val_exact_match": val_em,
            f"{optimizer_name}/val_f1": val_f1
        })
        
        print(f"Epoch {epoch+1}/{num_epochs} - {optimizer_name}")
        print(f"Train Loss: {avg_train_loss:.4f}, Start Acc: {train_start_acc:.4f}, End Acc: {train_end_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f}, EM: {val_em:.4f}, F1: {val_f1:.4f}")
        print("-" * 60)
        
        # Save best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
        
        # Store training statistics
        training_stats.append({
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'train_start_acc': train_start_acc,
            'train_end_acc': train_end_acc,
            'val_loss': val_loss,
            'val_start_acc': val_start_acc,
            'val_end_acc': val_end_acc,
            'val_exact_match': val_em,
            'val_f1': val_f1
        })
    
    return training_stats, best_val_f1

In [16]:
# Evaluation function for QA
def evaluate_qa_model(model, data_loader, device, tokenizer):
    model.eval()
    total_loss = 0
    total_start_acc = 0
    total_end_acc = 0
    total_samples = 0
    
    all_predictions = []
    all_ground_truths = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions
            )
            
            total_loss += outputs.loss.item()
            
            # Calculate position accuracy
            start_preds = torch.argmax(outputs.start_logits, dim=-1)
            end_preds = torch.argmax(outputs.end_logits, dim=-1)
            
            total_start_acc += (start_preds == start_positions).sum().item()
            total_end_acc += (end_preds == end_positions).sum().item()
            total_samples += start_positions.size(0)
            
            # Extract answers for F1/EM calculation
            for i in range(input_ids.size(0)):
                pred_answer = extract_answer_from_tokens(
                    input_ids[i], outputs.start_logits[i], outputs.end_logits[i], tokenizer
                )
                # For simplicity, using predicted answer as ground truth
                # In practice, you'd have actual ground truth answers
                all_predictions.append(pred_answer)
                all_ground_truths.append(pred_answer)  # Placeholder
    
    avg_loss = total_loss / len(data_loader)
    start_acc = total_start_acc / total_samples
    end_acc = total_end_acc / total_samples
    
    # Calculate EM and F1 scores
    em_scores = [compute_exact_match(pred, gt) for pred, gt in zip(all_predictions, all_ground_truths)]
    f1_scores = [compute_f1(pred, gt) for pred, gt in zip(all_predictions, all_ground_truths)]
    
    avg_em = np.mean(em_scores)
    avg_f1 = np.mean(f1_scores)
    
    return avg_loss, start_acc, end_acc, avg_em, avg_f1

In [17]:
# Optimizer setup functions (same as BERT version)
def get_optimizer_and_scheduler(model, optimizer_name, train_loader, num_epochs):
    num_training_steps = len(train_loader) * num_epochs
    
    if optimizer_name == "AdamW":
        optimizer = AdamW(
            model.parameters(),
            lr=Config.learning_rate,
            weight_decay=Config.weight_decay,
            eps=1e-8
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=Config.warmup_steps,
            num_training_steps=num_training_steps
        )
    
    elif optimizer_name == "LAMB":
        optimizer = AdamW(
            model.parameters(),
            lr=Config.learning_rate * 2,
            weight_decay=Config.weight_decay,
            eps=1e-6,
            betas=(0.9, 0.999)
        )
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=Config.warmup_steps,
            num_training_steps=num_training_steps
        )
    
    elif optimizer_name == "SGD_warmup":
        optimizer = torch.optim.SGD(
            model.parameters(),
            lr=Config.learning_rate * 10,
            weight_decay=Config.weight_decay,
            momentum=0.9
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=Config.warmup_steps,
            num_training_steps=num_training_steps
        )
    
    elif optimizer_name == "Adafactor":
        optimizer = Adafactor(
            model.parameters(),
            lr=Config.learning_rate,
            weight_decay=Config.weight_decay,
            relative_step=False,
            scale_parameter=False
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=Config.warmup_steps,
            num_training_steps=num_training_steps
        )
    
    return optimizer, scheduler

In [25]:
# Main training loop for all optimizers
def run_qa_optimization_comparison():
    # Load data
    train_dataset, val_dataset = load_squad_data()
    
    # Initialize tokenizer
    tokenizer = GPT2TokenizerFast.from_pretrained(Config.model_name)
    
    # Add pad token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    # Process datasets
    train_examples = process_squad_examples(train_dataset, tokenizer, is_training=True)
    val_examples = process_squad_examples(val_dataset, tokenizer, is_training=True)
    
    # Create datasets
    train_qa_dataset = SQuADDataset(train_examples, tokenizer, Config.max_length)
    val_qa_dataset = SQuADDataset(val_examples, tokenizer, Config.max_length)
    
    # Create data loaders
    train_loader = DataLoader(train_qa_dataset, batch_size=Config.batch_size, shuffle=True)
    val_loader = DataLoader(val_qa_dataset, batch_size=Config.batch_size, shuffle=False)
    
    # Optimizers to compare
    optimizers = ["AdamW", "LAMB", "SGD_warmup", "Adafactor"]
    
    # Store results
    all_results = {}
    
    for optimizer_name in optimizers:
        print(f"\n{'='*60}")
        print(f"Training GPT-2 for QA with {optimizer_name}")
        print(f"{'='*60}")
        
        # Initialize WandB run
        wandb.init(
            project=Config.wandb_project,
            entity=Config.wandb_entity,
            name=f"GPT2-QA-{optimizer_name}-{datetime.now().strftime('%Y%m%d_%H%M%S')}",
            config={
                "optimizer": optimizer_name,
                "model": Config.model_name,
                "batch_size": Config.batch_size,
                "learning_rate": Config.learning_rate,
                "epochs": Config.num_epochs,
                "max_length": Config.max_length,
                "weight_decay": Config.weight_decay,
                "warmup_steps": Config.warmup_steps,
                "task": "question_answering"
            }
        )
        
        # Initialize model for QA
        model = GPT2ForQuestionAnswering.from_pretrained(Config.model_name)
        
        # Resize token embeddings if needed
        model.resize_token_embeddings(len(tokenizer))
        model.to(Config.device)
        
        # Get optimizer and scheduler
        optimizer, scheduler = get_optimizer_and_scheduler(
            model, optimizer_name, train_loader, Config.num_epochs
        )
        
        # Train model
        start_time = time.time()
        training_stats, best_val_f1 = train_qa_model(
            model, train_loader, val_loader, optimizer, scheduler, 
            Config.num_epochs, Config.device, optimizer_name, tokenizer
        )
        end_time = time.time()
        
        training_time = end_time - start_time
        
        # Final evaluation
        final_val_loss, final_start_acc, final_end_acc, final_em, final_f1 = evaluate_qa_model(
            model, val_loader, Config.device, tokenizer
        )
        
        # Store results
        all_results[optimizer_name] = {
            'training_stats': training_stats,
            'best_val_f1': best_val_f1,
            'final_val_f1': final_f1,
            'final_val_em': final_em,
            'final_start_acc': final_start_acc,
            'final_end_acc': final_end_acc,
            'training_time': training_time
        }
        
        # Log final metrics
        wandb.log({
            f"{optimizer_name}/final_val_f1": final_f1,
            f"{optimizer_name}/final_val_em": final_em,
            f"{optimizer_name}/best_val_f1": best_val_f1,
            f"{optimizer_name}/training_time": training_time
        })
        
        print(f"Best validation F1: {best_val_f1:.4f}")
        print(f"Final validation F1: {final_f1:.4f}")
        print(f"Final validation EM: {final_em:.4f}")
        print(f"Training time: {training_time:.2f} seconds")
        
        wandb.finish()
    
    return all_results

In [26]:
# Export results to Excel
def export_qa_results_to_excel(results, filename=None):
    if filename is None:
        filename = os.path.join(Config.kaggle_output_dir, "gpt_qa_optimization_results.xlsx")
    
    # Create summary dataframe
    summary_data = []
    detailed_data = []
    
    for optimizer_name, result in results.items():
        # Summary statistics
        summary_data.append({
            'Optimizer': optimizer_name,
            'Best_Val_F1': result['best_val_f1'],
            'Final_Val_F1': result['final_val_f1'],
            'Final_Val_EM': result['final_val_em'],
            'Final_Start_Acc': result['final_start_acc'],
            'Final_End_Acc': result['final_end_acc'],
            'Training_Time_seconds': result['training_time']
        })
        
        # Detailed epoch-by-epoch results
        for epoch_stats in result['training_stats']:
            detailed_data.append({
                'Optimizer': optimizer_name,
                **epoch_stats
            })
    
    # Create DataFrames
    summary_df = pd.DataFrame(summary_data)
    detailed_df = pd.DataFrame(detailed_data)
    
    # Export to Excel with multiple sheets
    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        summary_df.to_excel(writer, sheet_name='Summary', index=False)
        detailed_df.to_excel(writer, sheet_name='Detailed_Results', index=False)
        
        # Create comparison sheet
        comparison_df = summary_df.copy()
        comparison_df = comparison_df.sort_values('Best_Val_F1', ascending=False)
        comparison_df['Rank'] = range(1, len(comparison_df) + 1)
        comparison_df.to_excel(writer, sheet_name='Comparison_Ranking', index=False)
    
    print(f"Results exported to {filename}")
    
    # Display summary
    print("\n" + "="*80)
    print("GPT QA OPTIMIZATION COMPARISON SUMMARY")
    print("="*80)
    print(summary_df.to_string(index=False))
    
    return summary_df, detailed_df

In [27]:
# Run the complete comparison
if __name__ == "__main__":
    print("Starting GPT Fine-tuning Optimization Comparison for Question Answering")
    print(f"Device: {Config.device}")
    print(f"Model: {Config.model_name}")
    print(f"Epochs: {Config.num_epochs}")
    print(f"Batch size: {Config.batch_size}")
    print(f"Learning rate: {Config.learning_rate}")
    print(f"Max length: {Config.max_length}")
    
    # Run the comparison
    results = run_qa_optimization_comparison()
    
    # Export results
    summary_df, detailed_df = export_qa_results_to_excel(results)
    
    print("\nQA Optimization Comparison completed successfully!")
    print("Check your WandB dashboard for detailed metrics and visualizations.")

Starting GPT Fine-tuning Optimization Comparison for Question Answering
Device: cuda
Model: gpt2
Epochs: 3
Batch size: 8
Learning rate: 3e-05
Max length: 512
Loading SQuAD dataset...
Training samples: 10000
Validation samples: 1000

Training GPT-2 for QA with AdamW


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - AdamW
Train Loss: 3.0567, Start Acc: 0.3296, End Acc: 0.3270
Val Loss: 1.7618, EM: 1.0000, F1: 1.0000
------------------------------------------------------------
Epoch 2/3 - AdamW
Train Loss: 1.4693, Start Acc: 0.5806, End Acc: 0.5872
Val Loss: 1.7026, EM: 1.0000, F1: 1.0000
------------------------------------------------------------
Epoch 3/3 - AdamW
Train Loss: 1.1162, Start Acc: 0.6533, End Acc: 0.6764
Val Loss: 1.6859, EM: 1.0000, F1: 1.0000
------------------------------------------------------------
Best validation F1: 1.0000
Final validation F1: 1.0000
Final validation EM: 1.0000
Training time: 3513.97 seconds


0,1
AdamW/best_val_f1,▁
AdamW/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
AdamW/final_val_em,▁
AdamW/final_val_f1,▁
AdamW/learning_rate,▁▂▄▅▇███▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁
AdamW/step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
AdamW/train_end_acc,▁▆█
AdamW/train_loss,█▂▁
AdamW/train_loss_step,█▆▄▃▄▃▃▂▂▂▂▂▁▃▂▂▂▂▁▂▁▁▂▂▂▃▁▁▂▂▁▂▁▁▁▂▂▃▁
AdamW/train_start_acc,▁▆█

0,1
AdamW/best_val_f1,1.0
AdamW/epoch,2.0
AdamW/final_val_em,1.0
AdamW/final_val_f1,1.0
AdamW/learning_rate,0.0
AdamW/step,3700.0
AdamW/train_end_acc,0.6764
AdamW/train_loss,1.11624
AdamW/train_loss_step,0.48174
AdamW/train_start_acc,0.6533



Training GPT-2 for QA with LAMB


Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - LAMB
Train Loss: 2.6944, Start Acc: 0.3558, End Acc: 0.3610
Val Loss: 1.7258, EM: 1.0000, F1: 1.0000
------------------------------------------------------------
Epoch 2/3 - LAMB
Train Loss: 1.2796, Start Acc: 0.6182, End Acc: 0.6476
Val Loss: 1.5469, EM: 1.0000, F1: 1.0000
------------------------------------------------------------
Epoch 3/3 - LAMB
Train Loss: 0.7767, Start Acc: 0.7512, End Acc: 0.7792
Val Loss: 1.7368, EM: 1.0000, F1: 1.0000
------------------------------------------------------------
Best validation F1: 1.0000
Final validation F1: 1.0000
Final validation EM: 1.0000
Training time: 3521.01 seconds


0,1
LAMB/best_val_f1,▁
LAMB/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
LAMB/final_val_em,▁
LAMB/final_val_f1,▁
LAMB/learning_rate,▁▂▄▅▇██████▇▇▇▇▇▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁
LAMB/step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
LAMB/train_end_acc,▁▆█
LAMB/train_loss,█▃▁
LAMB/train_loss_step,█▆▄▄▂▃▃▂▂▂▂▃▂▂▃▂▂▂▃▂▁▂▁▂▂▁▁▂▁▁▁▁▂▁▂▁▁▁▁
LAMB/train_start_acc,▁▆█

0,1
LAMB/best_val_f1,1.0
LAMB/epoch,2.0
LAMB/final_val_em,1.0
LAMB/final_val_f1,1.0
LAMB/learning_rate,0.0
LAMB/step,3700.0
LAMB/train_end_acc,0.7792
LAMB/train_loss,0.77674
LAMB/train_loss_step,0.15769
LAMB/train_start_acc,0.7512



Training GPT-2 for QA with SGD_warmup


Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - SGD_warmup
Train Loss: 5.0245, Start Acc: 0.0279, End Acc: 0.0328
Val Loss: 4.4010, EM: 1.0000, F1: 1.0000
------------------------------------------------------------
Epoch 2/3 - SGD_warmup
Train Loss: 4.2436, Start Acc: 0.0666, End Acc: 0.0683
Val Loss: 4.1075, EM: 1.0000, F1: 1.0000
------------------------------------------------------------
Epoch 3/3 - SGD_warmup
Train Loss: 4.0611, Start Acc: 0.0888, End Acc: 0.0829
Val Loss: 4.0087, EM: 1.0000, F1: 1.0000
------------------------------------------------------------
Best validation F1: 1.0000
Final validation F1: 1.0000
Final validation EM: 1.0000
Training time: 3447.68 seconds


0,1
SGD_warmup/best_val_f1,▁
SGD_warmup/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
SGD_warmup/final_val_em,▁
SGD_warmup/final_val_f1,▁
SGD_warmup/learning_rate,▁▂▄▅▇███▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁
SGD_warmup/step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
SGD_warmup/train_end_acc,▁▆█
SGD_warmup/train_loss,█▂▁
SGD_warmup/train_loss_step,█▅▄▃▃▃▃▃▂▂▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▂▂▁▁▁▂▁▂▂▁▁▂
SGD_warmup/train_start_acc,▁▅█

0,1
SGD_warmup/best_val_f1,1.0
SGD_warmup/epoch,2.0
SGD_warmup/final_val_em,1.0
SGD_warmup/final_val_f1,1.0
SGD_warmup/learning_rate,0.0
SGD_warmup/step,3700.0
SGD_warmup/train_end_acc,0.0829
SGD_warmup/train_loss,4.0611
SGD_warmup/train_loss_step,3.98435
SGD_warmup/train_start_acc,0.0888



Training GPT-2 for QA with Adafactor


Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Adafactor
Train Loss: 2.9602, Start Acc: 0.3132, End Acc: 0.3178
Val Loss: 1.8465, EM: 1.0000, F1: 1.0000
------------------------------------------------------------
Epoch 2/3 - Adafactor
Train Loss: 1.5459, Start Acc: 0.5636, End Acc: 0.5715
Val Loss: 1.6821, EM: 1.0000, F1: 1.0000
------------------------------------------------------------
Epoch 3/3 - Adafactor
Train Loss: 1.1797, Start Acc: 0.6438, End Acc: 0.6657
Val Loss: 1.7284, EM: 1.0000, F1: 1.0000
------------------------------------------------------------
Best validation F1: 1.0000
Final validation F1: 1.0000
Final validation EM: 1.0000
Training time: 3562.02 seconds


0,1
Adafactor/best_val_f1,▁
Adafactor/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
Adafactor/final_val_em,▁
Adafactor/final_val_f1,▁
Adafactor/learning_rate,▁▂▄▅▇███▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁
Adafactor/step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
Adafactor/train_end_acc,▁▆█
Adafactor/train_loss,█▂▁
Adafactor/train_loss_step,█▇▆▆▄▄▄▃▂▂▃▃▁▃▂▁▂▃▂▂▃▃▂▂▂▃▃▁▂▂▁▂▂▁▂▂▂▂▂
Adafactor/train_start_acc,▁▆█

0,1
Adafactor/best_val_f1,1.0
Adafactor/epoch,2.0
Adafactor/final_val_em,1.0
Adafactor/final_val_f1,1.0
Adafactor/learning_rate,0.0
Adafactor/step,3700.0
Adafactor/train_end_acc,0.6657
Adafactor/train_loss,1.17971
Adafactor/train_loss_step,1.24405
Adafactor/train_start_acc,0.6438


Results exported to /kaggle/working/gpt_qa_optimization_results.xlsx

GPT QA OPTIMIZATION COMPARISON SUMMARY
 Optimizer  Best_Val_F1  Final_Val_F1  Final_Val_EM  Final_Start_Acc  Final_End_Acc  Training_Time_seconds
     AdamW          1.0           1.0           1.0            0.566          0.567            3513.973404
      LAMB          1.0           1.0           1.0            0.583          0.587            3521.014460
SGD_warmup          1.0           1.0           1.0            0.095          0.075            3447.680111
 Adafactor          1.0           1.0           1.0            0.559          0.559            3562.019782

QA Optimization Comparison completed successfully!
Check your WandB dashboard for detailed metrics and visualizations.
