# COE379L Project 3: Transformer Model Fine-Tuning and Evaluation

## Cross-Model Comparison for News Topic Classification

This notebook covers:
- RoBERTa-base model fine-tuning
- Hugging Face Transformers integration
- Model evaluation and performance metrics
- Training time and inference latency measurement
- Comparison with classical models


## 1. Import Required Libraries


In [3]:
# Standard library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
import os
warnings.filterwarnings('ignore')

# PyTorch
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# Hugging Face
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    TrainerCallback
)
from datasets import load_dataset

# Scikit-learn for metrics
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    log_loss,
    confusion_matrix,
    classification_report
)

# Plot defaults applied to every figure created later in the notebook.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Resolve the compute device once; `device` is reused by later cells.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda') if cuda_available else torch.device('cpu')
print(f"Using device: {device}")
if cuda_available:
    # Report details of GPU index 0 and the CUDA version torch was built with.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")

print("\nLibraries imported successfully!")


ModuleNotFoundError: No module named 'torch'

## 2. Load and Prepare Data


In [None]:
# Pull the AG News corpus through the Hugging Face `datasets` loader.
print("Loading AG News dataset...")
dataset = load_dataset("ag_news")

# Keep handles on the two splits used throughout the notebook.
train_data, test_data = dataset['train'], dataset['test']
print(f"Training samples: {len(train_data):,}\nTest samples: {len(test_data):,}")

# Human-readable class names.
# NOTE(review): presumably ordered to match the dataset's integer label ids —
# verify against the dataset's `label` feature.
class_labels = ['World', 'Sports', 'Business', 'Sci/Tech']
num_labels = len(class_labels)
print(f"Number of classes: {num_labels}")
print(f"Classes: {class_labels}")

# Peek at the first training record.
print("\nSample data:")
print(train_data[0])


## 3. Initialize RoBERTa Tokenizer and Model


In [None]:
# Checkpoint to fine-tune (RoBERTa-base, as specified in the requirements).
model_name = "roberta-base"
print(f"Loading tokenizer and model: {model_name}")

# Tokenizer and classification model both come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the model with one output per class, then place it on `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
).to(device)

# Parameter counts: all parameters vs. those with requires_grad set.
total_params = sum(param.numel() for param in model.parameters())
trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f"Model loaded successfully!")
print(f"Model parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Smoke-test the tokenizer on the first training example.
sample_text = train_data[0]['text']
print(f"\nSample text: {sample_text[:100]}...")
encoded = tokenizer(
    sample_text,
    truncation=True,
    padding='max_length',
    max_length=512,
    return_tensors='pt',
)
print(f"Tokenized shape: {encoded['input_ids'].shape}")


## 4. Preprocess Dataset for Training


In [None]:
def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch with the notebook's ``tokenizer``.

    Args:
        examples: Mapping that provides the raw text under the ``'text'`` key
            (as passed by ``Dataset.map``).

    Returns:
        The tokenizer output, truncated and padded to 512 tokens, with
        ``return_tensors=None`` (no tensor conversion requested).
    """
    encode_kwargs = dict(
        truncation=True,
        padding='max_length',  # every example is padded to the same length
        max_length=512,        # RoBERTa max length
        return_tensors=None,   # plain Python lists rather than tensors
    )
    return tokenizer(examples['text'], **encode_kwargs)

# Both splits are tokenized in batches; the raw text column is dropped afterwards.
map_kwargs = dict(batched=True, remove_columns=['text'])

print("Tokenizing training data...")
train_tokenized = train_data.map(tokenize_function, **map_kwargs)

print("Tokenizing test data...")
test_tokenized = test_data.map(tokenize_function, **map_kwargs)

print(f"\nTokenization complete!")
print(f"Training features: {train_tokenized.column_names}")
print(f"Test features: {test_tokenized.column_names}")

# Expose only the model inputs and the label, as torch tensors.
torch_columns = ['input_ids', 'attention_mask', 'label']
for tokenized_split in (train_tokenized, test_tokenized):
    tokenized_split.set_format('torch', columns=torch_columns)

print("Dataset formatted for PyTorch!")


## 5. Define Metrics Function


In [None]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy and macro-F1.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        dict with keys ``'accuracy'`` and ``'f1_macro'``.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)

    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'f1_macro': f1_score(labels, predicted_classes, average='macro'),
    }

print("Metrics function defined!")


## 6. Configure Training Arguments


In [None]:
# Training arguments with enhanced progress tracking.
#
# `load_best_model_at_end=True` requires the checkpoint-save strategy to match
# the evaluation strategy (and, for step schedules, `save_steps` to be a
# multiple of `eval_steps`); otherwise TrainingArguments raises a ValueError.
# Evaluation runs every `eval_interval_steps`, so checkpoints are saved on the
# same schedule. `save_total_limit` keeps the extra checkpoints from piling up.
#
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
eval_interval_steps = 500

training_args = TrainingArguments(
    output_dir='./roberta-ag-news',
    num_train_epochs=3,  # Start with 3 epochs, can adjust
    per_device_train_batch_size=16,  # Adjust based on GPU memory
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,  # More frequent logging for better progress tracking
    evaluation_strategy="steps",  # Evaluate every N steps
    eval_steps=eval_interval_steps,
    save_strategy="steps",  # Must match the evaluation strategy (see note above)
    save_steps=eval_interval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    save_total_limit=2,  # Cap on the number of checkpoints kept on disk
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    dataloader_num_workers=2 if torch.cuda.is_available() else 0,
    report_to="none",  # Disable wandb/tensorboard
    logging_first_step=True,  # Log the first step
    prediction_loss_only=False,  # Show more metrics during evaluation
)

# Approximate step counts for display only. This assumes a single device and
# no gradient accumulation. Ceiling division counts a trailing partial batch.
num_train_samples = len(train_tokenized)
train_batch_size = training_args.per_device_train_batch_size
steps_per_epoch = (num_train_samples + train_batch_size - 1) // train_batch_size
total_steps = int(steps_per_epoch * training_args.num_train_epochs)

print("Training arguments configured!")
print(f"Training epochs: {training_args.num_train_epochs}")
print(f"Batch size (train): {training_args.per_device_train_batch_size}")
print(f"Batch size (eval): {training_args.per_device_eval_batch_size}")
print(f"Mixed precision (FP16): {training_args.fp16}")
print(f"Logging steps: {training_args.logging_steps}")
print(f"Evaluation steps: {training_args.eval_steps}")
print(f"Training samples: {num_train_samples:,}")
print(f"Approximate steps per epoch: {steps_per_epoch:,}")
print(f"Total training steps: ~{total_steps:,}")


## 7. Create Trainer and Fine-Tune Model


In [None]:
# Create a custom callback for progress tracking
class ProgressCallback(TrainerCallback):
    """Print human-readable progress while a Hugging Face ``Trainer`` runs.

    Hooks into the train/epoch/log events to report step counts, loss,
    learning rate, evaluation metrics, and elapsed/estimated wall-clock time.
    The module-level ``device`` is only echoed in the start banner.
    """

    def __init__(self):
        # Wall-clock timestamps from time.time(); None until the matching
        # event has fired.
        self.start_time = None
        self.epoch_start_time = None
        self.last_log_time = None
        # 1-based number of the epoch currently running (0 = none started yet).
        self.current_epoch = 0

    @staticmethod
    def _format_metric(value):
        """Return ``value`` formatted to 4 decimals, or ``'N/A'`` if it cannot be."""
        try:
            return f"{value:.4f}"
        except (TypeError, ValueError):
            # Missing metric (None) or a non-numeric placeholder.
            return "N/A"

    def on_train_begin(self, args, state, control, **kwargs):
        """Record the start time and print the training banner."""
        self.start_time = time.time()
        self.last_log_time = self.start_time
        print("\n" + "=" * 80)
        print("TRAINING STARTED")
        print("=" * 80)
        print(f"Total epochs: {int(args.num_train_epochs)}")
        print(f"Total steps: ~{state.max_steps:,}")
        print(f"Device: {device}")
        print(f"Batch size: {args.per_device_train_batch_size}")
        print("=" * 80 + "\n")

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Start the per-epoch timer and print the epoch header."""
        self.epoch_start_time = time.time()
        # Remember the epoch number here: by the time on_epoch_end fires,
        # state.epoch has already advanced, so recomputing it there would be
        # off by one.
        self.current_epoch = int(state.epoch or 0) + 1
        print(f"\n{'='*80}")
        print(f"EPOCH {self.current_epoch}/{int(args.num_train_epochs)}")
        print(f"{'='*80}")

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print a progress line for training logs and a summary for eval logs."""
        if logs is None:
            return

        current_time = time.time()
        # on_log may fire before on_train_begin (e.g. an evaluation that is
        # not preceded by training); fall back to "now" so the arithmetic
        # below never sees None.
        start_time = self.start_time if self.start_time is not None else current_time
        elapsed_total = current_time - start_time

        # Display training progress
        if 'loss' in logs and 'learning_rate' in logs:
            step = state.global_step
            loss = logs['loss']
            lr = logs['learning_rate']

            # Calculate progress percentage
            progress_pct = (step / state.max_steps) * 100 if state.max_steps > 0 else 0

            print(f"Step {step:6d}/{state.max_steps} ({progress_pct:5.1f}%) | "
                  f"Loss: {loss:.4f} | LR: {lr:.2e} | "
                  f"Time: {elapsed_total/60:6.1f}min | "
                  f"Epoch: {state.epoch:.2f}/{args.num_train_epochs}")

            self.last_log_time = current_time

        # Display evaluation results
        if 'eval_loss' in logs:
            previous_log_time = (
                self.last_log_time if self.last_log_time is not None else current_time
            )
            print(f"\n{'─'*80}")
            print(f"EVALUATION RESULTS (Step {state.global_step}):")
            print(f"  Loss: {self._format_metric(logs.get('eval_loss'))}")
            print(f"  Accuracy: {self._format_metric(logs.get('eval_accuracy'))}")
            print(f"  F1-Macro: {self._format_metric(logs.get('eval_f1_macro'))}")
            # Measured as time since the previous log line handled by this
            # callback, so it can include training steps as well.
            eval_time = current_time - previous_log_time
            print(f"  Eval time: {eval_time:.1f}s")
            print(f"{'─'*80}\n")
            self.last_log_time = current_time

    def on_epoch_end(self, args, state, control, **kwargs):
        """Print the epoch duration and a simple remaining-time estimate."""
        now = time.time()
        epoch_time = now - self.epoch_start_time
        total_time = now - self.start_time
        current_epoch = self.current_epoch
        total_epochs = int(args.num_train_epochs)

        print(f"\n{'─'*80}")
        print(f"Epoch {current_epoch} completed!")
        print(f"  Epoch time: {epoch_time/60:.2f} minutes")
        print(f"  Total time: {total_time/60:.2f} minutes")

        # ETA = remaining epochs x average time per epoch so far.
        if 0 < current_epoch < total_epochs:
            remaining_epochs = total_epochs - current_epoch
            avg_epoch_time = total_time / current_epoch
            estimated_remaining = (remaining_epochs * avg_epoch_time) / 60
            print(f"  Estimated time remaining: {estimated_remaining:.2f} minutes")
        print(f"{'─'*80}\n")

    def on_train_end(self, args, state, control, **kwargs):
        """Print the closing banner with total time, steps, and epochs."""
        total_time = time.time() - self.start_time
        print("\n" + "=" * 80)
        print("TRAINING COMPLETED")
        print("=" * 80)
        print(f"Total training time: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")
        print(f"Total steps completed: {state.global_step}")
        print(f"Epochs completed: {state.epoch:.2f}")
        print("=" * 80)

# Assemble the Trainer with early stopping and console progress reporting.
# NOTE(review): the test split doubles as the in-training evaluation set, so
# checkpoint selection and early stopping are driven by test-set metrics.
trainer_callbacks = [
    # Patience is counted in evaluation rounds, not epochs: training stops
    # after 2 consecutive evaluations without an improved best metric.
    EarlyStoppingCallback(early_stopping_patience=2),
    # Custom progress tracker
    ProgressCallback(),
]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    compute_metrics=compute_metrics,
    callbacks=trainer_callbacks,
)

print(
    "Trainer created with progress tracking!",
    "Starting fine-tuning...",
    "Progress will be displayed below:\n",
    sep="\n",
)

# Time the whole fine-tuning run; the callback prints progress as it goes.
training_start_time = time.time()
trainer.train()
training_time = time.time() - training_start_time

banner = "=" * 80
print("\n" + banner)
print(f"Fine-tuning completed!")
print(f"Total time: {training_time:.2f} seconds ({training_time/60:.2f} minutes)")
print(banner)


## 8. Evaluate Model on Test Set


In [None]:
# Run the Trainer's built-in evaluation (its eval dataset is the test split).
print("Evaluating on test set...")
eval_results = trainer.evaluate()

print("\nEvaluation Results:")
print(f"  Accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"  Macro F1-Score: {eval_results['eval_f1_macro']:.4f}")

# Raw predictions are needed for log loss and the confusion matrix.
print("\nGenerating predictions...")
predictions = trainer.predict(test_tokenized)

# Hard labels come from the argmax; probabilities from a softmax over axis 1.
test_logits = predictions.predictions
y_test = predictions.label_ids
y_pred = np.argmax(test_logits, axis=1)
y_pred_proba = torch.softmax(torch.tensor(test_logits), dim=1).numpy()

# Recompute the headline metrics and add log loss.
accuracy = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average='macro')
log_loss_score = log_loss(y_test, y_pred_proba)

print(f"\nDetailed Metrics:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Macro F1-Score: {f1_macro:.4f}")
print(f"  Log Loss: {log_loss_score:.4f}")


## 9. Measure Inference Latency


In [None]:
# Measure inference latency on a reproducible random sample of the test set.
print("Measuring inference latency...")

# Sample up to 1000 test examples. The generator is seeded so that reruns
# time the same rows, and the size is clamped so that sampling without
# replacement cannot fail on a smaller dataset.
num_samples = min(1000, len(test_tokenized))
sample_rng = np.random.default_rng(42)
sample_indices = sample_rng.choice(len(test_tokenized), size=num_samples, replace=False)
sample_data = test_tokenized.select(sample_indices)

# Warm-up: a small untimed prediction run before the timed one.
_ = trainer.predict(sample_data.select(range(min(10, num_samples))))

# Measure inference time with a monotonic, high-resolution clock.
inference_start_time = time.perf_counter()
_ = trainer.predict(sample_data)
inference_time = time.perf_counter() - inference_start_time

# Normalise to 1,000 samples so the figure stays correct if num_samples changes.
inference_latency_per_1k = inference_time / num_samples * 1000

print(f"Inference time for {num_samples} samples: {inference_time:.4f} seconds")
print(f"Inference latency per 1,000 samples: {inference_latency_per_1k:.4f} seconds")


## 10. Save Results


In [None]:
# One-row summary of the RoBERTa run, built from values computed in earlier cells.
roberta_results = {
    'Model': 'RoBERTa-base',
    'Accuracy': accuracy,
    'Macro F1-Score': f1_macro,
    'Log Loss': log_loss_score,
    'Training Time (s)': training_time,
    'Inference Latency per 1k (s)': inference_latency_per_1k,
}
results_df = pd.DataFrame([roberta_results])

banner = "=" * 80
print(banner)
print("ROBERTA MODEL - RESULTS SUMMARY")
print(banner)
print(results_df.to_string(index=False))
print(banner)

# Output locations (relative to the notebook's working directory).
results_path = 'data/roberta_results.csv'
model_dir = 'data/roberta_ag_news_model'

# Make sure the target directory exists before writing the CSV.
os.makedirs('data', exist_ok=True)
results_df.to_csv(results_path, index=False)
print(f"\nResults saved to {results_path}")

# Save the model and tokenizer side by side in the same directory.
print("\nSaving model...")
os.makedirs(model_dir, exist_ok=True)
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)
print(f"Model saved to {model_dir}/")


## 11. Confusion Matrix


In [None]:
# Confusion matrix of true vs. predicted classes on the test set.
cm = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap on an explicit figure/axes pair.
cm_fig, cm_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    cbar_kws={'label': 'Count'},
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix - RoBERTa-base', fontsize=14, fontweight='bold')
cm_ax.set_ylabel('True Label', fontsize=12)
cm_ax.set_xlabel('Predicted Label', fontsize=12)
cm_fig.tight_layout()
cm_fig.savefig('data/roberta_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()
print("Confusion matrix saved to data/roberta_confusion_matrix.png")

# Per-class precision/recall/F1 table.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_labels))


## 12. Compare with Classical Models

Load the classical models' results (if available) and compare them with the RoBERTa results.


In [None]:
# Compare against the classical baselines when their results file is present.
classical_results_path = 'data/classical_models_results.csv'

if not os.path.exists(classical_results_path):
    print("Classical models results not found. Run 02_Classical_Models.ipynb first.")
    print("RoBERTa results saved separately.")
else:
    classical_df = pd.read_csv(classical_results_path)

    # Stack the baseline rows and the RoBERTa row into one table.
    all_results = pd.concat([classical_df, results_df], ignore_index=True)

    banner = "=" * 80
    print(banner)
    print("ALL MODELS - COMPARISON")
    print(banner)
    print(all_results.to_string(index=False))
    print(banner)

    # Save the combined table
    all_results.to_csv('data/all_models_results.csv', index=False)
    print("\nCombined results saved to data/all_models_results.csv")

    # Two side-by-side panels: macro F1 (left) and training time (right).
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    f1_ax, time_ax = axes

    models = all_results['Model'].values
    f1_scores = all_results['Macro F1-Score'].values
    training_times = all_results['Training Time (s)'].values
    bar_colors = ['steelblue', 'coral', 'lightgreen', 'purple']

    # Left panel: macro F1 per model, labelled just above each bar.
    f1_ax.bar(models, f1_scores, color=bar_colors)
    f1_ax.set_title('Macro F1-Score Comparison - All Models', fontsize=14, fontweight='bold')
    f1_ax.set_ylabel('Macro F1-Score', fontsize=12)
    f1_ax.set_ylim([0, 1])
    f1_ax.tick_params(axis='x', rotation=45)
    f1_ax.grid(axis='y', alpha=0.3)
    for bar_index, score in enumerate(f1_scores):
        f1_ax.text(bar_index, score + 0.01, f'{score:.4f}', ha='center', fontweight='bold')

    # Right panel: training time on a log axis, so labels use a multiplicative offset.
    time_ax.bar(models, training_times, color=bar_colors)
    time_ax.set_title('Training Time Comparison - All Models', fontsize=14, fontweight='bold')
    time_ax.set_ylabel('Training Time (seconds)', fontsize=12)
    time_ax.set_yscale('log')
    time_ax.tick_params(axis='x', rotation=45)
    time_ax.grid(axis='y', alpha=0.3)
    for bar_index, seconds in enumerate(training_times):
        time_ax.text(bar_index, seconds * 1.2, f'{seconds:.1f}s', ha='center', fontweight='bold')

    plt.tight_layout()
    plt.savefig('data/all_models_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("Comparison visualization saved to data/all_models_comparison.png")
