# Lab 2.4.4 Solutions: Trainer Fine-tuning

This notebook contains solutions to the exercises in the Trainer Fine-tuning notebook.

In [None]:
# Setup
import torch
import numpy as np
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
import evaluate
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Report the runtime environment (framework version and GPU availability).
for env_label, env_value in (
    ("PyTorch", torch.__version__),
    ("CUDA available", torch.cuda.is_available()),
):
    print(f"{env_label}: {env_value}")

## Exercise Solution: Train on AG News

Fine-tune a model on the AG News dataset (news category classification):
1. Load the `ag_news` dataset
2. Create appropriate splits
3. Tokenize with a model of your choice
4. Configure TrainingArguments
5. Train and evaluate

In [None]:
# Step 1: Load AG News dataset
print("Loading AG News dataset...")
ag_news = load_dataset("ag_news")

# Use subset for faster training in this demo:
# shuffle each split with a fixed seed, then keep the first N rows.
shuffled_train = ag_news['train'].shuffle(seed=42)
shuffled_test = ag_news['test'].shuffle(seed=42)
small_train = shuffled_train.select(range(8000))
small_test = shuffled_test.select(range(2000))

for split_title, split_data in (("Train", small_train), ("Test", small_test)):
    print(f"{split_title} samples: {len(split_data)}")

# Print the label id -> category legend
print("\nLabels:")
for label_id, category in enumerate(["World", "Sports", "Business", "Sci/Tech"]):
    print(f"  {label_id} = {category}")

In [None]:
# Step 2: Create train/validation split
# Hold out 10% of the training subset for validation, keeping the
# per-class proportions the same in both parts (stratified on 'label').
train_val = small_train.train_test_split(test_size=0.1, seed=42, stratify_by_column='label')

# train_test_split names its held-out part 'test'; here it serves as validation.
ag_news_splits = {
    'train': train_val['train'],
    'validation': train_val['test'],
    'test': small_test,
}
dataset = DatasetDict(ag_news_splits)

for split_key in ('train', 'validation', 'test'):
    print(f"{split_key.capitalize()}: {len(dataset[split_key])}")

In [None]:
# Step 3: Load model and tokenizer for 4-class classification
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Human-readable category names, indexed by label id.
ag_news_categories = ["World", "Sports", "Business", "Sci/Tech"]
ag_news_id2label = dict(enumerate(ag_news_categories))
ag_news_label2id = {category: idx for idx, category in ag_news_id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(ag_news_categories),
    id2label=ag_news_id2label,
    label2id=ag_news_label2id,
)

print(f"Model loaded with {model.num_parameters():,} parameters")

In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping with a 'text' entry (a batch passed in by
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for that text, truncated and padded
        to a fixed length of 128 tokens.
    """
    # AG News articles are relatively short, so 128 tokens is enough.
    encode_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
    }
    return tokenizer(examples['text'], **encode_options)

print("Tokenizing...")
# Tokenize every split in batches of 1000 rows across 4 worker processes,
# dropping the raw 'text' column once it has been encoded.
map_options = dict(
    batched=True,
    batch_size=1000,
    num_proc=4,
    remove_columns=['text'],
)
tokenized_with_label = dataset.map(tokenize_function, **map_options)

# Rename label to labels
tokenized = tokenized_with_label.rename_column('label', 'labels')

print(f"Columns: {tokenized['train'].column_names}")

In [None]:
# Step 4: Configure TrainingArguments

# Load the metric objects used by compute_metrics (accuracy first, then F1).
accuracy, f1 = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))

def compute_metrics(eval_pred):
    """Score a batch of predictions with accuracy, macro F1 and weighted F1.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; the predicted class is
            taken as the argmax of ``predictions`` along axis 1.

    Returns:
        dict with keys 'accuracy', 'f1_macro' and 'f1_weighted'.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    def _f1_with(average):
        # Same F1 metric, only the averaging mode differs.
        return f1.compute(
            predictions=predicted_ids, references=references, average=average
        )['f1']

    accuracy_value = accuracy.compute(
        predictions=predicted_ids, references=references
    )['accuracy']

    return {
        'accuracy': accuracy_value,
        'f1_macro': _f1_with('macro'),
        'f1_weighted': _f1_with('weighted'),
    }

# Mixed precision: having CUDA is not enough for bf16 -- the GPU itself must
# support it, otherwise TrainingArguments refuses bf16=True. Use bf16 where
# supported, fall back to fp16 on other GPUs, and train in fp32 on CPU.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

training_args = TrainingArguments(
    output_dir="./ag_news_results",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best checkpoint
    # (by macro F1) can be restored when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    bf16=use_bf16,
    fp16=use_fp16,
    logging_steps=50,
    report_to="none",
    save_total_limit=2
)

print("TrainingArguments configured!")

In [None]:
# Step 5: Create Trainer and train
# NOTE(review): with num_train_epochs=3, one evaluation per epoch and
# early_stopping_patience=2, the callback looks like it can only trigger at
# the final evaluation, i.e. it never shortens this run -- confirm whether a
# smaller patience or more epochs was intended.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

print("Training...")
train_result = trainer.train()

# Summarize the run: wall-clock time and the reported training loss.
train_runtime_s = train_result.metrics['train_runtime']
train_loss = train_result.training_loss
print("\nTraining complete!")
print(f"Time: {train_runtime_s:.1f}s")
print(f"Loss: {train_loss:.4f}")

In [None]:
# Evaluate on test set
print("Evaluating on test set...")
test_results = trainer.evaluate(tokenized['test'])

# Only float-valued entries are printed; anything else is skipped.
print("\nTest Results:")
float_metrics = {
    name: score for name, score in test_results.items() if isinstance(score, float)
}
for name, score in float_metrics.items():
    print(f"  {name}: {score:.4f}")

In [None]:
# Detailed classification report
from sklearn.metrics import classification_report, confusion_matrix

# Run the model over the test split and reduce the scores to class ids.
predictions = trainer.predict(tokenized['test'])
true_labels = predictions.label_ids
pred_labels = np.argmax(predictions.predictions, axis=1)

# Per-class precision / recall / F1, labelled with the category names.
category_names = ["World", "Sports", "Business", "Sci/Tech"]
print("Classification Report:")
report_text = classification_report(true_labels, pred_labels, target_names=category_names)
print(report_text)

# Counts of true class vs. predicted class.
print("\nConfusion Matrix:")
cm = confusion_matrix(true_labels, pred_labels)
print(cm)

In [None]:
# Test on new examples: one hand-written sentence per category,
# paired by position with the category it should be classified as.
test_texts = [
    "The president announced new trade agreements with European nations today.",
    "The Lakers won the championship game in overtime with a buzzer-beater.",
    "Apple stock surged 10% after announcing record quarterly earnings.",
    "Scientists discover new exoplanet that may contain water."
]
expected = ["World", "Sports", "Business", "Sci/Tech"]

print("Testing on new examples:\n")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

for text, exp in zip(test_texts, expected):
    # Encode a single sentence and move its tensors to the model's device.
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    # Class probabilities for the single input row; pick the most likely one.
    probs = torch.softmax(logits, dim=1)
    pred = torch.argmax(probs, dim=1).item()
    conf = probs[0][pred].item()

    pred_label = model.config.id2label[pred]
    if pred_label == exp:
        status = "✓"
    else:
        status = "✗"

    print(f"{status} '{text[:50]}...'")
    print(f"   Expected: {exp}, Predicted: {pred_label} ({conf:.2%})\n")

## Challenge Solution: Multi-class Emotion Detection

Train an emotion classifier using the `emotion` dataset (6 emotions: sadness, joy, love, anger, fear, surprise). The target is >90% accuracy on the test split.

In [None]:
# Load emotion dataset
print("Loading emotion dataset...")
emotion = load_dataset("emotion")

for split_key in ("train", "test"):
    print(f"{split_key.capitalize()}: {len(emotion[split_key])}")

# Emotions: 0=sadness, 1=joy, 2=love, 3=anger, 4=fear, 5=surprise
# (list position is the label id)
emotion_labels = "sadness joy love anger fear surprise".split()
print(f"\nEmotion labels: {emotion_labels}")

In [None]:
# Create splits and prepare model
# Stratify the held-out 10% on the label column, exactly as the AG News split
# does, so every emotion keeps its share in both train and validation.
train_val = emotion['train'].train_test_split(
    test_size=0.1,
    seed=42,
    stratify_by_column='label'
)
emotion_dataset = DatasetDict({
    'train': train_val['train'],
    'validation': train_val['test'],
    'test': emotion['test']
})

# Load fresh model for 6 classes.
# Reuse `model_name` so the checkpoint always matches the tokenizer that
# tokenize_function uses, and derive the class count from the label list.
emotion_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotion_labels),
    id2label=dict(enumerate(emotion_labels)),
    label2id={label: i for i, label in enumerate(emotion_labels)}
)

# Tokenize (same tokenizer settings as AG News), dropping the raw text,
# then rename the target column from 'label' to 'labels'.
emotion_tokenized = emotion_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text']
)
emotion_tokenized = emotion_tokenized.rename_column('label', 'labels')

In [None]:
# Train emotion classifier

# Mixed precision: having CUDA is not enough for bf16 -- the GPU itself must
# support it, otherwise TrainingArguments refuses bf16=True. Use bf16 where
# supported, fall back to fp16 on other GPUs, and train in fp32 on CPU.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

emotion_args = TrainingArguments(
    output_dir="./emotion_results",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    # Evaluate and checkpoint once per epoch so the most accurate
    # checkpoint can be restored when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    bf16=use_bf16,
    fp16=use_fp16,
    report_to="none",
    save_total_limit=1
)

emotion_trainer = Trainer(
    model=emotion_model,
    args=emotion_args,
    train_dataset=emotion_tokenized['train'],
    eval_dataset=emotion_tokenized['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Training emotion classifier...")
emotion_trainer.train()

In [None]:
# Evaluate emotion classifier
emotion_results = emotion_trainer.evaluate(emotion_tokenized['test'])

print("\nEmotion Classification Results:")
for metric_name, metric_value in emotion_results.items():
    # Only float-valued entries are reported.
    if not isinstance(metric_value, float):
        continue
    print(f"  {metric_name}: {metric_value:.4f}")

# Check if we achieved >90% accuracy
challenge_passed = emotion_results['eval_accuracy'] > 0.90
if not challenge_passed:
    print(f"\n✗ Accuracy is {emotion_results['eval_accuracy']:.1%}, target is >90%")
    print("  Try: more epochs, different learning rate, or larger model")
else:
    print("\n✓ Challenge complete! Achieved >90% accuracy!")

In [None]:
# Cleanup
import gc
import os  # needed for os.path.exists below; not imported anywhere else
import shutil

# Remove the checkpoint directories written by the two Trainer runs.
for path in ["./ag_news_results", "./emotion_results"]:
    if os.path.exists(path):
        shutil.rmtree(path)

# Drop the references to the models and trainers, then collect garbage
# and release cached GPU memory.
del model, emotion_model, trainer, emotion_trainer
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("Cleanup complete!")

## Summary

In this solution notebook, we demonstrated:

1. **AG News Classification**:
   - 4-class news topic classification
   - Custom metrics (macro/weighted F1)
   - Detailed evaluation with classification report

2. **Emotion Detection**:
   - 6-class emotion classification
   - Target accuracy >90%

Key learnings:
- Configure `num_labels` for multi-class
- Use appropriate F1 averaging for multi-class
- Early stopping, combined with `load_best_model_at_end=True`, helps guard against overfitting
- Classification reports help identify weak classes