# Training Notebook for News Topic Classification

Model: **DistilBERT-base-uncased**

Dataset: **AG News** (120K articles, 4 topics)

Expected Time: **30-45 minutes on T4 GPU**

Expected Accuracy: **93-95%**

Actual Accuracy: **94.82%**

## Setup

In [None]:
# @title Mount to Google Drive

# Mount the user's Google Drive at /content/drive (google.colab is only
# importable inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

# Create directory for this project in your Drive.
# DRIVE_DIR is the root folder under which later cells write checkpoints,
# logs, the final model, the confusion matrix and the reports.
import os
DRIVE_DIR = '/content/drive/MyDrive/NewsTopicClassification' # @param {"type":"string"}
os.makedirs(DRIVE_DIR, exist_ok=True)  # exist_ok=True keeps the cell safe to re-run
print(f"✅ Google Drive mounted!")
print(f"📁 Project directory: {DRIVE_DIR}")

In [None]:
# @title Import packages & check GPU status

# @markdown


!pip install -q transformers datasets accelerate wandb evaluate scikit-learn

import torch
# Check availability first: get_device_name() / get_device_properties() raise on a
# runtime without CUDA, which would abort this cell before the remaining imports run.
cuda_available = torch.cuda.is_available()
if cuda_available:
    print(f"\n🎮 GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("\n⚠️  No GPU detected - in Colab select Runtime > Change runtime type > GPU")
print(f"🔥 CUDA Available: {cuda_available}")

import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import evaluate
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# @title Load dataset

dataset = load_dataset("ag_news")

print(f"✅ Dataset loaded!")
print(f"\n📊 Dataset structure:")
print(dataset)

train_size = len(dataset['train'])
test_size = len(dataset['test'])
print(f"\n🔢 Dataset sizes:")
print(f"  Training: {train_size:,} articles")
print(f"  Test: {test_size:,} articles")

# Topic names in class-id order; both lookup tables are derived from this one
# list so the three structures cannot drift apart.
label_names = ['World', 'Sports', 'Business', 'Sci/Tech']
id2label = dict(enumerate(label_names))
label2id = {name: class_id for class_id, name in id2label.items()}

print(f"\n🏷️  Topics: {label_names}")

# Preview the first three training articles (text cut to 100 characters)
print("\n📰 Sample articles:")
for rank in range(1, 4):
    sample = dataset['train'][rank - 1]
    print(f"\n{rank}. Topic: {label_names[sample['label']]}")
    print(f"   Text: {sample['text'][:100]}...")

# Check class distribution
from collections import Counter

# Read the label column directly. Iterating over dataset['train'] row by row
# builds a full example dict (including the article text) for each of the
# training rows just to read one integer.
train_labels = list(dataset['train']['label'])
label_counts = Counter(train_labels)

print(f"\n📊 Class distribution (training set):")
total_train = len(train_labels)
for label_id, count in sorted(label_counts.items()):
    percentage = (count / total_train) * 100
    print(f"  {label_names[label_id]}: {count:,} ({percentage:.1f}%)")

In [None]:
# @title Load DistilBERT tokenizer

# Checkpoint identifier shared by the tokenizer here and the model loaded in a
# later cell, so both come from the same pretrained checkpoint.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"✅ Tokenizer loaded: {model_name}")
print(f"📖 Vocabulary size: {tokenizer.vocab_size:,} tokens")

# Test tokenization: run one headline through the tokenizer and print both the
# token strings and the integer ids so the preprocessing can be checked by eye.
test_text = "Apple announces new iPhone with improved camera"
tokens = tokenizer.tokenize(test_text)
token_ids = tokenizer.encode(test_text)

print(f"\n🧪 Tokenization test:")
print(f"  Original: {test_text}")
print(f"  Tokens: {tokens}")
print(f"  Token IDs: {token_ids}")

# Tokenization function
def tokenize_function(examples):
    """Encode a batch of articles with the notebook-level ``tokenizer``.

    Args:
        examples: A batch from ``Dataset.map(batched=True)``; only its
            ``"text"`` entry is read.

    Returns:
        Whatever ``tokenizer(...)`` returns for that text, with each sequence
        truncated and padded to exactly 256 tokens.

    NOTE(review): padding to the fixed maximum happens here, at map time, for
    every example. Padding per batch in a data collator would presumably be
    cheaper for short news snippets - worth measuring before changing.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,  # News articles/titles are usually short
    }
    return tokenizer(examples["text"], **encoding_options)

print(f"\n⚙️  Tokenizing entire dataset...")

# Tokenize every split in batches using two worker processes, drop the raw
# "text" column once it has been encoded, and rename "label" to "labels".
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=2,
    remove_columns=["text"],
).rename_column("label", "labels")

# Fetch the first training row once and report both sequence lengths from it
first_train_row = tokenized_datasets['train'][0]

print(f"✅ Tokenization complete!")
print(f"   Example tokenized sample:")
print(f"   Input IDs shape: {len(first_train_row['input_ids'])}")
print(f"   Attention mask shape: {len(first_train_row['attention_mask'])}")

In [None]:
# @title Load pre-trained model with classification head

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,  # one output per topic category
    id2label=id2label,
    label2id=label2id,
)

print(f"✅ Model loaded!")

# Walk the parameters once, tallying every element and the trainable subset
total_params = 0
trainable_params = 0
for param in model.parameters():
    element_count = param.numel()
    total_params += element_count
    if param.requires_grad:
        trainable_params += element_count

# The size estimate below counts 4 bytes per parameter
print(f"\n📊 Model statistics:")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")
print(f"  Model size: ~{total_params * 4 / (1024**2):.1f} MB")

In [None]:
# @title Setup evaluation metrics

# @markdown compute_metrics(predictions)

# Metric objects are loaded once here and reused on every evaluation pass
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits along the last axis.

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        The last three are computed with ``average='weighted'``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    scores = {
        'accuracy': accuracy_metric.compute(
            predictions=predicted_ids, references=labels
        )['accuracy']
    }

    # These three metrics share one call signature and report under their own name
    weighted_metrics = (
        ('f1', f1_metric),
        ('precision', precision_metric),
        ('recall', recall_metric),
    )
    for key, metric in weighted_metrics:
        outcome = metric.compute(
            predictions=predicted_ids, references=labels, average='weighted'
        )
        scores[key] = outcome[key]

    return scores

print("✅ Metrics defined: accuracy, F1, precision, recall")

In [None]:
# @title Training configurations

# Hyperparameters and bookkeeping for the fine-tuning run. Checkpoints and logs
# are written under DRIVE_DIR so they outlive the Colab session.
training_args = TrainingArguments(
    # Output directory - SAVE TO GOOGLE DRIVE!
    output_dir=f"{DRIVE_DIR}/training_checkpoints",

    # Training duration
    num_train_epochs=3,

    # Batch sizes (adjust if you get OOM errors)
    per_device_train_batch_size=32,   # For T4: 32 works well
    per_device_eval_batch_size=64,    # Evaluation can use larger batch

    # Optimizer settings
    learning_rate=2e-5,
    weight_decay=0.01,  # L2 regularization

    # Learning rate schedule
    warmup_steps=500,
    lr_scheduler_type="linear",

    # Evaluation strategy
    # Evaluation and checkpointing run on the same 500-step cadence; at the end
    # the checkpoint with the highest "accuracy" is reloaded.
    eval_strategy="steps",
    eval_steps=500,  # Evaluate every 500 steps
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    greater_is_better=True,

    # Performance optimizations
    # NOTE(review): fp16 presumably requires a CUDA runtime - confirm before
    # running this cell on CPU.
    fp16=True,  # Mixed precision training (2x faster!)
    dataloader_num_workers=2,

    # Logging - SAVE TO GOOGLE DRIVE!
    logging_dir=f"{DRIVE_DIR}/logs",
    logging_steps=100,
    report_to="wandb",  # Comment out if not using wandb

    # Misc
    save_total_limit=2,  # Keep only 2 checkpoints (save space)
    seed=42,  # Reproducibility
    push_to_hub=False,  # Set True to upload to HuggingFace
)

# Echo the key settings back from the TrainingArguments object itself
print("✅ Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Train batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Mixed precision: {training_args.fp16}")

print("\n🏋️  Creating trainer...")

# NOTE(review): the "test" split is passed as eval_dataset, and training_args
# sets load_best_model_at_end=True with metric_for_best_model="accuracy". The
# checkpoint that is kept is therefore chosen using test-set accuracy, so the
# reported test score is optimistic. Consider carving a validation split out of
# "train" for checkpoint selection.
# NOTE(review): newer transformers releases reportedly rename the `tokenizer`
# argument to `processing_class` - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# @title Training and Evaluating

banner = "=" * 70

print("\n" + banner)
print("🚀 STARTING TRAINING")
print(banner)
print("\n⏱️  Expected time: 30-45 minutes on T4 GPU")
print("📊 Watch progress in real-time at: https://wandb.ai/\n")

# Run the fine-tuning loop; the returned object carries the run-level metrics
train_result = trainer.train()

print("\n" + banner)
print("✅ TRAINING COMPLETE!")
print(banner)

# Summarise runtime, throughput and loss from the training metrics
train_metrics = train_result.metrics
print("\n📊 Training Summary:")
print(f"  Total training time: {train_metrics['train_runtime']:.1f} seconds")
print(f"  Samples per second: {train_metrics['train_samples_per_second']:.1f}")
print(f"  Training loss: {train_metrics['train_loss']:.4f}")

print("\n📊 Evaluating on test set...")

# Score the model on the trainer's eval_dataset
eval_results = trainer.evaluate()

test_accuracy = eval_results['eval_accuracy']
print("\n" + banner)
print("📈 FINAL RESULTS")
print(banner)
print(f"\n✨ Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"✨ F1 Score: {eval_results['eval_f1']:.4f}")
print(f"✨ Precision: {eval_results['eval_precision']:.4f}")
print(f"✨ Recall: {eval_results['eval_recall']:.4f}")

## Model evaluation and summary

In [None]:
# @title Saving model

# Save final model to Google Drive (weights/config via the trainer, plus the
# tokenizer files, so the folder can be loaded on its own later)
model_save_path = f"{DRIVE_DIR}/final_model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"✅ Model saved to Google Drive: {model_save_path}")
print(f"\n📁 Your Google Drive now contains:")
print(f"  {DRIVE_DIR}/")
print(f"  ├── final_model/           ← Your trained model (USE THIS!)")
print(f"  ├── training_checkpoints/  ← Intermediate checkpoints")
print(f"  └── logs/                  ← Training logs")

# Also save to temporary location for quick testing
temp_save_path = "./final_model_temp"
trainer.save_model(temp_save_path)
tokenizer.save_pretrained(temp_save_path)
print(f"\n💡 Temporary copy also saved to: {temp_save_path}")
print(f"   (This will be deleted when session ends)")

print("\n📦 Creating downloadable zip file (optional backup)...")
# Build the archive with the standard library instead of `!zip ... {path}`:
# the shell form interpolates the path unquoted, so it breaks when DRIVE_DIR
# contains spaces (e.g. ".../My Drive/..."), and it stores the absolute
# directory layout inside the archive. Here the archive is written to
# ./final_model.zip and contains a single top-level "final_model/" folder.
import shutil
shutil.make_archive("final_model", "zip", root_dir=DRIVE_DIR, base_dir="final_model")

print("✅ Model zipped! You can download 'final_model.zip' from the file browser")
print("   (But it's already safe in your Google Drive!)")

In [None]:
# @title Generate analysis

# Get predictions
predictions = trainer.predict(tokenized_datasets["test"])
y_pred = np.argmax(predictions.predictions, axis=-1)

# Use the references returned alongside the predictions so y_true is a NumPy
# array aligned row-for-row with y_pred; fall back to the dataset column only
# if the prediction output carries no labels.
reference_labels = predictions.label_ids
if reference_labels is None:
    reference_labels = tokenized_datasets["test"]["labels"]
y_true = np.asarray(reference_labels)

# Classification report (per-class precision / recall / F1, four decimals)
print("\n📋 Classification Report:")
print("="*70)
report = classification_report(
    y_true,
    y_pred,
    target_names=label_names,
    digits=4
)
print(report)

# Confusion matrix
print("\n🔢 Confusion Matrix:")
cm = confusion_matrix(y_true, y_pred)
print(cm)

# Draw the confusion matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_names,
    yticklabels=label_names,
    ax=ax,
)
ax.set_title('Confusion Matrix - Topic Classification', fontsize=14, fontweight='bold')
ax.set_ylabel('True Label', fontsize=12)
ax.set_xlabel('Predicted Label', fontsize=12)
fig.tight_layout()

# Write the image to the working directory before displaying it
fig.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
print("✅ Confusion matrix saved as 'confusion_matrix.png'")
plt.show()

# Per-class accuracy: diagonal entry (correct) over the row sum (all true members)
print("\n📊 Per-class Accuracy:")
correct_per_class = cm.diagonal()
total_per_class = cm.sum(axis=1)
for i, label in enumerate(label_names):
    correct = correct_per_class[i]
    total = total_per_class[i]
    accuracy = correct / total * 100
    print(f"  {label:12s}: {correct:5d}/{total:5d} = {accuracy:5.2f}%")

# Find misclassified examples
print("\n❌ Sample Misclassifications:")
# Coerce both sides to arrays so `!=` is always an elementwise comparison,
# whatever container type y_true happens to be.
mismatch_mask = np.asarray(y_pred) != np.asarray(y_true)
misclassified_indices = np.flatnonzero(mismatch_mask)[:5]  # First 5 errors

for idx in misclassified_indices:
    # Convert the NumPy integer to a built-in int before indexing (presumably
    # because the dataset lookup expects a plain int - the original did the same).
    # Row idx of the tokenized test split is row idx of dataset['test'], so the
    # same index retrieves the original article text.
    idx = int(idx)
    true_label = label_names[y_true[idx]]
    pred_label = label_names[y_pred[idx]]
    text = dataset['test'][idx]['text'][:150]

    print(f"\n  Text: {text}...")
    print(f"  True: {true_label} | Predicted: {pred_label}")

# Save visualizations to Drive: the PNG was written to the session's working
# directory above, so copy it under DRIVE_DIR as well.
import shutil
shutil.copy('confusion_matrix.png', f"{DRIVE_DIR}/confusion_matrix.png")
print(f"\n✅ Confusion matrix also saved to Google Drive")

In [None]:
# @title Testing on custom examples

def predict_topic(text):
    """Predict the news topic of a single piece of text.

    Uses the notebook-level ``model``, ``tokenizer`` and ``label_names``.

    Args:
        text: The article or headline to classify (a single string).

    Returns:
        dict with
        ``topic`` - name of the highest-scoring class,
        ``confidence`` - softmax probability of that class,
        ``all_scores`` - mapping of every name in ``label_names`` to its probability.
    """
    # Inference must not run with dropout active; make that explicit instead of
    # relying on an earlier evaluate()/predict() call having switched modes.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    # Move every input tensor to the device the model lives on
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        # Single input, so take row 0 of the (1, num_classes) probability matrix
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]

    predicted_class = torch.argmax(probabilities, dim=-1).item()

    return {
        'topic': label_names[predicted_class],
        'confidence': probabilities[predicted_class].item(),
        # Iterate over label_names rather than a hard-coded range(4)
        'all_scores': {
            name: probabilities[class_id].item()
            for class_id, name in enumerate(label_names)
        },
    }

# Hand-written headlines for a quick check of the fine-tuned model
test_examples = [
    "Apple announces new iPhone 15 with improved camera and battery life",
    "Lakers defeat Warriors 120-110 in overtime thriller",
    "Stock market reaches all-time high as tech stocks surge",
    "Scientists discover new planet in habitable zone",
    "Tesla reports record quarterly earnings, stock jumps 15%",
    "World leaders meet for climate summit in Paris"
]

divider = "=" * 70
print("\n" + divider)
print("🎯 PREDICTIONS ON CUSTOM EXAMPLES")
print(divider)

for position, headline in enumerate(test_examples, 1):
    result = predict_topic(headline)
    # Build the whole per-class score line up front, then print it in one call
    score_line = "".join(
        f"{topic}: {score:.3f}  " for topic, score in result['all_scores'].items()
    )
    print(f"\n{position}. {headline}")
    print(f"   → {result['topic']} ({result['confidence']:.2%} confidence)")
    print(f"   Scores: " + score_line)

In [None]:
# @title Create results summary
# Assemble the run summary from three groups: data/model identity,
# hyperparameters read back from training_args, and the final test metrics.
hyperparameters = {
    'epochs': training_args.num_train_epochs,
    'batch_size': training_args.per_device_train_batch_size,
    'learning_rate': training_args.learning_rate,
}
test_metrics = {
    'accuracy': eval_results['eval_accuracy'],
    'f1_score': eval_results['eval_f1'],
    'precision': eval_results['eval_precision'],
    'recall': eval_results['eval_recall'],
}
summary = {
    'model': model_name,
    'dataset': 'AG News',
    'train_samples': len(dataset['train']),
    'test_samples': len(dataset['test']),
    **hyperparameters,
    **test_metrics,
    'training_time_seconds': train_result.metrics['train_runtime'],
}

# Serialise to indented JSON and store it next to the model on Google Drive
import json
summary_json = json.dumps(summary, indent=2)
with open(f'{DRIVE_DIR}/training_summary.json', 'w') as f:
    f.write(summary_json)

print("\n📄 Results summary saved to Google Drive: training_summary.json")

# Save as markdown report to Google Drive.
# The classification report is a fixed-width text table, so it is wrapped in a
# fenced code block; unfenced, Markdown collapses the whitespace and the columns
# no longer line up.
report_md = f"""# Topic Classification Model - Training Report

## Model Information
- **Model**: {summary['model']}
- **Dataset**: {summary['dataset']}
- **Training Samples**: {summary['train_samples']:,}
- **Test Samples**: {summary['test_samples']:,}

## Training Configuration
- **Epochs**: {summary['epochs']}
- **Batch Size**: {summary['batch_size']}
- **Learning Rate**: {summary['learning_rate']}
- **Training Time**: {summary['training_time_seconds']:.1f} seconds ({summary['training_time_seconds']/60:.1f} minutes)

## Results
- **Accuracy**: {summary['accuracy']:.4f} ({summary['accuracy']*100:.2f}%)
- **F1 Score**: {summary['f1_score']:.4f}
- **Precision**: {summary['precision']:.4f}
- **Recall**: {summary['recall']:.4f}

## Per-Class Performance
```text
{report}
```

## Files Saved to Google Drive
- `{DRIVE_DIR}/final_model/` - Trained model (use this!)
- `{DRIVE_DIR}/training_checkpoints/` - Training checkpoints
- `{DRIVE_DIR}/logs/` - Training logs
- `{DRIVE_DIR}/confusion_matrix.png` - Visualization
- `{DRIVE_DIR}/training_summary.json` - Metrics
- `{DRIVE_DIR}/training_report.md` - This report

## Next Steps
1. ✅ Model trained and evaluated
2. ✅ Saved to Google Drive (permanent!)
3. 🔄 Integrate into news recommender pipeline
4. 🚀 Deploy for production use
"""

# The report contains emoji, so write it as UTF-8 explicitly instead of relying
# on the platform's default text encoding.
with open(f'{DRIVE_DIR}/training_report.md', 'w', encoding='utf-8') as f:
    f.write(report_md)

print("✅ Detailed report saved to Google Drive: training_report.md")

print("\n" + "="*70)
print("🎉 ALL DONE!")
print("="*70)

# Describe where the files live as the user sees it in the Drive UI. The
# instructions previously named a "news_recommender" folder that this notebook
# never creates; derive the location from DRIVE_DIR instead. Paths outside the
# mounted "MyDrive" root are shown verbatim.
_drive_root = '/content/drive/MyDrive/'
if DRIVE_DIR.startswith(_drive_root):
    _folders = [part for part in DRIVE_DIR[len(_drive_root):].split('/') if part]
    drive_location = ' > '.join(['My Drive'] + _folders)
else:
    drive_location = DRIVE_DIR

print(f"""
📊 Final Summary:
  ✨ Model: DistilBERT-base-uncased
  ✨ Accuracy: {eval_results['eval_accuracy']*100:.2f}%
  ✨ F1 Score: {eval_results['eval_f1']:.4f}
  ✨ Training Time: {train_result.metrics['train_runtime']/60:.1f} minutes

📁 All Files Saved to Google Drive:
  ✅ {DRIVE_DIR}/final_model/
  ✅ {DRIVE_DIR}/training_checkpoints/
  ✅ {DRIVE_DIR}/logs/
  ✅ {DRIVE_DIR}/confusion_matrix.png
  ✅ {DRIVE_DIR}/training_summary.json
  ✅ {DRIVE_DIR}/training_report.md

💡 How to Access Your Files:
  1. Open Google Drive (drive.google.com)
  2. Navigate to: {drive_location}
  3. Download the 'final_model' folder
  4. Use it in your project!
""")

# Close wandb
# wandb.finish()

## Inference

In [None]:
# ============================================================================
# QUICK REFERENCE: HOW TO LOAD THIS MODEL LATER
# ============================================================================
# The snippet below lives inside a string literal, so none of it executes in
# this notebook; copy it into the consuming service.
# NOTE(review): the snippet loads from "./final_model", whereas this notebook
# saves the model to f"{DRIVE_DIR}/final_model" and "./final_model_temp" - copy
# or unzip the Drive folder next to the service before using the path as written.
"""
# To load this model in your FastAPI backend:

from transformers import pipeline

# Load from saved directory
classifier = pipeline(
    "text-classification",
    model="./final_model",
    device=0  # Use GPU if available, -1 for CPU
)

# Classify an article
result = classifier("Apple announces new iPhone")[0]
print(result)
# Output: {'label': 'Sci/Tech', 'score': 0.96}

# Batch classification (faster!)
articles = [
    "Lakers win championship",
    "Stock market hits record high",
    "New AI breakthrough announced"
]
results = classifier(articles)
for article, result in zip(articles, results):
    print(f"{article[:30]}... → {result['label']} ({result['score']:.2f})")
"""