# AG News Text Classification - Google Colab Quick Start

This notebook provides a complete quick start guide for AG News classification in Google Colab.

**Educational Approach**: Progressive disclosure from simple to complex operations
- Reference: Wing (2006) - "Computational Thinking"
- Reference: Guzdial (2015) - "Learner-Centered Design of Computing Education"

## 1. Environment Setup

First, we set up the Colab environment and install the necessary dependencies.

In [None]:
# Check GPU availability
import torch
import sys

# Report interpreter and framework versions, then whether CUDA is usable
print(
    f"Python version: {sys.version}",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

# Device name and total memory are only queried when a GPU is attached
if torch.cuda.is_available():
    print(
        f"CUDA device: {torch.cuda.get_device_name(0)}",
        f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB",
        sep="\n",
    )

In [None]:
# Clone repository
!git clone https://github.com/yourusername/ag-news-text-classification.git
%cd ag-news-text-classification

In [None]:
# Install dependencies
!pip install -q -r requirements/minimal.txt
print("Dependencies installed successfully!")

## 2. Data Preparation

Download and prepare the AG News dataset.

In [None]:
# Download AG News data
# Runs the repository's download script, passing `--dataset ag_news`.
!python scripts/setup/download_all_data.py --dataset ag_news

# Prepare data splits
# NOTE(review): presumably this writes the train/validation/test CSVs under
# data/processed that the next cell reads — confirm against the script.
!python scripts/data_preparation/prepare_ag_news.py

In [None]:
# Load and explore data
import pandas as pd
from pathlib import Path

# Read the three prepared splits
data_dir = Path("data/processed")
train_df = pd.read_csv(data_dir / "train.csv")
val_df = pd.read_csv(data_dir / "validation.csv")
test_df = pd.read_csv(data_dir / "test.csv")

# Fail fast on schema problems in the splits used for training/evaluation:
# later cells read their "text" and "label" columns, and the classifier is
# created with num_labels=4, so labels must be the class ids 0..3. Without
# this check an out-of-range label only surfaces as an opaque error in the
# middle of training.
for split_name, split_df in (("train", train_df), ("validation", val_df)):
    missing_columns = {"text", "label"} - set(split_df.columns)
    assert not missing_columns, (
        f"{split_name} split is missing columns: {sorted(missing_columns)}"
    )
    assert split_df["label"].isin([0, 1, 2, 3]).all(), (
        f"{split_name} split has labels outside 0..3: "
        f"{split_df['label'].unique().tolist()}"
    )

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")
print("\nLabel distribution:")
print(train_df['label'].value_counts())
print("\nSample data:")
# Keep the frame as the cell's last expression so it renders as a rich table
train_df.head()

## 3. Quick Model Training

Train a simple DistilBERT model for quick results.

In [None]:
# Import required modules
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from tqdm.notebook import tqdm
import numpy as np

In [None]:
# Simple dataset class
class AGNewsDataset(Dataset):
    """Map-style dataset that tokenizes one example per lookup.

    Args:
        texts: Indexable collection of raw texts.
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")``; its
            result must provide ``"input_ids"`` and ``"attention_mask"``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            dict with flattened ``"input_ids"`` and ``"attention_mask"``
            tensors plus a ``torch.long`` ``"labels"`` tensor.
        """
        raw_text, class_id = str(self.texts[idx]), self.labels[idx]

        tokenized = self.tokenizer(
            raw_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # The tokenizer output is flattened so each field is 1-D per example
        sample = {
            field: tokenized[field].flatten()
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(class_id, dtype=torch.long)
        return sample

In [None]:
# Initialize tokenizer and a 4-class sequence classifier from the same checkpoint
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prefer the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
).to(device)

print(f"Model loaded: {model_name}", f"Device: {device}", sep="\n")

In [None]:
# Wrap the train and validation frames in datasets that share one tokenizer
train_dataset, val_dataset = (
    AGNewsDataset(frame["text"].values, frame["label"].values, tokenizer)
    for frame in (train_df, val_df)
)

# Training batches are shuffled; validation keeps file order with a larger batch
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)

In [None]:
# Training loop
# AdamW comes from torch.optim: the copy that used to live in `transformers`
# was deprecated and later removed, so importing it from there fails on
# current releases.
from transformers import get_linear_schedule_with_warmup

# Setup optimizer
# weight_decay=0.0 mirrors the default of the removed transformers optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)
num_epochs = 2
total_steps = len(train_loader) * num_epochs
# Warm up linearly over the first 10% of steps, then decay linearly
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

# Training
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    # Re-enter training mode every epoch so the loop stays correct even if an
    # evaluation pass (which switches to eval mode) ran in between
    model.train()

    total_loss = 0.0
    progress_bar = tqdm(train_loader, desc="Training")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}

        # Clear gradients first so nothing stale (e.g. from an interrupted
        # earlier run of this cell) leaks into the first step
        optimizer.zero_grad()

        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Read the scalar once and reuse it for the running total and the bar
        step_loss = loss.item()
        total_loss += step_loss
        progress_bar.set_postfix({"loss": step_loss})

    avg_loss = total_loss / len(train_loader)
    print(f"Average training loss: {avg_loss:.4f}")

## 4. Model Evaluation

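Evaluate the trained model on the validation set. The test split loaded earlier can be scored the same way by wrapping it in a `DataLoader` and passing it to `evaluate_model`.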

In [None]:
# Evaluation function
from sklearn.metrics import accuracy_score, classification_report

def evaluate_model(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and score its argmax predictions.

    Args:
        model: Callable model; invoked as ``model(**batch)`` and expected to
            return an object with a ``logits`` attribute.
        dataloader: Iterable of dict batches whose values are tensors and
            which include a ``"labels"`` entry.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, predictions, labels)`` where the last two are flat
        lists collected across all batches.
    """
    predicted, expected = [], []

    model.eval()
    with torch.no_grad():
        for raw_batch in tqdm(dataloader, desc="Evaluating"):
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}

            # Highest-scoring class along the last dimension is the prediction
            batch_logits = model(**on_device).logits
            batch_preds = batch_logits.argmax(dim=-1)

            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(on_device["labels"].cpu().numpy())

    return accuracy_score(expected, predicted), predicted, expected

# Score the validation split with the helper defined above
val_acc, val_preds, val_labels = evaluate_model(
    model=model,
    dataloader=val_loader,
    device=device,
)
print(f"Validation Accuracy: {val_acc:.4f}")

# Per-class precision / recall / F1, labelled with readable class names
class_names = ["World", "Sports", "Business", "Sci/Tech"]
print("\nClassification Report:")
print(
    classification_report(
        val_labels,
        val_preds,
        target_names=class_names,
    )
)

## 5. Interactive Prediction

Test the model with custom text inputs.

In [None]:
def predict_text(
    text,
    model,
    tokenizer,
    device,
    max_length=256,
    class_names=("World", "Sports", "Business", "Sci/Tech"),
):
    """
    Predict class for a single text input.

    Args:
        text: Raw text to classify.
        model: Callable model; invoked as ``model(**inputs)`` and expected to
            return an object with a ``logits`` attribute.
        tokenizer: Callable tokenizer whose result supports ``.to(device)``
            and ``**`` unpacking.
        device: Device the tokenized inputs are moved to.
        max_length: Value forwarded to the tokenizer as ``max_length``.
            Defaults to 256.
        class_names: Sequence mapping a class index to its display name.
            Defaults to the four names used throughout this notebook.

    Returns:
        Tuple ``(predicted_class_name, confidence, probabilities)`` where
        ``confidence`` is the probability of the predicted class and
        ``probabilities`` is a NumPy array over all classes.
    """
    model.eval()

    # Tokenize
    inputs = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    ).to(device)

    # Predict
    with torch.no_grad():
        logits = model(**inputs).logits
        # A single text was passed in, so row 0 is the only row
        probs = torch.softmax(logits, dim=-1)[0]
        pred_idx = int(torch.argmax(logits, dim=-1)[0])

    pred_class = class_names[pred_idx]
    confidence = probs[pred_idx].item()

    return pred_class, confidence, probs.cpu().numpy()

# Test predictions
test_texts = [
    "Apple announces new iPhone with revolutionary camera system",
    "Stock market reaches all-time high amid economic recovery",
    "Scientists discover new planet in nearby solar system",
    "Local team wins championship in thrilling overtime victory"
]

for text in test_texts:
    pred_class, confidence, probs = predict_text(text, model, tokenizer, device)
    # Only append the ellipsis when the preview really was cut at 60 characters;
    # previously it was added to every text, including untruncated ones
    preview = text if len(text) <= 60 else f"{text[:60]}..."
    print(f"\nText: {preview}")
    print(f"Predicted: {pred_class} (confidence: {confidence:.4f})")
    print(f"All probabilities: {dict(zip(['World', 'Sports', 'Business', 'Sci/Tech'], probs.round(4)))}")

## 6. Save and Load Model

Save the trained model for future use.

In [None]:
# Write the model and tokenizer into the same output directory
output_dir = "outputs/colab_model"
model.save_pretrained(save_directory=output_dir)
tokenizer.save_pretrained(save_directory=output_dir)
print(f"Model saved to {output_dir}")

# Round-trip check: load both artifacts back from that directory
loaded_model = AutoModelForSequenceClassification.from_pretrained(
    output_dir,
)
loaded_tokenizer = AutoTokenizer.from_pretrained(
    output_dir,
)
print("Model loaded successfully!")

## 7. Advanced Features (Optional)

Explore advanced features. The example below demonstrates data augmentation; ensemble methods are listed under Next Steps.

In [None]:
# Data augmentation example
# Runs the repository's augmentation script with `--method back_translation`
# and `--samples 100`.
!python scripts/data_preparation/create_augmented_data.py --method back_translation --samples 100

# Load augmented data
# NOTE(review): this path is assumed to be where the script above writes its
# output — confirm against the script.
augmented_df = pd.read_csv("data/augmented/back_translated/train_augmented.csv")
print(f"Augmented samples: {len(augmented_df)}")
# Last expression of the cell, so the preview renders as a table
augmented_df.head()

## 8. Download Results

Download the trained model and results to your local machine.

In [None]:
# Create zip file
# Archives outputs/colab_model (the directory the save step wrote to)
# recursively into colab_results.zip in the current working directory.
!zip -r colab_results.zip outputs/colab_model

# Download
# `google.colab` is imported here rather than at the top of the notebook;
# NOTE(review): it is presumably only importable inside a Colab runtime.
from google.colab import files
files.download('colab_results.zip')
print("Results downloaded!")

## Summary

Congratulations! You have successfully:
1. Set up the environment in Google Colab
2. Prepared the AG News dataset
3. Trained a DistilBERT classifier
4. Evaluated model performance
5. Made predictions on custom text
6. Saved the trained model

### Next Steps

- Try different models (RoBERTa, DeBERTa)
- Experiment with hyperparameters
- Explore ensemble methods
- Use advanced training techniques

For more advanced features, check out the full documentation and other notebooks in the repository.