# News Category Classification Training
## GPU-Accelerated Training on Google Colab

**Before starting:**
1. Enable GPU: `Runtime` → `Change runtime type` → `Hardware accelerator` → `T4 GPU`
2. Have the dataset file `News_Category_Dataset_v3.json` ready — the upload cell below will prompt you to upload it to this session


In [None]:
# Install dependencies
!pip install transformers torch scikit-learn pandas numpy tqdm matplotlib seaborn -q

In [None]:
# Upload dataset
from google.colab import files
import os

DATASET_NAME = "News_Category_Dataset_v3.json"
DATASET_PATH = os.path.join("data", DATASET_NAME)

print("Please upload News_Category_Dataset_v3.json")
# files.upload() returns a dict mapping each uploaded file name to its raw bytes.
uploaded = files.upload()

# Create data directory
os.makedirs("data", exist_ok=True)

if DATASET_NAME in uploaded:
    # Write the uploaded bytes straight into data/ instead of relying on the
    # name Colab chose for its working-directory copy.
    with open(DATASET_PATH, "wb") as f:
        f.write(uploaded[DATASET_NAME])
    # Remove the working-directory copy so the dataset lives only in data/.
    if os.path.exists(DATASET_NAME):
        os.remove(DATASET_NAME)
    print(f"Saved dataset to {DATASET_PATH}")
elif os.path.exists(DATASET_PATH):
    # Re-running the cell without uploading again is fine if the file is already there.
    print(f"No new upload received; using existing {DATASET_PATH}")
else:
    # Fail here with a clear message rather than in the data-loading cell.
    raise FileNotFoundError(
        f"Expected an upload named {DATASET_NAME!r}, got: {sorted(uploaded)}"
    )

In [None]:
# Data loader code
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from pathlib import Path

def load_dataset(data_path="data/News_Category_Dataset_v3.json"):
    """Load the news dataset, build text/label columns and split train/validation.

    The file is read as JSON Lines (one JSON object per line). Each record is
    expected to provide ``headline``, ``short_description`` and ``category``.

    Args:
        data_path: Path to the JSON Lines dataset file.

    Returns:
        A tuple ``(train_df, val_df, label_encoder, num_classes)`` where the two
        DataFrames carry a combined ``text`` column and an integer ``label``
        column, ``label_encoder`` is the fitted ``LabelEncoder`` and
        ``num_classes`` is the number of distinct categories.

    Side effects:
        Creates ``models/`` if needed and writes the encoder classes to
        ``models/label_encoder.npy``.
    """
    print(f"Loading dataset from {data_path}...")
    with open(data_path, 'r', encoding='utf-8') as f:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)
    print(f"Loaded {len(df)} records")

    # Combine text. A missing headline or description is treated as empty so the
    # row is kept as long as the other field has content.
    df['text'] = (
        df['headline'].fillna('') + " " + df['short_description'].fillna('')
    ).str.strip()
    # Drop rows without a category or with no usable text at all.
    df = df.dropna(subset=['category'])
    df = df[df['text'] != '']

    # Encode labels
    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df['category'])
    num_classes = len(label_encoder.classes_)

    print(f"Number of categories: {num_classes}")

    # Train/val split, stratified so both splits keep the category proportions.
    train_df, val_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['label']
    )
    print(f"Training: {len(train_df)}, Validation: {len(val_df)}")

    # Save label encoder. Path.mkdir replaces the `!mkdir` shell magic, which is
    # IPython-only syntax and not valid inside a plain Python function.
    Path('models').mkdir(parents=True, exist_ok=True)
    # NOTE(review): classes_ is presumably an object-dtype array here, in which
    # case np.load needs allow_pickle=True to read it back - confirm in the loader.
    np.save('models/label_encoder.npy', label_encoder.classes_)

    return train_df, val_df, label_encoder, num_classes


train_df, val_df, label_encoder, num_classes = load_dataset()

In [None]:
# PyTorch Dataset
import torch
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of input texts (list, tuple, pandas Series, ...).
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialize as plain lists so __getitem__ always indexes by position.
        # A pandas Series with a shuffled index (as train_test_split produces)
        # would otherwise be indexed by label and return the wrong row or fail.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx`` as a dict of tensors."""
        encoding = self.tokenizer(
            str(self.texts[idx]),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # flatten() collapses the tokenizer output to 1-D for a single example.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [None]:
# Training
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, f1_score

# Initialize
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Store the category names in the model config so the saved model reports real
# category names rather than generic LABEL_<n> placeholders at inference time.
id2label = {int(i): str(name) for i, name in enumerate(label_encoder.classes_)}
label2id = {name: i for i, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    id2label=id2label,
    label2id=label2id,
)

# Create datasets
train_dataset = NewsDataset(train_df['text'].tolist(), train_df['label'].tolist(), tokenizer)
val_dataset = NewsDataset(val_df['text'].tolist(), val_df['label'].tolist(), tokenizer)

# Metrics
def compute_metrics(eval_pred):
    """Score predictions for the Trainer.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; the predicted class of each
            row is the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with ``accuracy``, ``f1_macro`` and ``f1_weighted``.
    """
    scores, y_true = eval_pred
    y_pred = np.argmax(scores, axis=1)

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    # Same F1 computation under both averaging modes.
    for averaging in ('macro', 'weighted'):
        metrics['f1_' + averaging] = f1_score(y_true, y_pred, average=averaging)
    return metrics

# Training args
# fp16 mixed precision needs a CUDA device; enable it only when one is attached
# so the notebook can still run (slowly) on a CPU runtime.
use_gpu = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir='models',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=100,
    eval_strategy='steps',
    eval_steps=500,
    # Kept a multiple of eval_steps so every checkpoint has a matching
    # evaluation for load_best_model_at_end.
    save_steps=1000,
    # Cap the number of checkpoints kept on disk; without a limit one is
    # written every save_steps into output_dir for the whole run.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='f1_weighted',
    fp16=use_gpu,  # GPU acceleration
)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

if use_gpu:
    print("Starting training with GPU...")
else:
    print("No GPU detected - starting training on CPU (fp16 disabled)...")
trainer.train()

In [None]:
# Evaluate
# Run a final pass over the validation set and report the headline metrics.
results = trainer.evaluate()
print(
    "\nValidation Results:",
    f"Accuracy: {results['eval_accuracy']:.4f}",
    f"F1 (Macro): {results['eval_f1_macro']:.4f}",
    f"F1 (Weighted): {results['eval_f1_weighted']:.4f}",
    sep="\n",
)

In [None]:
# Save model
# Model weights/config and tokenizer files go into the same directory.
SAVE_DIR = 'models'
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print("Model saved!")

In [None]:
# Download trained model
!zip -r models.zip models/
files.download('models.zip')
print("Download complete! Extract and place in your local project's models/ directory.")