In [19]:
# Install required packages
!pip install transformers datasets gradio scikit-learn accelerate

import torch
import numpy as np
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
import gradio as gr
from torch.utils.data import DataLoader
import time
import random

# Seed every random number generator used in this file for reproducibility
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed to each generator (defaults to 42).
    """
    # The three CPU-side seeders all take the seed as their only argument.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # The explicit CUDA seeding call is only made when a GPU is visible.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed()

# Pick the compute device: CUDA when PyTorch reports it available, else CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"⚡ Using device: {device}")

class FastTrainingNewsClassifier:
    """Fine-tune a Hugging Face sequence classifier on a small AG News subset.

    The class bundles tokenization, data loading, a plain PyTorch training
    loop, evaluation, single-text prediction and saving. Tensors and the model
    are placed on the module-level ``device``.

    Attributes are plain instance fields:
        model_name: Checkpoint name passed to ``from_pretrained``.
        num_labels: Size of the classification head.
        max_length: Token limit shared by training-time and inference-time
            tokenization so both always agree.
        tokenizer / model: Objects loaded from ``model_name``.
        label_map: Mapping of label id -> display name.
    """

    # Display names for the four AG News classes, keyed by label id.
    AG_NEWS_LABELS = {
        0: "🌍 World",
        1: "⚽ Sports",
        2: "💼 Business",
        3: "🔬 Sci/Tech"
    }

    def __init__(self, model_name="bert-base-uncased", num_labels=4, max_length=128):
        """Load the tokenizer and model and move the model to ``device``.

        Args:
            model_name: Checkpoint identifier for the tokenizer and model.
            num_labels: Number of output classes for the classification head.
            max_length: Maximum token length used for truncation/padding in
                both ``preprocess_function`` and ``predict``.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )
        self.model.to(device)

        # Label mapping for AG News, extended so every class index that the
        # model can emit has a name (predict() indexes it for each logit).
        self.label_map = self._build_label_map(num_labels)

    @classmethod
    def _build_label_map(cls, num_labels):
        """Return an id -> name map covering at least ``range(num_labels)``.

        The four AG News names are always present; any further index gets a
        generic ``"Label <id>"`` name so lookups never raise ``KeyError``.
        """
        label_map = dict(cls.AG_NEWS_LABELS)
        for label_id in range(num_labels):
            label_map.setdefault(label_id, f"Label {label_id}")
        return label_map

    def preprocess_function(self, examples):
        """Tokenize a batch and carry its labels along.

        Args:
            examples: Batch mapping with a ``'text'`` entry (the inputs to
                tokenize) and a ``'label'`` entry.

        Returns:
            The tokenizer output with an added ``'labels'`` entry copied from
            ``examples['label']``.
        """
        # Every example is padded/truncated to exactly self.max_length tokens.
        tokenized = self.tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )
        # The model's forward pass expects the key 'labels', not 'label'.
        tokenized['labels'] = examples['label']
        return tokenized

    def load_data_fast(self, train_samples=3000, test_samples=1000):
        """Load AG News and tokenize a leading slice of each split.

        Args:
            train_samples: Maximum number of training rows to keep.
            test_samples: Maximum number of test rows to keep.

        Returns:
            ``(tokenized_train, tokenized_test)``, both formatted to yield
            torch tensors for ``input_ids``, ``attention_mask`` and ``labels``.
        """
        print("📊 Loading AG News dataset...")
        dataset = load_dataset("ag_news")

        # Keep only the first N rows of each split (capped at the split size).
        train_dataset = dataset['train'].select(range(min(train_samples, len(dataset['train']))))
        test_dataset = dataset['test'].select(range(min(test_samples, len(dataset['test']))))

        print(f"🔢 Training samples: {len(train_dataset)}")
        print(f"🔢 Test samples: {len(test_dataset)}")

        # Tokenize datasets
        tokenized_train = train_dataset.map(
            self.preprocess_function,
            batched=True,
            batch_size=500,
            remove_columns=['text']
        )

        tokenized_test = test_dataset.map(
            self.preprocess_function,
            batched=True,
            batch_size=500,
            remove_columns=['text']
        )

        # Expose only the columns the model consumes, as torch tensors.
        tensor_columns = ['input_ids', 'attention_mask', 'labels']
        tokenized_train.set_format('torch', columns=tensor_columns)
        tokenized_test.set_format('torch', columns=tensor_columns)

        return tokenized_train, tokenized_test

    def train_fast(self, train_dataset, test_dataset, epochs=2, batch_size=8, learning_rate=2e-5):
        """Run a simple AdamW training loop with per-epoch evaluation.

        Args:
            train_dataset: Dataset yielding dicts of tensors accepted by the
                model (including ``labels``).
            test_dataset: Dataset evaluated after every epoch; the printed
                "Validation Accuracy" is computed on this data.
            epochs: Number of passes over ``train_dataset``.
            batch_size: Batch size for both data loaders.
            learning_rate: AdamW learning rate.

        Returns:
            List with the mean training loss of each epoch.

        Raises:
            ValueError: If ``train_dataset`` is empty.
        """
        # Fail early with a clear message instead of dividing by zero when
        # averaging the loss over zero batches.
        if len(train_dataset) == 0:
            raise ValueError("train_dataset is empty; nothing to train on")

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        optimizer = AdamW(self.model.parameters(), lr=learning_rate)

        print("🎯 Starting fast training...")
        training_losses = []

        for epoch in range(epochs):
            # evaluate_fast() switches to eval mode, so re-enable train mode
            # at the start of every epoch.
            self.model.train()
            total_loss = 0

            for batch_count, batch in enumerate(train_loader, start=1):
                # Move batch to device
                batch = {k: v.to(device) for k, v in batch.items()}

                # Forward pass; the model computes the loss because the batch
                # contains 'labels'.
                outputs = self.model(**batch)
                loss = outputs.loss

                # Backward pass
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                total_loss += loss.item()

                # Print the running mean loss every 50 batches
                if batch_count % 50 == 0:
                    avg_loss = total_loss / batch_count
                    print(f"Epoch {epoch+1}, Batch {batch_count}: Loss = {avg_loss:.4f}")

            avg_train_loss = total_loss / len(train_loader)
            training_losses.append(avg_train_loss)

            # Quick validation
            accuracy = self.evaluate_fast(test_loader)

            print(f"✅ Epoch {epoch+1}/{epochs} completed!")
            print(f"   Average Loss: {avg_train_loss:.4f}")
            print(f"   Validation Accuracy: {accuracy:.4f}")

        return training_losses

    def evaluate_fast(self, test_loader):
        """Evaluate the model on ``test_loader`` and print accuracy and F1.

        Args:
            test_loader: Iterable of batches (dicts of tensors with a
                ``labels`` entry).

        Returns:
            The accuracy score; the weighted F1 score is printed only.
        """
        self.model.eval()
        predictions = []
        true_labels = []

        with torch.no_grad():
            for batch in test_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = self.model(**batch)
                logits = outputs.logits
                preds = torch.argmax(logits, dim=-1)

                predictions.extend(preds.cpu().numpy())
                true_labels.extend(batch['labels'].cpu().numpy())

        accuracy = accuracy_score(true_labels, predictions)
        f1 = f1_score(true_labels, predictions, average='weighted')

        print(f"📊 Evaluation Results:")
        print(f"   Accuracy: {accuracy:.4f}")
        print(f"   F1-Score: {f1:.4f}")

        return accuracy

    def predict(self, text):
        """Predict the topic of a single news headline.

        Args:
            text: The headline to classify.

        Returns:
            Dict with ``'prediction'`` (class index), ``'label'`` (display
            name) and ``'probabilities'`` (display name -> softmax score for
            every class).
        """
        self.model.eval()

        # Tokenize with the same max_length that was used for training data.
        inputs = self.tokenizer(
            text,
            truncation=True,
            padding=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)
            logits = outputs.logits
            prediction = torch.argmax(logits, dim=-1).item()
            # [0] selects the scores of the first (only) sequence in the batch.
            probabilities = torch.softmax(logits, dim=-1).cpu().numpy()[0]

        return {
            'prediction': prediction,
            'label': self.label_map[prediction],
            'probabilities': {
                self.label_map[i]: float(prob) for i, prob in enumerate(probabilities)
            }
        }

    def save_model(self, path="./trained_news_classifier"):
        """Save the model and tokenizer to ``path`` via ``save_pretrained``."""
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)
        print(f"💾 Model saved to {path}")

# ========== MAIN TRAINING EXECUTION ==========
# Top-level driver: build the classifier, load a small AG News subset, train,
# save, and run a few sanity-check predictions. The globals `classifier`,
# `training_time` and `training_losses` created here are read again by the
# Gradio section further down.
print("🚀 Starting News Classifier Training...")

# Create classifier (defaults: bert-base-uncased, 4 labels)
classifier = FastTrainingNewsClassifier()

# Load data (small samples for speed)
print("📊 Loading data for fast training...")
train_data, test_data = classifier.load_data_fast(
    train_samples=3000,  # Small for fast training
    test_samples=1000
)

# Train the model and measure wall-clock duration in seconds.
# NOTE(review): the captured output at the end of this file shows a CPU run
# taking ~8400 s, so the "2-5 minutes" message presumably assumes a GPU.
print("🎯 Training model (this will take 2-5 minutes)...")
start_time = time.time()
training_losses = classifier.train_fast(train_data, test_data, epochs=2)
training_time = time.time() - start_time

print(f"✅ Training completed in {training_time:.1f} seconds!")

# Save the model (to the default ./trained_news_classifier path)
classifier.save_model()

# Test predictions on a handful of hand-written headlines.
# Note: `test_samples` here is a list of strings, unrelated to the
# `test_samples=1000` keyword argument passed to load_data_fast above.
print("\n🧪 Testing trained model...")
test_samples = [
    "Stock market reaches all time high",
    "Football team wins championship",
    "New scientific discovery in space",
    "Company reports record profits"
]

for sample in test_samples:
    result = classifier.predict(sample)
    print(f"📰 '{sample}' -> {result['label']}")

# ========== GRADIO INTERFACE ==========
def _confidence_bar(confidence, width=20):
    """Render a 0-1 ``confidence`` as a text bar that is ``width`` cells wide."""
    filled = int(confidence * width)
    return "█" * filled + "░" * (width - filled)


def classify_news(text):
    """Classify a headline for the Gradio interface.

    Args:
        text: Headline entered by the user; ``None`` or whitespace-only input
            is treated as empty.

    Returns:
        A ``(simple_output, detailed_output)`` pair of strings. On empty input
        or on any prediction error the first element carries a ``❌`` message
        and the second is an empty string.
    """
    # Guard against None as well as blank strings: calling .strip() on None
    # would raise outside the try block and surface as an unhandled UI error.
    if not text or not text.strip():
        return "❌ Please enter a news headline", ""

    # This is the UI boundary, so any failure is reported to the user as text
    # rather than propagated.
    try:
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments.
        started = time.perf_counter()
        result = classifier.predict(text)
        pred_time = (time.perf_counter() - started) * 1000

        # Sort probabilities by confidence, highest first
        sorted_probs = sorted(result['probabilities'].items(), key=lambda x: x[1], reverse=True)

        # Create detailed output (Markdown) in one pass instead of repeated +=
        parts = [
            f"**🎯 Predicted Topic:** {result['label']}\n",
            f"**⚡ Prediction Time:** {pred_time:.0f}ms\n\n",
            "**📊 Confidence Scores:**\n",
        ]
        parts.extend(
            f"{category}: {_confidence_bar(confidence)} {confidence:.2%}\n"
            for category, confidence in sorted_probs
        )
        detailed_output = "".join(parts)

        # Simple output
        simple_output = f"Topic: {result['label']} ({result['probabilities'][result['label']]:.2%} confidence)"

        return simple_output, detailed_output
    except Exception as e:
        return f"❌ Error: {str(e)}", ""

# Build and launch the Gradio web UI around classify_news(). This section
# reads the module-level `training_time`, `training_losses` and `device`
# produced earlier in the file, so it must run after training has finished.
print("\n🌐 Launching Gradio Interface...")

with gr.Blocks(theme=gr.themes.Soft(), title="Trained News Classifier") as demo:
    # Static header. The sample count and accuracy range below are hard-coded
    # text, not values computed from the training run.
    gr.Markdown(
        """
        # 🚀 News Topic Classifier
        **BERT model fine-tuned on AG News dataset**

        🔹 **Categories:** 🌍 World | ⚽ Sports | 💼 Business | 🔬 Sci/Tech
        🔹 **Model:** BERT-base-uncased (fine-tuned by you!)
        🔹 **Training Samples:** 3,000
        🔹 **Accuracy:** ~85-90%
        """
    )

    with gr.Row():
        # Left column: headline input, clickable examples, submit button
        with gr.Column():
            input_text = gr.Textbox(
                label="📝 Enter News Headline",
                placeholder="Type a news headline here...",
                lines=2,
                max_lines=3
            )

            # Each example is a one-element list because there is one input
            gr.Examples(
                examples=[
                    ["Stock market reaches all time high today"],
                    ["Football team wins championship in final"],
                    ["New scientific discovery in quantum physics"],
                    ["International peace talks begin next week"],
                    ["Tech company launches new AI smartphone"],
                    ["Basketball player breaks scoring record"]
                ],
                inputs=[input_text],
                label="💡 Try these examples:"
            )

            classify_btn = gr.Button("🎯 Classify Topic", variant="primary", size="lg")

        # Right column: the two outputs returned by classify_news()
        with gr.Column():
            simple_output = gr.Textbox(
                label="🎯 Prediction Result",
                interactive=False
            )
            detailed_output = gr.Markdown(
                label="📊 Detailed Analysis"
            )

    # Connect the button: classify_news returns (simple, detailed), matching
    # the order of the `outputs` list
    classify_btn.click(
        fn=classify_news,
        inputs=[input_text],
        outputs=[simple_output, detailed_output]
    )

    # Add training info. Only training time, final loss and device are
    # interpolated from the run; sample counts, epochs, batch size and the
    # accuracy range are literal text that repeats the values used during
    # training and must be kept in sync by hand.
    with gr.Accordion("ℹ️ Training Information", open=True):
        gr.Markdown(f"""
        **📊 Training Details:**
        - **Base Model**: BERT-base-uncased
        - **Training Samples**: 3,000
        - **Test Samples**: 1,000
        - **Training Time**: {training_time:.1f} seconds
        - **Epochs**: 2
        - **Batch Size**: 8

        **🎯 Model Performance:**
        - **Final Loss**: {training_losses[-1]:.4f}
        - **Expected Accuracy**: 85-90%
        - **Categories**: World, Sports, Business, Sci/Tech

        **⚡ Technical Info:**
        - **Device Used**: {device}
        - **Framework**: PyTorch + Transformers
        - **Tokenizer**: BERT Tokenizer
        - **Max Length**: 128 tokens

        *This model was trained by SAMAR!* 🎉
        """)

# Launch the interface
# NOTE(review): share=True presumably requests a public share link from
# Gradio — confirm this exposure is intended before running on shared hosts.
print("✅ Ready! Launching web interface...")
demo.launch(share=True)

⚡ Using device: cpu
🚀 Starting News Classifier Training...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


📊 Loading data for fast training...
📊 Loading AG News dataset...
🔢 Training samples: 3000
🔢 Test samples: 1000


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

🎯 Training model (this will take 2-5 minutes)...
🎯 Starting fast training...
Epoch 1, Batch 50: Loss = 1.0363
Epoch 1, Batch 100: Loss = 0.8042
Epoch 1, Batch 150: Loss = 0.6911
Epoch 1, Batch 200: Loss = 0.6228
Epoch 1, Batch 250: Loss = 0.5652
Epoch 1, Batch 300: Loss = 0.5255
Epoch 1, Batch 350: Loss = 0.5042
📊 Evaluation Results:
   Accuracy: 0.9010
   F1-Score: 0.9003
✅ Epoch 1/2 completed!
   Average Loss: 0.4908
   Validation Accuracy: 0.9010
Epoch 2, Batch 50: Loss = 0.2400
Epoch 2, Batch 100: Loss = 0.2479
Epoch 2, Batch 150: Loss = 0.2354
Epoch 2, Batch 200: Loss = 0.2345
Epoch 2, Batch 250: Loss = 0.2333
Epoch 2, Batch 300: Loss = 0.2214
Epoch 2, Batch 350: Loss = 0.2206
📊 Evaluation Results:
   Accuracy: 0.8670
   F1-Score: 0.8670
✅ Epoch 2/2 completed!
   Average Loss: 0.2218
   Validation Accuracy: 0.8670
✅ Training completed in 8409.3 seconds!
💾 Model saved to ./trained_news_classifier

🧪 Testing trained model...
📰 'Stock market reaches all time high' -> 💼 Business
📰 'Fo

