In [None]:
# ==================================================
# Imports and setup
# ==================================================

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import time
from collections import defaultdict

# Set style for better plots
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 4)

# Set random seeds for reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    # Seed every visible GPU, not only the currently selected one.
    torch.cuda.manual_seed_all(SEED)
    # Ask cuDNN for deterministic kernels and switch off its autotuner:
    # benchmark mode may select different algorithms from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Environment banner so the notebook output records what it ran on
print("="*60)
print("BASELINE MLP IMPLEMENTATION")
print("="*60)
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print("="*60)

In [None]:
# ==================================================
# Data Loading
# ==================================================

def load_mnist(batch_size=64, data_dir='/kaggle/working/data', num_workers=2):
    """
    Load MNIST with the standard mean/std normalization.

    Args:
        batch_size: Batch size used for both the training and the test loader
        data_dir: Directory where the MNIST files are downloaded / cached
        num_workers: Number of worker processes used by each DataLoader

    Returns:
        train_loader: DataLoader over the training split (shuffled)
        test_loader: DataLoader over the test split (not shuffled)
    """
    print("\n Loading MNIST Dataset")

    # MNIST normalization values (mean=0.1307, std=0.3081)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    # Download (if needed) and load the training split
    train_dataset = datasets.MNIST(
        root=data_dir,
        train=True,
        download=True,
        transform=transform
    )

    # Download (if needed) and load the test split
    test_dataset = datasets.MNIST(
        root=data_dir,
        train=False,
        download=True,
        transform=transform
    )

    # Pinned host memory only helps host->GPU copies; requesting it on a
    # machine without CUDA just produces a warning, so enable it conditionally.
    loader_kwargs = {
        'batch_size': batch_size,
        'num_workers': num_workers,
        'pin_memory': torch.cuda.is_available(),
    }

    train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

    print(f"Training samples: {len(train_dataset):,}")
    print(f"Test samples: {len(test_dataset):,}")
    print(f"Batch size: {batch_size}")
    print(f"Training batches: {len(train_loader)}")

    return train_loader, test_loader

In [None]:
# ==================================================
# Baseline MLP Model
# ==================================================

class BaselineMLP(nn.Module):
    """
    Simple MLP with two hidden layers for MNIST classification.

    This is the performance baseline - the MoE model should match or beat it.

    Architecture:
        Input (input_dim) -> FC1(hidden_dim) -> ReLU
                          -> FC2(hidden_dim // 2) -> ReLU
                          -> FC3(output_dim)

    With the defaults this is 784 -> 128 -> 64 -> 10.
    """
    def __init__(self, input_dim=784, hidden_dim=128, output_dim=10):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, output_dim)
        self.relu = nn.ReLU()
        self._initialize_weights()

    def _initialize_weights(self):
        """
        Xavier-uniform weights and zero biases for every linear layer.
        """
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        """
        Forward pass.

        Args:
            x: Input tensor. Anything with more than two dimensions, e.g.
               (batch, 1, 28, 28) or (batch, 28, 28), is flattened to
               (batch, features); 2-D input (batch, 784) is used as is.

        Returns:
            Logits of shape (batch, output_dim)
        """
        # Flatten everything after the batch dimension. flatten() falls back
        # to a copy for non-contiguous inputs, where view() would raise.
        if x.dim() > 2:
            x = x.flatten(start_dim=1)

        # Forward through the layers (no activation on the output logits)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)

        return x

    def count_parameters(self):
        """
        Count trainable parameters (those with requires_grad=True).
        """
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [None]:
# ==================================================
# Training Function 
# ==================================================

def train_epoch(model, train_loader, optimizer, criterion, device, epoch):
    """
    Train for one epoch.

    Args:
        model: The neural network
        train_loader: Iterable of (data, target) training batches
        optimizer: Optimizer (Adam)
        criterion: Loss function (CrossEntropy). It is assumed to return the
            mean loss over the batch (the default reduction), because the
            epoch average re-weights each batch by its size.
        device: 'cuda' or 'cpu'
        epoch: Current epoch number (only used for the progress-bar label)

    Returns:
        avg_loss: Per-sample average training loss over the epoch
        train_acc: Training accuracy (%) accumulated while the weights were
            being updated during the epoch

    Raises:
        ValueError: If train_loader yields no samples
    """
    model.train()
    loss_sum = 0.0
    correct = 0
    seen = 0

    # Progress bar
    pbar = tqdm(train_loader, desc=f'Epoch {epoch}')

    for data, target in pbar:
        # Move the batch to the training device
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # Read the loss back once per batch; each .item() forces a host sync.
        batch_loss = loss.item()
        batch_size = target.size(0)

        # Weight by batch size so a smaller final batch does not skew the mean.
        loss_sum += batch_loss * batch_size
        correct += (output.argmax(dim=1) == target).sum().item()
        seen += batch_size

        # Update progress bar with the current batch loss and running accuracy
        pbar.set_postfix({
            'loss': f'{batch_loss:.4f}',
            'acc': f'{100.*correct/seen:.2f}%'
        })

    if seen == 0:
        raise ValueError("train_loader produced no samples")

    avg_loss = loss_sum / seen
    train_acc = 100. * correct / seen

    return avg_loss, train_acc

def evaluate(model, test_loader, criterion, device):
    """
    Evaluate the model on a held-out set.

    Args:
        model: The neural network
        test_loader: Iterable of (data, target) batches; it does not need to
            support len()
        criterion: Loss function. It is assumed to return the mean loss over
            the batch (the default reduction), because the overall average
            re-weights each batch by its size.
        device: 'cuda' or 'cpu'

    Returns:
        test_loss: Per-sample average loss
        test_acc: Accuracy (%)

    Raises:
        ValueError: If test_loader yields no samples
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0

    # No gradients are needed for evaluation
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)

            output = model(data)
            batch_size = target.size(0)

            # Weight by batch size so a smaller final batch does not skew
            # the average.
            loss_sum += criterion(output, target).item() * batch_size
            correct += (output.argmax(dim=1) == target).sum().item()
            total += batch_size

    if total == 0:
        raise ValueError("test_loader produced no samples")

    test_loss = loss_sum / total
    test_acc = 100. * correct / total

    return test_loss, test_acc

def train_baseline(model, train_loader, test_loader, epochs=10, lr=0.001, device='cuda',
                   target_acc=97.0, checkpoint_path=None):
    """
    Complete training loop.

    Args:
        model: BaselineMLP instance (must provide count_parameters())
        train_loader: Training data
        test_loader: Test data
        epochs: Number of epochs (must be >= 1)
        lr: Learning rate
        device: 'cuda' or 'cpu'. If CUDA is requested but not available the
            run falls back to CPU with a printed notice.
        target_acc: Test accuracy (%) the final epoch has to reach for the run
            to be reported as a success
        checkpoint_path: Optional path; when given, the model's state_dict is
            written there every time the test accuracy improves

    Returns:
        history: Dictionary with per-epoch lists under the keys 'train_loss',
            'train_acc', 'test_loss', 'test_acc' and 'epoch_times'

    Raises:
        ValueError: If epochs is smaller than 1
    """
    if epochs < 1:
        raise ValueError(f"epochs must be >= 1, got {epochs}")

    # The default device is 'cuda'; do not crash on CPU-only machines.
    if torch.device(device).type == 'cuda' and not torch.cuda.is_available():
        print("CUDA requested but not available - falling back to CPU")
        device = 'cpu'

    print("STARTING TRAINING...")
    print(f"Model parameters: {model.count_parameters():,}")
    print(f"Device: {device}")
    print(f"Epochs: {epochs}")
    print(f"Learning rate: {lr}")
    print("-" * 60)

    # Move the model first so the optimizer is constructed over the
    # parameters in their final location.
    model = model.to(device)

    # Setup
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Track history
    history = {
        'train_loss': [],
        'train_acc': [],
        'test_loss': [],
        'test_acc': [],
        'epoch_times': []
    }

    # Training loop
    best_acc = 0
    for epoch in range(1, epochs + 1):
        start_time = time.time()

        # Train
        train_loss, train_acc = train_epoch(
            model, train_loader, optimizer, criterion, device, epoch
        )

        # Evaluate
        test_loss, test_acc = evaluate(model, test_loader, criterion, device)

        # Track time (covers both the training and the evaluation pass)
        epoch_time = time.time() - start_time

        # Save history
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['test_loss'].append(test_loss)
        history['test_acc'].append(test_acc)
        history['epoch_times'].append(epoch_time)

        # Print summary
        print(f"\nEpoch {epoch}/{epochs} Summary:")
        print(f"   Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
        print(f"   Test Loss:  {test_loss:.4f} | Test Acc:  {test_acc:.2f}%")
        print(f"   Time: {epoch_time:.1f}s")

        # Track the best accuracy; persist the weights only when asked to
        if test_acc > best_acc:
            best_acc = test_acc
            print("New best accuracy!!!")
            if checkpoint_path is not None:
                torch.save(model.state_dict(), checkpoint_path)

        print("-" * 60)

    # Final summary
    final_acc = history['test_acc'][-1]
    print("\nTraining Complete!!!")
    print(f"Final Test Accuracy: {final_acc:.2f}%")
    print(f"Best Test Accuracy: {best_acc:.2f}%")
    # Report the same threshold that the success check below applies.
    print(f"Target: {target_acc:g}%")

    if final_acc >= target_acc:
        print("SUCCESS: Baseline achieved target!")
    else:
        print("Need improvement: Try training longer or adjusting hyperparameters")

    return history

In [None]:
# ================================================================    
# VISUALIZATION
# ================================================================

def plot_training_curves(history, save_path='/kaggle/working/baseline_results.png', target_acc=98):
    """
    Create a three-panel training visualization: loss, accuracy, epoch time.

    Args:
        history: Training history dictionary with the per-epoch lists
            'train_loss', 'test_loss', 'train_acc', 'test_acc', 'epoch_times'
        save_path: Where to save the plot; a missing parent directory is
            created when a filesystem path is given
        target_acc: Accuracy (%) drawn as a dashed reference line
    """
    import os  # local import: only needed to prepare the output directory

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    loss_ax, acc_ax, time_ax = axes
    epochs = range(1, len(history['train_loss']) + 1)

    # Plot 1: Loss curves
    loss_ax.plot(epochs, history['train_loss'], 'b-o', label='Train Loss', linewidth=2, markersize=6)
    loss_ax.plot(epochs, history['test_loss'], 'r-s', label='Test Loss', linewidth=2, markersize=6)
    loss_ax.set_xlabel('Epoch', fontsize=12)
    loss_ax.set_ylabel('Loss', fontsize=12)
    loss_ax.set_title('Training & Test Loss', fontsize=14, fontweight='bold')
    loss_ax.legend(fontsize=11)
    loss_ax.grid(True, alpha=0.3)

    # Plot 2: Accuracy curves
    acc_ax.plot(epochs, history['train_acc'], 'b-o', label='Train Acc', linewidth=2, markersize=6)
    acc_ax.plot(epochs, history['test_acc'], 'r-s', label='Test Acc', linewidth=2, markersize=6)
    acc_ax.axhline(y=target_acc, color='green', linestyle='--', label=f'Target ({target_acc}%)', alpha=0.7, linewidth=2)
    acc_ax.set_xlabel('Epoch', fontsize=12)
    acc_ax.set_ylabel('Accuracy (%)', fontsize=12)
    acc_ax.set_title('Training & Test Accuracy', fontsize=14, fontweight='bold')
    acc_ax.legend(fontsize=11)
    acc_ax.grid(True, alpha=0.3)

    # Keep the zoomed-in 90-100 window when everything fits into it, but
    # widen it downwards rather than silently cutting off lower values.
    lowest = min([*history['train_acc'], *history['test_acc'], target_acc])
    y_min = min(90, max(0, int(lowest) - 1))
    acc_ax.set_ylim([y_min, 100])

    # Plot 3: Training time per epoch
    time_ax.bar(epochs, history['epoch_times'], color='skyblue', alpha=0.7, edgecolor='navy')
    time_ax.set_xlabel('Epoch', fontsize=12)
    time_ax.set_ylabel('Time (seconds)', fontsize=12)
    time_ax.set_title('Training Time Per Epoch', fontsize=14, fontweight='bold')
    time_ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()

    # savefig does not create directories; make sure the target folder exists.
    if isinstance(save_path, (str, os.PathLike)):
        os.makedirs(os.path.dirname(os.fspath(save_path)) or '.', exist_ok=True)

    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {save_path}")
    plt.show()


def print_final_summary(history, target_acc=97):
    """
    Print a summary of a finished training run.

    Args:
        history: Training history dictionary; the per-epoch lists 'test_acc',
            'train_acc', 'test_loss' and 'epoch_times' are read
        target_acc: Test accuracy (%) the final epoch must reach for the
            accuracy item of the checklist to be ticked

    Raises:
        ValueError: If the history contains no completed epoch
    """
    if not history['test_acc']:
        raise ValueError("history contains no completed epochs to summarize")

    final_test_acc = history['test_acc'][-1]
    epoch_times = history['epoch_times']

    print("\n" + "="*60)
    print("BASELINE MLP - FINAL SUMMARY")
    print("="*60)

    print("\nPerformance Metrics:")
    print(f"   Final Test Accuracy:  {final_test_acc:.2f}%")
    print(f"   Best Test Accuracy:   {max(history['test_acc']):.2f}%")
    print(f"   Final Train Accuracy: {history['train_acc'][-1]:.2f}%")
    print(f"   Final Test Loss:      {history['test_loss'][-1]:.4f}")

    print("\nTraining Time:")
    print(f"   Total time:           {sum(epoch_times):.1f}s")
    print(f"   Avg time per epoch:   {np.mean(epoch_times):.1f}s")

    # Only the accuracy item depends on the results; the remaining items are
    # printed as done unconditionally.
    accuracy_mark = '✓' if final_test_acc >= target_acc else '✗'
    print("\nDay 9 Checklist:")
    print(f"   [{accuracy_mark}] Accuracy >= {target_acc:g}%")
    print("   [✓] Model implemented")
    print("   [✓] Training pipeline working")
    print("   [✓] Visualization created")

    print("\n" + "="*60)

In [None]:
# ================================================================
# MAIN EXECUTION
# ================================================================

# Configuration
BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 0.001
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model dimensions are declared once so the model built below and the config
# stored in the checkpoint cannot drift apart.
INPUT_DIM = 784
HIDDEN_DIM = 128
OUTPUT_DIM = 10

# Step 1: Load data
train_loader, test_loader = load_mnist(batch_size=BATCH_SIZE)

# Step 2: Create model
print("\nBuilding Model...")
model = BaselineMLP(input_dim=INPUT_DIM, hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM)
print(f"✓ Model created with {model.count_parameters():,} parameters")

# Step 3: Train model
history = train_baseline(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    epochs=EPOCHS,
    lr=LEARNING_RATE,
    device=DEVICE
)

# Step 4: Visualize results
plot_training_curves(history)

# Step 5: Print final summary
print_final_summary(history)

# Step 6: Save model (weights as they are after the last epoch)
model_path = '/kaggle/working/baseline_mlp.pth'
torch.save({
    # Store CPU tensors so the checkpoint also loads on machines without a
    # GPU, without needing map_location.
    'model_state_dict': {name: tensor.cpu() for name, tensor in model.state_dict().items()},
    'history': history,
    'config': {
        'input_dim': INPUT_DIM,
        'hidden_dim': HIDDEN_DIM,
        'output_dim': OUTPUT_DIM,
        'epochs': EPOCHS,
        'lr': LEARNING_RATE,
        'batch_size': BATCH_SIZE
    }
}, model_path)
print(f"\nModel saved to: {model_path}")