In [None]:
# Optimized MNIST CNN - Target: 99.4% accuracy with <20k parameters in <20 epochs
# Required: Batch Normalization, Dropout, Global Average Pooling

from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import time

# Install required packages
!pip install torchsummary
from torchsummary import summary

# Device selection: run on the GPU when CUDA is available, otherwise on the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(f"Using device: {device}")

# Model Definitions - All 5 Models

# Model 1: Original OptimizedNet (Baseline)
class OptimizedNet(nn.Module):
    """
    Baseline compact CNN: six 3x3 conv blocks, two 1x1 transitions, GAP head.

    Layout (spatial sizes are for a 28x28 single-channel input):
    - 28x28: conv 1->8, conv 8->16, 1x1 transition 16->8, 2x2 max-pool
    - 14x14: conv 8->16, conv 16->32, 1x1 transition 32->16, 2x2 max-pool
    - 7x7:   conv 16->32, conv 32->16, global average pool, linear 16->10

    Each 3x3 conv is followed by BatchNorm -> ReLU -> Dropout2d; the 1x1
    transitions use BatchNorm -> ReLU with no dropout.

    Args:
        dropout_rate: Probability given to every Dropout2d layer.
    """

    def __init__(self, dropout_rate=0.1):
        super().__init__()

        # ---- 28x28 stage ----
        self.conv1 = nn.Conv2d(1, 8, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(8)
        self.dropout1 = nn.Dropout2d(p=dropout_rate)

        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(16)
        self.dropout2 = nn.Dropout2d(p=dropout_rate)

        # 1x1 conv squeezes channels before the spatial downsample
        self.transition1 = nn.Conv2d(16, 8, kernel_size=1)
        self.bn_trans1 = nn.BatchNorm2d(8)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)  # 28x28 -> 14x14

        # ---- 14x14 stage ----
        self.conv3 = nn.Conv2d(8, 16, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(16)
        self.dropout3 = nn.Dropout2d(p=dropout_rate)

        self.conv4 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(32)
        self.dropout4 = nn.Dropout2d(p=dropout_rate)

        # 1x1 conv squeezes channels before the spatial downsample
        self.transition2 = nn.Conv2d(32, 16, kernel_size=1)
        self.bn_trans2 = nn.BatchNorm2d(16)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)  # 14x14 -> 7x7

        # ---- 7x7 stage ----
        self.conv5 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.bn5 = nn.BatchNorm2d(32)
        self.dropout5 = nn.Dropout2d(p=dropout_rate)

        # Narrow back to 16 channels so the classifier stays small
        self.conv6 = nn.Conv2d(32, 16, kernel_size=3, padding=1)
        self.bn6 = nn.BatchNorm2d(16)
        self.dropout6 = nn.Dropout2d(p=dropout_rate)

        # ---- head: global average pool + linear classifier ----
        self.gap = nn.AdaptiveAvgPool2d(output_size=1)  # 7x7 -> 1x1
        self.fc = nn.Linear(in_features=16, out_features=10)

    @staticmethod
    def _conv_block(x, conv, bn, drop=None):
        """Apply conv -> batch norm -> ReLU, then dropout when one is given."""
        activated = F.relu(bn(conv(x)))
        return activated if drop is None else drop(activated)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10)."""
        # 28x28 stage
        x = self._conv_block(x, self.conv1, self.bn1, self.dropout1)
        x = self._conv_block(x, self.conv2, self.bn2, self.dropout2)
        x = self.pool1(self._conv_block(x, self.transition1, self.bn_trans1))

        # 14x14 stage
        x = self._conv_block(x, self.conv3, self.bn3, self.dropout3)
        x = self._conv_block(x, self.conv4, self.bn4, self.dropout4)
        x = self.pool2(self._conv_block(x, self.transition2, self.bn_trans2))

        # 7x7 stage
        x = self._conv_block(x, self.conv5, self.bn5, self.dropout5)
        x = self._conv_block(x, self.conv6, self.bn6, self.dropout6)

        # Collapse each channel to one value, then classify
        pooled = self.gap(x).flatten(1)
        return F.log_softmax(self.fc(pooled), dim=1)


# Model 2: Improved Architecture (Winner)
class OptimizedNetV2(nn.Module):
    """
    Variant of the compact CNN with a 10/16/20/28/24 channel progression.

    Layout (spatial sizes are for a 28x28 single-channel input):
    - 28x28: conv 1->10, conv 10->16, 1x1 transition 16->10, 2x2 max-pool
    - 14x14: conv 10->20, conv 20->28, 1x1 transition 28->16, 2x2 max-pool
    - 7x7:   conv 16->24, conv 24->16, global average pool, linear 16->10

    Each 3x3 conv is followed by BatchNorm -> ReLU -> Dropout2d; the 1x1
    transitions use BatchNorm -> ReLU with no dropout.

    Args:
        dropout_rate: Probability given to every Dropout2d layer.
    """

    def __init__(self, dropout_rate=0.05):
        super().__init__()

        # ---- 28x28 stage ----
        self.conv1 = nn.Conv2d(1, 10, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(10)
        self.dropout1 = nn.Dropout2d(p=dropout_rate)

        self.conv2 = nn.Conv2d(10, 16, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(16)
        self.dropout2 = nn.Dropout2d(p=dropout_rate)

        # 1x1 conv squeezes channels before the spatial downsample
        self.transition1 = nn.Conv2d(16, 10, kernel_size=1)
        self.bn_trans1 = nn.BatchNorm2d(10)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)  # 28x28 -> 14x14

        # ---- 14x14 stage ----
        self.conv3 = nn.Conv2d(10, 20, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(20)
        self.dropout3 = nn.Dropout2d(p=dropout_rate)

        self.conv4 = nn.Conv2d(20, 28, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(28)
        self.dropout4 = nn.Dropout2d(p=dropout_rate)

        # 1x1 conv squeezes channels before the spatial downsample
        self.transition2 = nn.Conv2d(28, 16, kernel_size=1)
        self.bn_trans2 = nn.BatchNorm2d(16)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)  # 14x14 -> 7x7

        # ---- 7x7 stage ----
        self.conv5 = nn.Conv2d(16, 24, kernel_size=3, padding=1)
        self.bn5 = nn.BatchNorm2d(24)
        self.dropout5 = nn.Dropout2d(p=dropout_rate)

        self.conv6 = nn.Conv2d(24, 16, kernel_size=3, padding=1)
        self.bn6 = nn.BatchNorm2d(16)
        self.dropout6 = nn.Dropout2d(p=dropout_rate)

        # ---- head: global average pool + linear classifier ----
        self.gap = nn.AdaptiveAvgPool2d(output_size=1)  # 7x7 -> 1x1
        self.fc = nn.Linear(in_features=16, out_features=10)

    @staticmethod
    def _conv_block(x, conv, bn, drop=None):
        """Apply conv -> batch norm -> ReLU, then dropout when one is given."""
        activated = F.relu(bn(conv(x)))
        return activated if drop is None else drop(activated)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10)."""
        # 28x28 stage
        x = self._conv_block(x, self.conv1, self.bn1, self.dropout1)
        x = self._conv_block(x, self.conv2, self.bn2, self.dropout2)
        x = self.pool1(self._conv_block(x, self.transition1, self.bn_trans1))

        # 14x14 stage
        x = self._conv_block(x, self.conv3, self.bn3, self.dropout3)
        x = self._conv_block(x, self.conv4, self.bn4, self.dropout4)
        x = self.pool2(self._conv_block(x, self.transition2, self.bn_trans2))

        # 7x7 stage
        x = self._conv_block(x, self.conv5, self.bn5, self.dropout5)
        x = self._conv_block(x, self.conv6, self.bn6, self.dropout6)

        # Collapse each channel to one value, then classify
        pooled = self.gap(x).flatten(1)
        return F.log_softmax(self.fc(pooled), dim=1)


# Model 3: Balanced Architecture with SGD Optimization
class OptimizedNetV3(nn.Module):
    """
    Variant of the compact CNN with a 10/16/20/32/24 channel progression.

    Layout (spatial sizes are for a 28x28 single-channel input):
    - 28x28: conv 1->10, conv 10->16, 1x1 transition 16->10, 2x2 max-pool
    - 14x14: conv 10->20, conv 20->32, 1x1 transition 32->16, 2x2 max-pool
    - 7x7:   conv 16->24, conv 24->16, global average pool, linear 16->10

    Each 3x3 conv is followed by BatchNorm -> ReLU -> Dropout2d; the 1x1
    transitions use BatchNorm -> ReLU with no dropout. The notebook pairs
    this model with the SGD optimizer.

    Args:
        dropout_rate: Probability given to every Dropout2d layer.
    """

    def __init__(self, dropout_rate=0.02):
        super().__init__()

        # ---- 28x28 stage ----
        self.conv1 = nn.Conv2d(1, 10, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(10)
        self.dropout1 = nn.Dropout2d(p=dropout_rate)

        self.conv2 = nn.Conv2d(10, 16, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(16)
        self.dropout2 = nn.Dropout2d(p=dropout_rate)

        # 1x1 conv squeezes channels before the spatial downsample
        self.transition1 = nn.Conv2d(16, 10, kernel_size=1)
        self.bn_trans1 = nn.BatchNorm2d(10)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)  # 28x28 -> 14x14

        # ---- 14x14 stage ----
        self.conv3 = nn.Conv2d(10, 20, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(20)
        self.dropout3 = nn.Dropout2d(p=dropout_rate)

        self.conv4 = nn.Conv2d(20, 32, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(32)
        self.dropout4 = nn.Dropout2d(p=dropout_rate)

        # 1x1 conv squeezes channels before the spatial downsample
        self.transition2 = nn.Conv2d(32, 16, kernel_size=1)
        self.bn_trans2 = nn.BatchNorm2d(16)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)  # 14x14 -> 7x7

        # ---- 7x7 stage ----
        self.conv5 = nn.Conv2d(16, 24, kernel_size=3, padding=1)
        self.bn5 = nn.BatchNorm2d(24)
        self.dropout5 = nn.Dropout2d(p=dropout_rate)

        self.conv6 = nn.Conv2d(24, 16, kernel_size=3, padding=1)
        self.bn6 = nn.BatchNorm2d(16)
        self.dropout6 = nn.Dropout2d(p=dropout_rate)

        # ---- head: global average pool + linear classifier ----
        self.gap = nn.AdaptiveAvgPool2d(output_size=1)  # 7x7 -> 1x1
        self.fc = nn.Linear(in_features=16, out_features=10)

    @staticmethod
    def _conv_block(x, conv, bn, drop=None):
        """Apply conv -> batch norm -> ReLU, then dropout when one is given."""
        activated = F.relu(bn(conv(x)))
        return activated if drop is None else drop(activated)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10)."""
        # 28x28 stage
        x = self._conv_block(x, self.conv1, self.bn1, self.dropout1)
        x = self._conv_block(x, self.conv2, self.bn2, self.dropout2)
        x = self.pool1(self._conv_block(x, self.transition1, self.bn_trans1))

        # 14x14 stage
        x = self._conv_block(x, self.conv3, self.bn3, self.dropout3)
        x = self._conv_block(x, self.conv4, self.bn4, self.dropout4)
        x = self.pool2(self._conv_block(x, self.transition2, self.bn_trans2))

        # 7x7 stage
        x = self._conv_block(x, self.conv5, self.bn5, self.dropout5)
        x = self._conv_block(x, self.conv6, self.bn6, self.dropout6)

        # Collapse each channel to one value, then classify
        pooled = self.gap(x).flatten(1)
        return F.log_softmax(self.fc(pooled), dim=1)


# Model 4: Model1 Architecture with SGD Optimization
class OptimizedNetV4(nn.Module):
    """
    Baseline (Model1) layer layout with a lower default dropout of 0.02.

    Layout (spatial sizes are for a 28x28 single-channel input):
    - 28x28: conv 1->8, conv 8->16, 1x1 transition 16->8, 2x2 max-pool
    - 14x14: conv 8->16, conv 16->32, 1x1 transition 32->16, 2x2 max-pool
    - 7x7:   conv 16->32, conv 32->16, global average pool, linear 16->10

    Each 3x3 conv is followed by BatchNorm -> ReLU -> Dropout2d; the 1x1
    transitions use BatchNorm -> ReLU with no dropout. The notebook pairs
    this model with the SGD optimizer.

    Args:
        dropout_rate: Probability given to every Dropout2d layer.
    """

    def __init__(self, dropout_rate=0.02):
        super().__init__()

        # ---- 28x28 stage ----
        self.conv1 = nn.Conv2d(1, 8, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(8)
        self.dropout1 = nn.Dropout2d(p=dropout_rate)

        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(16)
        self.dropout2 = nn.Dropout2d(p=dropout_rate)

        # 1x1 conv squeezes channels before the spatial downsample
        self.transition1 = nn.Conv2d(16, 8, kernel_size=1)
        self.bn_trans1 = nn.BatchNorm2d(8)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)  # 28x28 -> 14x14

        # ---- 14x14 stage ----
        self.conv3 = nn.Conv2d(8, 16, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(16)
        self.dropout3 = nn.Dropout2d(p=dropout_rate)

        self.conv4 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(32)
        self.dropout4 = nn.Dropout2d(p=dropout_rate)

        # 1x1 conv squeezes channels before the spatial downsample
        self.transition2 = nn.Conv2d(32, 16, kernel_size=1)
        self.bn_trans2 = nn.BatchNorm2d(16)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)  # 14x14 -> 7x7

        # ---- 7x7 stage ----
        self.conv5 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.bn5 = nn.BatchNorm2d(32)
        self.dropout5 = nn.Dropout2d(p=dropout_rate)

        # Narrow back to 16 channels so the classifier stays small
        self.conv6 = nn.Conv2d(32, 16, kernel_size=3, padding=1)
        self.bn6 = nn.BatchNorm2d(16)
        self.dropout6 = nn.Dropout2d(p=dropout_rate)

        # ---- head: global average pool + linear classifier ----
        self.gap = nn.AdaptiveAvgPool2d(output_size=1)  # 7x7 -> 1x1
        self.fc = nn.Linear(in_features=16, out_features=10)

    @staticmethod
    def _conv_block(x, conv, bn, drop=None):
        """Apply conv -> batch norm -> ReLU, then dropout when one is given."""
        activated = F.relu(bn(conv(x)))
        return activated if drop is None else drop(activated)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10)."""
        # 28x28 stage
        x = self._conv_block(x, self.conv1, self.bn1, self.dropout1)
        x = self._conv_block(x, self.conv2, self.bn2, self.dropout2)
        x = self.pool1(self._conv_block(x, self.transition1, self.bn_trans1))

        # 14x14 stage
        x = self._conv_block(x, self.conv3, self.bn3, self.dropout3)
        x = self._conv_block(x, self.conv4, self.bn4, self.dropout4)
        x = self.pool2(self._conv_block(x, self.transition2, self.bn_trans2))

        # 7x7 stage
        x = self._conv_block(x, self.conv5, self.bn5, self.dropout5)
        x = self._conv_block(x, self.conv6, self.bn6, self.dropout6)

        # Collapse each channel to one value, then classify
        pooled = self.gap(x).flatten(1)
        return F.log_softmax(self.fc(pooled), dim=1)


# Model 5: Model2 Architecture with SGD Optimization
class OptimizedNetV5(nn.Module):
    """
    Model2 layer layout with a lower default dropout of 0.02.

    Layout (spatial sizes are for a 28x28 single-channel input):
    - 28x28: conv 1->10, conv 10->16, 1x1 transition 16->10, 2x2 max-pool
    - 14x14: conv 10->20, conv 20->28, 1x1 transition 28->16, 2x2 max-pool
    - 7x7:   conv 16->24, conv 24->16, global average pool, linear 16->10

    Each 3x3 conv is followed by BatchNorm -> ReLU -> Dropout2d; the 1x1
    transitions use BatchNorm -> ReLU with no dropout. The notebook pairs
    this model with the SGD optimizer.

    Args:
        dropout_rate: Probability given to every Dropout2d layer.
    """

    def __init__(self, dropout_rate=0.02):
        super().__init__()

        # ---- 28x28 stage ----
        self.conv1 = nn.Conv2d(1, 10, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(10)
        self.dropout1 = nn.Dropout2d(p=dropout_rate)

        self.conv2 = nn.Conv2d(10, 16, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(16)
        self.dropout2 = nn.Dropout2d(p=dropout_rate)

        # 1x1 conv squeezes channels before the spatial downsample
        self.transition1 = nn.Conv2d(16, 10, kernel_size=1)
        self.bn_trans1 = nn.BatchNorm2d(10)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)  # 28x28 -> 14x14

        # ---- 14x14 stage ----
        self.conv3 = nn.Conv2d(10, 20, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(20)
        self.dropout3 = nn.Dropout2d(p=dropout_rate)

        self.conv4 = nn.Conv2d(20, 28, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(28)
        self.dropout4 = nn.Dropout2d(p=dropout_rate)

        # 1x1 conv squeezes channels before the spatial downsample
        self.transition2 = nn.Conv2d(28, 16, kernel_size=1)
        self.bn_trans2 = nn.BatchNorm2d(16)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)  # 14x14 -> 7x7

        # ---- 7x7 stage ----
        self.conv5 = nn.Conv2d(16, 24, kernel_size=3, padding=1)
        self.bn5 = nn.BatchNorm2d(24)
        self.dropout5 = nn.Dropout2d(p=dropout_rate)

        self.conv6 = nn.Conv2d(24, 16, kernel_size=3, padding=1)
        self.bn6 = nn.BatchNorm2d(16)
        self.dropout6 = nn.Dropout2d(p=dropout_rate)

        # ---- head: global average pool + linear classifier ----
        self.gap = nn.AdaptiveAvgPool2d(output_size=1)  # 7x7 -> 1x1
        self.fc = nn.Linear(in_features=16, out_features=10)

    @staticmethod
    def _conv_block(x, conv, bn, drop=None):
        """Apply conv -> batch norm -> ReLU, then dropout when one is given."""
        activated = F.relu(bn(conv(x)))
        return activated if drop is None else drop(activated)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10)."""
        # 28x28 stage
        x = self._conv_block(x, self.conv1, self.bn1, self.dropout1)
        x = self._conv_block(x, self.conv2, self.bn2, self.dropout2)
        x = self.pool1(self._conv_block(x, self.transition1, self.bn_trans1))

        # 14x14 stage
        x = self._conv_block(x, self.conv3, self.bn3, self.dropout3)
        x = self._conv_block(x, self.conv4, self.bn4, self.dropout4)
        x = self.pool2(self._conv_block(x, self.transition2, self.bn_trans2))

        # 7x7 stage
        x = self._conv_block(x, self.conv5, self.bn5, self.dropout5)
        x = self._conv_block(x, self.conv6, self.bn6, self.dropout6)

        # Collapse each channel to one value, then classify
        pooled = self.gap(x).flatten(1)
        return F.log_softmax(self.fc(pooled), dim=1)


In [None]:
# Model selection: the name set here drives model construction and, later,
# the optimizer choice.
MODEL_NAME = "Model1"  # Options: "Model1", "Model2", "Model3", "Model4", "Model5"

def get_model(model_name):
    """Build the network registered under ``model_name``.

    Args:
        model_name: One of "Model1" .. "Model5".

    Returns:
        A freshly constructed, untrained model with that variant's dropout rate.

    Raises:
        ValueError: If ``model_name`` is not a known variant.
    """
    # (name, network class, dropout rate) for every selectable variant
    variants = (
        ("Model1", OptimizedNet, 0.1),
        ("Model2", OptimizedNetV2, 0.05),
        ("Model3", OptimizedNetV3, 0.02),
        ("Model4", OptimizedNetV4, 0.02),
        ("Model5", OptimizedNetV5, 0.02),
    )
    for name, net_cls, rate in variants:
        if model_name == name:
            return net_cls(dropout_rate=rate)
    raise ValueError(f"Unknown model: {model_name}")

# Instantiate the selected model on the active device and describe it
model = get_model(MODEL_NAME).to(device)
_rule = "=" * 60
print("\n" + _rule)
print(f"SELECTED MODEL: {MODEL_NAME}")
print(_rule)

# Human-readable notes for each variant, printed line by line
_MODEL_NOTES = {
    "Model1": (
        "Baseline Model - Original OptimizedNet (WORKING)",
        "- Dropout: 0.1",
        "- Channels: 8→16→32→16",
        "- Status: Achieved 99.40% in 16 epochs",
        "- Optimizer: Adam",
    ),
    "Model2": (
        "Improved Architecture - OptimizedNetV2 (WINNER)",
        "- Dropout: 0.05 (reduced for better learning)",
        "- Channels: 10→16→28→16 (controlled capacity)",
        "- Better initial learning dynamics",
        "- Optimizer: Adam",
    ),
    "Model3": (
        "Balanced Architecture - OptimizedNetV3 (SGD Optimized)",
        "- Dropout: 0.02 (minimal for SGD)",
        "- Channels: 10→16→32→16 (balanced)",
        "- Optimized for SGD training",
        "- Optimizer: SGD",
    ),
    "Model4": (
        "Model1 Architecture with SGD Optimization",
        "- Dropout: 0.02 (minimal for SGD)",
        "- Channels: 8→16→32→16 (same as Model1)",
        "- Optimizer: SGD",
        "- Tests SGD with minimal architecture",
    ),
    "Model5": (
        "Model2 Architecture with SGD Optimization (RECOMMENDED)",
        "- Dropout: 0.02 (minimal for SGD)",
        "- Channels: 10→16→28→16 (same as Model2)",
        "- Optimizer: SGD",
        "- Tests SGD with optimal architecture - POTENTIAL ULTIMATE WINNER",
    ),
}
for _note in _MODEL_NOTES.get(MODEL_NAME, ()):
    print(_note)

# Layer-by-layer summary for a single-channel 28x28 input
summary(model, input_size=(1, 28, 28))

# Tally all parameters and the subset that receives gradients
total_params = 0
trainable_params = 0
for _param in model.parameters():
    total_params += _param.numel()
    if _param.requires_grad:
        trainable_params += _param.numel()

_budget_verdict = '✓ PASS' if total_params < 20000 else '✗ FAIL'
print(f"\nTotal Parameters: {total_params:,}")
print(f"Trainable Parameters: {trainable_params:,}")
print(f"Parameter Constraint: {_budget_verdict} (< 20,000)")


In [None]:
# Data loading: MNIST train/test loaders with normalization only
_rule = "=" * 60
print(_rule)
print("DATA LOADING AND AUGMENTATION ANALYSIS")
print(_rule)

# Fix the torch and NumPy seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Loader settings; the worker/pinned-memory options are only passed on CUDA
batch_size = 128
kwargs = {}
if use_cuda:
    kwargs = {'num_workers': 1, 'pin_memory': True}

# No augmentation is used in this project: both pipelines only convert the
# image to a tensor and normalize with the MNIST mean/std constants.
train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Optional: Data augmentation (commented out)
# train_transform_augmented = transforms.Compose([
#     transforms.RandomRotation(degrees=5),
#     transforms.RandomAffine(degrees=0, translate=(0.05, 0.05)),
#     transforms.ToTensor(),
#     transforms.Normalize((0.1307,), (0.3081,))
# ])

# Training split: downloaded on demand, reshuffled every epoch
_train_set = datasets.MNIST('../data', train=True, download=True, transform=train_transform)
train_loader = torch.utils.data.DataLoader(
    _train_set, batch_size=batch_size, shuffle=True, **kwargs)

# Test split: fixed order
_test_set = datasets.MNIST('../data', train=False, transform=test_transform)
test_loader = torch.utils.data.DataLoader(
    _test_set, batch_size=batch_size, shuffle=False, **kwargs)

print("Data Augmentation Status:")
print("❌ NO DATA AUGMENTATION USED")
print("✅ Only basic normalization applied")
print(f"\nBatch Size: {batch_size}")
print(f"Train Samples: {len(train_loader.dataset):,}")
print(f"Test Samples: {len(test_loader.dataset):,}")
print(_rule)


In [None]:
# Training and Testing Functions
def train(model, device, train_loader, optimizer, epoch, scheduler=None):
    """Run one training epoch and report its average loss and accuracy.

    Args:
        model: Network to optimize; switched to training mode here.
        device: Device each batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used only in the progress-bar text.
        scheduler: Optional LR scheduler, stepped once per batch after the
            optimizer.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch losses and the
        percentage of training samples classified correctly.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    progress = tqdm(train_loader)
    for data, target in progress:
        data = data.to(device)
        target = target.to(device)

        # Standard step: clear gradients, forward, backward, update
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()

        if scheduler:
            scheduler.step()

        # Accumulate running statistics for this epoch
        batch_loss = loss.item()
        running_loss += batch_loss
        predicted = output.argmax(dim=1, keepdim=True)
        n_correct += predicted.eq(target.view_as(predicted)).sum().item()
        n_seen += len(data)

        current_lr = optimizer.param_groups[0]["lr"]
        progress.set_description(desc=f'Epoch {epoch}: Loss={batch_loss:.4f}, Acc={100.*n_correct/n_seen:.2f}%, LR={current_lr:.6f}')

    return running_loss / len(train_loader), 100. * n_correct / n_seen

def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` without tracking gradients.

    Args:
        model: Network to evaluate; switched to evaluation mode here.
        device: Device each batch is moved to before the forward pass.
        test_loader: Iterable of ``(data, target)`` batches that also exposes
            a ``dataset`` attribute whose length is the sample count.

    Returns:
        ``(test_loss, accuracy)``: the summed NLL loss divided by the dataset
        size, and the percentage of samples classified correctly.
    """
    model.eval()
    summed_loss = 0
    hits = 0
    n_samples = len(test_loader.dataset)

    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            log_probs = model(data)
            # Sum (not mean) per batch so the final division is per sample
            summed_loss += F.nll_loss(log_probs, target, reduction='sum').item()
            hits += log_probs.argmax(dim=1).eq(target).sum().item()

    return summed_loss / n_samples, 100. * hits / n_samples

def plot_training_history(train_losses, train_accs, test_losses, test_accs):
    """Plot loss and accuracy curves for training and test side by side.

    Each argument is a sequence with one value per epoch. The curves are
    drawn against epoch numbers starting at 1, which matches the 1-based
    epoch counter used by the training loop; the default 0-based index would
    show every point one epoch too early on the "Epoch" axis.

    Args:
        train_losses: Per-epoch training loss.
        train_accs: Per-epoch training accuracy in percent.
        test_losses: Per-epoch test loss.
        test_accs: Per-epoch test accuracy in percent.
    """
    def _epochs(values):
        # 1-based x coordinates, sized to the series actually being plotted
        return range(1, len(values) + 1)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Loss plot
    ax1.plot(_epochs(train_losses), train_losses, label='Train Loss', color='blue')
    ax1.plot(_epochs(test_losses), test_losses, label='Test Loss', color='red')
    ax1.set_title('Training and Test Loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.legend()
    ax1.grid(True)

    # Accuracy plot, with the 99.4% target drawn as a reference line
    ax2.plot(_epochs(train_accs), train_accs, label='Train Accuracy', color='blue')
    ax2.plot(_epochs(test_accs), test_accs, label='Test Accuracy', color='red')
    ax2.axhline(y=99.4, color='green', linestyle='--', label='Target (99.4%)')
    ax2.set_title('Training and Test Accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy (%)')
    ax2.legend()
    ax2.grid(True)

    plt.tight_layout()
    plt.show()

print("Training and testing functions defined successfully!")


In [None]:
# Training Loop with Optimizer Configuration
print("="*60)
print("TRAINING CONFIGURATION")
print("="*60)

# Training parameters. num_epochs is defined before the scheduler so that the
# OneCycleLR schedule length and the training loop can never disagree.
num_epochs = 20
target_accuracy = 99.4

# Optimizer configuration based on model
if MODEL_NAME in ["Model1", "Model2"]:
    # Adam optimizers for original models
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    max_lr = 0.01
    pct_start = 0.3
    opt_name = "Adam"
elif MODEL_NAME in ["Model3", "Model4", "Model5"]:
    # SGD optimizers for SGD-optimized models
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
    max_lr = 0.05
    pct_start = 0.3
    opt_name = "SGD"
else:
    raise ValueError(f"Unknown model: {MODEL_NAME}")

# Learning rate scheduler, stepped once per batch inside train().
# NOTE(review): per the PyTorch docs OneCycleLR derives its starting LR from
# max_lr, so the lr printed below is the scheduler's initial value rather
# than the lr passed to the optimizer constructor — confirm if that matters.
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=max_lr,
    epochs=num_epochs,
    steps_per_epoch=len(train_loader),
    pct_start=pct_start,
    anneal_strategy='cos'
)

print(f"Model: {MODEL_NAME}")
print(f"Optimizer: {opt_name} (lr={optimizer.param_groups[0]['lr']}, weight_decay={optimizer.param_groups[0]['weight_decay']})")
print(f"Scheduler: OneCycleLR (max_lr={max_lr}, pct_start={pct_start})")
print(f"Epochs: {num_epochs}")
print(f"Target Accuracy: {target_accuracy}%")
print(f"Parameter Count: {sum(p.numel() for p in model.parameters()):,}")
print("="*60)

# Training history (one entry per epoch)
train_losses = []
train_accuracies = []
test_losses = []
test_accuracies = []

# Training loop
start_time = time.time()
best_accuracy = 0
best_epoch = 0

for epoch in range(1, num_epochs + 1):
    print(f"\nEpoch {epoch}/{num_epochs}")
    print("-" * 50)

    # Training
    train_loss, train_acc = train(model, device, train_loader, optimizer, epoch, scheduler)

    # Testing
    test_loss, test_acc = test(model, device, test_loader)

    # Store results
    train_losses.append(train_loss)
    train_accuracies.append(train_acc)
    test_losses.append(test_loss)
    test_accuracies.append(test_acc)

    # Checkpoint whenever the test accuracy strictly improves
    is_new_best = test_acc > best_accuracy
    if is_new_best:
        best_accuracy = test_acc
        best_epoch = epoch
        torch.save(model.state_dict(), 'best_model.pth')

    # Print epoch summary
    status = "✓ ACHIEVED" if test_acc >= target_accuracy else "✗ NOT ACHIEVED"
    print(f"Epoch {epoch} Summary:")
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
    print(f"Valid Loss: {test_loss:.4f} | Valid Acc: {test_acc:.2f}%")
    print(f"Target: {target_accuracy}% | Status: {status}")

    # Announce a new best only when one was actually recorded this epoch;
    # reaching the target is reported separately.
    if is_new_best:
        print(f"🎯 New Best Validation Accuracy: {test_acc:.2f}%")
    if test_acc >= target_accuracy:
        print(f"🎉 TARGET ACHIEVED! Validation Accuracy: {test_acc:.2f}% >= {target_accuracy}%")
        print("Target reached, but continuing training for full analysis...")

    print(f"Best Valid Acc so far: {best_accuracy:.2f}% (epoch {best_epoch})")
    print("-" * 50)

# Training completed
end_time = time.time()
training_time = end_time - start_time
print(f"\nTraining completed in {training_time:.2f} seconds")
print(f"Final best accuracy: {best_accuracy:.2f}% at epoch {best_epoch}")

# Plot training history
plot_training_history(train_losses, train_accuracies, test_losses, test_accuracies)


In [None]:
# Final Analysis and Model Comparison
print("="*60)
print("FINAL MODEL ANALYSIS")
print("="*60)

# Load the best checkpoint; map_location lets a checkpoint saved on GPU be
# restored when the current session only has a CPU (and vice versa).
model.load_state_dict(torch.load('best_model.pth', map_location=device))

# Final evaluation on test set only
final_test_loss, final_test_acc = test(model, device, test_loader)

# Get final training metrics from the last epoch (not by running test on train set)
final_train_loss = train_losses[-1]  # Last epoch training loss
final_train_acc = train_accuracies[-1]  # Last epoch training accuracy

print("Final evaluation results:")
print(f"Final Training Metrics: Train Loss: {final_train_loss:.4f} | Train Acc: {final_train_acc:.2f}%")
print(f"Final Validation Metrics: Valid Loss: {final_test_loss:.4f} | Valid Acc: {final_test_acc:.2f}%")

# Train-Val Gap Analysis
# NOTE(review): the training accuracy is from the last epoch while the
# validation accuracy comes from the best checkpoint, so the gap mixes two
# different epochs unless the last epoch was also the best one.
train_val_gap = final_train_acc - final_test_acc
gap_status = "✓ GOOD" if abs(train_val_gap) <= 0.3 else "✗ HIGH"
print(f"\nTrain-Val Gap Analysis:")
print(f"Train-Val Gap: {train_val_gap:+.2f}%")
print(f"Gap Status: {gap_status}")

# Architecture Summary
total_params = sum(p.numel() for p in model.parameters())
# Count only parameters that receive gradients, so the "Trainable" line
# reports what it claims instead of repeating the total.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nArchitecture Summary:")
print(f"• Total Parameters: {total_params:,}")
print(f"• Trainable Parameters: {trainable_params:,}")
print(f"• Parameter Constraint: {'✓ PASS' if total_params < 20000 else '✗ FAIL'} (< 20,000)")
print(f"• Total Epochs Trained: {num_epochs}")
print(f"• Epoch Constraint: {'✓ PASS' if num_epochs <= 20 else '✗ FAIL'} (≤ 20)")
print(f"• Best Validation Accuracy: {best_accuracy:.2f}% (epoch {best_epoch})")
print(f"• Final Validation Accuracy: {final_test_acc:.2f}%")
print(f"• Target Achievement: {'✓ ACHIEVED' if final_test_acc >= target_accuracy else '✗ NOT ACHIEVED'} (≥ {target_accuracy}%)")

# Key Architectural Features
print(f"\nKey Architectural Features:")
print(f"✓ Batch Normalization: After each convolution layer")
print(f"✓ Dropout: Strategic placement with rate {model.dropout1.p if hasattr(model, 'dropout1') else 'varies'}")
print(f"✓ Global Average Pooling: Replaces fully connected layers")
print(f"✓ 1x1 Convolutions: Used in transition layers")
print(f"✓ Proper Layer Ordering: Conv -> BN -> ReLU -> Dropout")
print(f"✓ Transition Layers: Dimensionality reduction before pooling")

print("="*60)


In [None]:
# Model Comparison and Quick Testing
print("="*60)
print("MODEL COMPARISON AND QUICK TESTING")
print("="*60)

# Build every variant once (on CPU, untrained) just to count its parameters
models_info = {}
for model_name in ["Model1", "Model2", "Model3", "Model4", "Model5"]:
    test_model = get_model(model_name)
    params = sum(p.numel() for p in test_model.parameters())
    models_info[model_name] = {
        'params': params,
        'under_20k': params < 20000
    }

print("Parameter Comparison:")
for model_name, info in models_info.items():
    status = "✓ PASS" if info['under_20k'] else "✗ FAIL"
    print(f"  {model_name}: {info['params']:,} parameters ({status})")

# Descriptions take their parameter counts from models_info instead of
# hard-coded numbers, which had drifted from the defined layers (Model3 was
# listed as 18,440 although its layers add up to 17,276).
model_descriptions = {
    "Model1": ("Baseline", "Adam - WORKING"),
    "Model2": ("Improved Architecture", "Adam - WINNER"),
    "Model3": ("SGD Optimized", "SGD"),
    "Model4": ("Model1 + SGD", "SGD - Tests SGD with minimal arch"),
    "Model5": ("Model2 + SGD", "SGD - POTENTIAL ULTIMATE WINNER"),
}
print(f"\nModel Descriptions:")
for model_name, (label, notes) in model_descriptions.items():
    print(f"  {model_name}: {label} ({models_info[model_name]['params']:,} params) - {notes}")

print(f"\nTo test a different model, change MODEL_NAME in cell 1:")
print(f"  MODEL_NAME = 'Model1'  # Baseline with Adam")
print(f"  MODEL_NAME = 'Model2'  # Current winner with Adam")
print(f"  MODEL_NAME = 'Model3'  # SGD with balanced architecture")
print(f"  MODEL_NAME = 'Model4'  # SGD with minimal architecture")
print(f"  MODEL_NAME = 'Model5'  # SGD with optimal architecture - RECOMMENDED")

print(f"\nCurrent Model: {MODEL_NAME}")
print("="*60)
