# Lab 1.5.5 Solutions: MNIST Classification

This notebook contains solutions to the exercises from notebook 05.

---

In [None]:
import numpy as np
import sys
sys.path.insert(0, '..')

from micrograd_plus import (
    Tensor, Linear, ReLU, Sigmoid, Tanh, Dropout, Sequential,
    MSELoss, CrossEntropyLoss, SGD, Adam
)
from micrograd_plus.utils import set_seed, DataLoader

set_seed(42)

## Exercise 1 Solution: Improved Architecture

Design an improved MLP that achieves >97% accuracy.

In [None]:
class ImprovedMLP:
    """
    Deeper MLP classifier for flattened MNIST digits.

    Layout: input_dim -> 512 -> 256 -> 128 -> num_classes.
    Every hidden Linear layer is followed by ReLU and Dropout; the final
    Linear layer has no activation after it, so the model returns raw
    outputs of that layer.

    Args:
        input_dim: Number of input features (784 for a 28x28 image).
        num_classes: Width of the output layer.
        dropout_rate: Value passed to Dropout in the first two hidden
            blocks; the third block uses dropout_rate / 2.
    """

    def __init__(self, input_dim=784, num_classes=10, dropout_rate=0.3):
        # Layer widths from input to last hidden layer, and the Dropout
        # argument used after each hidden layer (halved near the output).
        widths = (input_dim, 512, 256, 128)
        rates = (dropout_rate, dropout_rate, dropout_rate / 2)

        # Build the modules in input-to-output order so that layer
        # construction happens in the same sequence as a literal listing.
        stack = []
        for fan_in, fan_out, rate in zip(widths, widths[1:], rates):
            stack.append(Linear(fan_in, fan_out))
            stack.append(ReLU())
            stack.append(Dropout(rate))

        # Output projection: 128 -> num_classes
        stack.append(Linear(widths[-1], num_classes))

        self.layers = Sequential(*stack)

    def __call__(self, x):
        """Run a forward pass of x through the layer stack."""
        net = self.layers
        return net(x)

    def parameters(self):
        """Return the parameters exposed by the underlying Sequential."""
        net = self.layers
        return net.parameters()

    def train(self):
        """Put the layer stack in training mode; returns self for chaining."""
        self.layers.train()
        return self

    def eval(self):
        """Put the layer stack in evaluation mode; returns self for chaining."""
        self.layers.eval()
        return self

In [None]:
# Instantiate the default model and total up the element count of every
# parameter array it exposes.
model = ImprovedMLP()
param_sizes = [p.data.size for p in model.parameters()]
total_params = sum(param_sizes)

print(f"Improved MLP Parameters: {total_params:,}")
print(
    "\nArchitecture:",
    "  784 -> 512 -> 256 -> 128 -> 10",
    "  With Dropout and ReLU activations",
    sep="\n",
)

---

## Exercise 2 Solution: Data Augmentation

Implement simple data augmentation for MNIST.

In [None]:
class MNISTAugmenter:
    """
    Randomised augmentations for flattened 28x28 images.

    Transforms:
    1. Random translation (circular shift of up to max_shift pixels)
    2. Additive Gaussian noise, clipped to [0, 1]
    3. Random brightness scaling, clipped to [0, 1]

    All randomness comes from NumPy's global random state (np.random.*).
    """

    def __init__(self, max_shift=2, noise_std=0.1, scale_range=(0.9, 1.1)):
        """
        Args:
            max_shift: Largest shift, in pixels, applied along each axis.
            noise_std: Standard deviation of the added Gaussian noise.
            scale_range: (low, high) bounds for the brightness factor.
        """
        self.scale_range = scale_range
        self.noise_std = noise_std
        self.max_shift = max_shift

    def random_shift(self, image):
        """Translate the image by a random offset and return it flattened."""
        # View the flat vector as a 28x28 grid
        grid = image.reshape(28, 28)

        # Draw the horizontal offset first, then the vertical one
        limit = self.max_shift
        dx = np.random.randint(-limit, limit + 1)
        dy = np.random.randint(-limit, limit + 1)

        # Circular shift on both axes at once: pixels pushed past one edge
        # re-enter on the opposite edge.
        rolled = np.roll(grid, (dy, dx), axis=(0, 1))

        return rolled.flatten()

    def add_noise(self, image):
        """Add zero-mean Gaussian noise and clip the result to [0, 1]."""
        perturbation = self.noise_std * np.random.randn(*image.shape)
        return np.clip(image + perturbation, 0, 1)

    def random_scale(self, image):
        """Multiply by a random brightness factor and clip to [0, 1]."""
        factor = np.random.uniform(*self.scale_range)
        return np.clip(image * factor, 0, 1)

    def augment(self, image, probability=0.5):
        """
        Apply each transform independently with the given probability.

        The input is copied first, so the caller's array is never modified.
        Returns a float32 array.
        """
        result = image.copy()

        # Fixed order: shift, then noise, then scaling. One uniform draw
        # decides each transform immediately before it would run.
        for transform in (self.random_shift, self.add_noise, self.random_scale):
            if np.random.random() < probability:
                result = transform(result)

        return result.astype(np.float32)

    def augment_batch(self, batch, probability=0.5):
        """Augment every image in batch and return them as one array."""
        augmented = []
        for sample in batch:
            augmented.append(self.augment(sample, probability))
        return np.array(augmented)

In [None]:
# Demonstrate augmentation
import matplotlib.pyplot as plt

# Build a synthetic stand-in for a digit: a grey rectangle crossed by three
# bright horizontal bars (top, middle, bottom), then flatten it to 784 values.
canvas = np.zeros((28, 28), dtype=np.float32)
canvas[5:23, 8:20] = 0.5
for row_start, row_stop in ((5, 10), (12, 16), (20, 23)):
    canvas[row_start:row_stop, 8:20] = 1.0
sample = canvas.flatten()

augmenter = MNISTAugmenter(max_shift=3, noise_std=0.15)

# Panel 0 shows the original; panels 1-9 show independent augmentations.
images = [sample]
titles = ['Original']
for i in range(1, 10):
    images.append(augmenter.augment(sample, probability=0.8))
    titles.append(f'Augmented {i}')

fig, axes = plt.subplots(2, 5, figsize=(12, 5))
# ravel() walks the 2x5 grid row by row, matching (i // 5, i % 5) indexing.
for panel, picture, title in zip(axes.ravel(), images, titles):
    panel.imshow(picture.reshape(28, 28), cmap='gray')
    panel.set_title(title)
    panel.axis('off')

plt.tight_layout()
plt.show()

---

## Exercise 3 Solution: Learning Rate Scheduling

Implement and use learning rate scheduling for better convergence.

In [None]:
class CosineAnnealingWarmRestarts:
    """
    Cosine annealing with warm restarts.

    Within a cycle of length T_i the learning rate follows

        lr = eta_min + 0.5 * (eta_max - eta_min) * (1 + cos(pi * T_cur / T_i))

    where T_cur counts the steps taken since the last restart. When T_cur
    reaches T_i the schedule restarts at eta_max and the cycle length is
    multiplied by T_mult.

    eta_max is the optimizer's learning rate at construction time.
    """

    def __init__(self, optimizer, T_0, T_mult=1, eta_min=0):
        """
        Args:
            optimizer: Object with a readable and writable ``lr`` attribute.
            T_0: Number of steps in the first cycle; must be positive.
            T_mult: Factor applied to the cycle length after each restart;
                must be at least 1.
            eta_min: Minimum learning rate.

        Raises:
            ValueError: If T_0 is not positive or T_mult is below 1.
        """
        # A non-positive cycle length makes the cosine argument divide by
        # zero (or run backwards); reject it here rather than inside step().
        if T_0 <= 0:
            raise ValueError(f"T_0 must be positive, got {T_0!r}")
        # T_mult < 1 shrinks the cycle towards zero: T_mult == 0 divides by
        # zero on the next step, and fractional values collapse the schedule
        # to a constant eta_max.
        if T_mult < 1:
            raise ValueError(f"T_mult must be >= 1, got {T_mult!r}")

        self.optimizer = optimizer
        self.T_0 = T_0
        self.T_mult = T_mult
        self.eta_min = eta_min
        self.eta_max = optimizer.lr

        self.T_cur = 0      # steps taken inside the current cycle
        self.T_i = T_0      # length of the current cycle
        self.cycle = 0      # number of restarts so far

    def step(self):
        """
        Write the learning rate for the current position into the optimizer,
        then advance the position by one step.

        Returns:
            The learning rate that was just written to ``optimizer.lr``.
        """
        # Learning rate for the current position in the cycle
        lr = self.eta_min + 0.5 * (self.eta_max - self.eta_min) * (
            1 + np.cos(np.pi * self.T_cur / self.T_i)
        )

        self.optimizer.lr = lr

        # Advance; the first call therefore yields eta_max (T_cur == 0)
        self.T_cur += 1

        # Warm restart: reset the position and stretch the next cycle
        if self.T_cur >= self.T_i:
            self.T_cur = 0
            self.T_i = self.T_i * self.T_mult
            self.cycle += 1

        return lr

    def get_lr(self):
        """Return the optimizer's current learning rate."""
        return self.optimizer.lr

In [None]:
# Plot 100 steps of the warm-restart schedule.
# The scheduler only reads and writes `optimizer.lr`, so an Adam optimizer
# over a single dummy parameter is enough to drive it.
x = Tensor([0.0], requires_grad=True)
opt = Adam([x], lr=0.01)
scheduler = CosineAnnealingWarmRestarts(opt, T_0=10, T_mult=2)

lrs = [scheduler.step() for _ in range(100)]

plt.figure(figsize=(12, 4))
plt.plot(lrs, 'b-', linewidth=2)
plt.title('Cosine Annealing with Warm Restarts (T_0=10, T_mult=2)')
plt.xlabel('Epoch')
plt.ylabel('Learning Rate')
plt.grid(True, alpha=0.3)

# Dashed lines at the restart epochs. Cycle lengths are 10, 20, 40, so the
# cumulative restart positions are 10, 30 and 70.
restarts = [10, 30, 70]
for restart_epoch in restarts:
    plt.axvline(x=restart_epoch, color='r', linestyle='--', alpha=0.5)

plt.show()

---

## Exercise 4 Solution: Early Stopping

Implement early stopping to prevent overfitting.

In [None]:
class EarlyStopping:
    """
    Early stopping to terminate training when a monitored score stops improving.

    Features:
    - Patience: number of consecutive non-improving calls tolerated
    - Min delta: minimum change that counts as an improvement
    - Restore best weights: optionally put the best parameters back on stop
    """

    def __init__(self, patience=10, min_delta=0.0, restore_best=True, mode='min'):
        """
        Args:
            patience: Non-improving calls to wait before stopping
            min_delta: Minimum change to qualify as improvement
            restore_best: Whether to snapshot and later restore best weights
            mode: 'min' if lower scores are better (loss),
                  'max' if higher scores are better (accuracy)

        Raises:
            ValueError: If mode is neither 'min' nor 'max'.
        """
        # Any other value used to fall through to the 'max' branch silently,
        # which inverts the comparison for a mistyped 'min'.
        if mode not in ('min', 'max'):
            raise ValueError(f"mode must be 'min' or 'max', got {mode!r}")

        self.patience = patience
        self.min_delta = min_delta
        self.restore_best = restore_best
        self.mode = mode

        self.best_score = None
        self.best_weights = None
        self.counter = 0
        self.early_stop = False
        self.best_epoch = 0

    def _is_improvement(self, score):
        """Return True if score beats the best score by more than min_delta."""
        # The very first score is always accepted as the baseline.
        if self.best_score is None:
            return True

        if self.mode == 'min':
            return score < self.best_score - self.min_delta
        return score > self.best_score + self.min_delta

    def __call__(self, score, model, epoch):
        """
        Update early stopping state with the latest score.

        Args:
            score: Monitored value for this epoch.
            model: Object whose ``parameters()`` yields items with a ``data``
                array supporting ``copy()``.
            epoch: Epoch identifier recorded as ``best_epoch`` on improvement.

        Returns:
            True once patience has been exhausted, otherwise False.
        """
        if self._is_improvement(score):
            self.best_score = score
            self.counter = 0
            self.best_epoch = epoch

            if self.restore_best:
                # Snapshot copies so later training cannot alter them
                self.best_weights = [
                    p.data.copy() for p in model.parameters()
                ]
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

                if self.restore_best and self.best_weights is not None:
                    # Hand the model copies of the snapshot: assigning the
                    # saved arrays directly would alias them, and any later
                    # in-place update of p.data would corrupt best_weights.
                    for p, best_w in zip(model.parameters(), self.best_weights):
                        p.data = best_w.copy()
                    print(f"Restored best weights from epoch {self.best_epoch}")

        return self.early_stop

    def reset(self):
        """Reset early stopping state so the instance can be reused."""
        self.best_score = None
        self.best_weights = None
        self.counter = 0
        self.early_stop = False
        self.best_epoch = 0

In [None]:
# Walk EarlyStopping through a hand-written loss curve and print its state
# after every epoch.
print("Early Stopping Demo")
print("=" * 50)

# Simulated validation losses: steady improvement, then a slow drift upward
val_losses = [
    1.0, 0.8, 0.7, 0.65, 0.63, 0.62, 0.61,
    0.615, 0.62, 0.625, 0.63, 0.635, 0.64,
]

# A tiny model is enough: only its parameters are snapshotted and restored
model = Sequential(Linear(10, 5))
early_stopping = EarlyStopping(patience=3, min_delta=0.01)

for epoch, loss in enumerate(val_losses):
    if early_stopping(loss, model, epoch):
        print(f"Epoch {epoch}: loss={loss:.4f}, best={early_stopping.best_score:.4f}, STOP")
        break
    print(
        f"Epoch {epoch}: loss={loss:.4f}, best={early_stopping.best_score:.4f}, "
        f"counter={early_stopping.counter}"
    )

print(f"\nBest epoch: {early_stopping.best_epoch}")
print(f"Best loss:  {early_stopping.best_score:.4f}")

---

## Challenge Solution: Complete Training Pipeline

Combine all improvements into a production-ready training pipeline.

In [None]:
class TrainingPipeline:
    """
    Complete training pipeline with all improvements.

    Features:
    - Data augmentation (training batches only)
    - Learning rate scheduling (one scheduler step per epoch)
    - Early stopping on validation loss
    - Progress logging
    - Per-epoch metric history
    """

    def __init__(self, model, loss_fn, optimizer, scheduler=None,
                 augmenter=None, early_stopping=None):
        """
        Args:
            model: Callable model exposing ``train()`` and ``eval()``.
            loss_fn: Callable ``loss_fn(logits, targets)`` returning a value
                with ``backward()`` and ``item()``.
            optimizer: Object with ``zero_grad()``, ``step()`` and ``lr``.
            scheduler: Optional object with ``step()``, called once per epoch.
            augmenter: Optional object with ``augment_batch(batch)``.
            early_stopping: Optional callable ``(score, model, epoch) -> bool``.
        """
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.augmenter = augmenter
        self.early_stopping = early_stopping

        # One entry is appended to each list per completed epoch.
        self.history = {
            'train_loss': [],
            'val_loss': [],
            'train_acc': [],
            'val_acc': [],
            'lr': []
        }

    def _run_epoch(self, loader, training):
        """
        Make one pass over ``loader`` and return (mean loss, accuracy).

        When ``training`` is true, batches are augmented (if an augmenter is
        configured) and the optimizer is stepped after every batch; otherwise
        only the forward pass and metrics are computed.

        Raises:
            ValueError: If the loader yields no samples.
        """
        total_loss = 0.0
        correct = 0
        total = 0

        for X_batch, y_batch in loader:
            # Augmentation is a training-time transform only
            if training and self.augmenter is not None:
                X_batch = self.augmenter.augment_batch(X_batch)

            if training:
                X = Tensor(X_batch, requires_grad=True)
            else:
                X = Tensor(X_batch)
            y = Tensor(y_batch)

            # Forward pass
            logits = self.model(X)
            loss = self.loss_fn(logits, y)

            # Backward pass and parameter update
            if training:
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            # Weight the batch loss by its size so the epoch average is a
            # per-sample mean even when the last batch is smaller.
            batch_size = len(y_batch)
            total_loss += loss.item() * batch_size
            predictions = np.argmax(logits.data, axis=1)
            correct += np.sum(predictions == y_batch)
            total += batch_size

        if total == 0:
            # Without this guard the averages below fail with a bare
            # ZeroDivisionError that gives no hint about the cause.
            raise ValueError("data loader yielded no samples")

        return total_loss / total, correct / total

    def train_epoch(self, train_loader):
        """Train for one epoch; returns (mean loss, accuracy)."""
        self.model.train()
        return self._run_epoch(train_loader, training=True)

    def evaluate(self, val_loader):
        """Evaluate on validation set; returns (mean loss, accuracy)."""
        self.model.eval()
        return self._run_epoch(val_loader, training=False)

    def fit(self, train_loader, val_loader, epochs, verbose=True):
        """
        Run full training.

        Each epoch trains, evaluates, steps the scheduler, records metrics
        and consults early stopping (with the validation loss).

        Args:
            train_loader: Iterable of (X_batch, y_batch) training batches.
            val_loader: Iterable of (X_batch, y_batch) validation batches.
            epochs: Maximum number of epochs to run.
            verbose: Print a metrics line after every epoch.

        Returns:
            The history dict; entries accumulate across repeated calls.
        """
        for epoch in range(epochs):
            # Train
            train_loss, train_acc = self.train_epoch(train_loader)

            # Validate
            val_loss, val_acc = self.evaluate(val_loader)

            # Read the learning rate before stepping the scheduler so the
            # logged value is the one this epoch actually trained with.
            current_lr = self.optimizer.lr
            if self.scheduler is not None:
                self.scheduler.step()

            # Record history
            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)
            self.history['train_acc'].append(train_acc)
            self.history['val_acc'].append(val_acc)
            self.history['lr'].append(current_lr)

            if verbose:
                print(f"Epoch {epoch+1}/{epochs}: "
                      f"train_loss={train_loss:.4f}, train_acc={train_acc:.4f}, "
                      f"val_loss={val_loss:.4f}, val_acc={val_acc:.4f}, "
                      f"lr={current_lr:.6f}")

            # Early stopping
            if self.early_stopping is not None:
                if self.early_stopping(val_loss, self.model, epoch):
                    print(f"Early stopping at epoch {epoch+1}")
                    break

        return self.history

In [None]:
# End-to-end smoke run of the pipeline on synthetic data.
# Inputs are standard-normal noise and labels are drawn at random, so this
# exercises the plumbing rather than producing a meaningful accuracy.
# NOTE: MNISTAugmenter clips noisy/scaled images to [0, 1], so augmented
# batches of this zero-mean data lose their negative values.
print("Training Pipeline Demo (Synthetic Data)")
print("=" * 50)

# Synthetic dataset (draw order matters for reproducibility under the seed)
np.random.seed(42)
n_samples = 1000
n_val, n_features, n_classes = 200, 784, 10
X_train = np.random.randn(n_samples, n_features).astype(np.float32)
y_train = np.random.randint(0, n_classes, n_samples)
X_val = np.random.randn(n_val, n_features).astype(np.float32)
y_val = np.random.randint(0, n_classes, n_val)

# Batch iterators: shuffle the training set only
train_loader = DataLoader(X_train, y_train, batch_size=32, shuffle=True)
val_loader = DataLoader(X_val, y_val, batch_size=32, shuffle=False)

# Model and training components
model = ImprovedMLP(dropout_rate=0.2)
loss_fn = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2)
early_stopping = EarlyStopping(patience=5, min_delta=0.001)
augmenter = MNISTAugmenter(max_shift=2, noise_std=0.1)

# Assemble the pipeline and train
components = {
    'model': model,
    'loss_fn': loss_fn,
    'optimizer': optimizer,
    'scheduler': scheduler,
    'augmenter': augmenter,
    'early_stopping': early_stopping,
}
pipeline = TrainingPipeline(**components)

history = pipeline.fit(train_loader, val_loader, epochs=20, verbose=True)

---

## Key Takeaways

1. **Deeper Networks**: More layers with proper regularization (dropout) often improve performance

2. **Data Augmentation**: Simple transforms (random shifts, Gaussian noise, brightness scaling) increase the effective dataset size

3. **Learning Rate Scheduling**: Warm restarts help escape local minima

4. **Early Stopping**: Prevents overfitting by monitoring validation loss

5. **Complete Pipeline**: Combining all techniques creates robust training