# Lab 1.5.3 Solution: Regularization Experiments

This notebook contains solutions to the exercises from Lab 1.5.3.

---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(42)
%matplotlib inline

## Exercise 1 Solution: Grid Search for Optimal Regularization

In [None]:
# Load data
import gzip, os, urllib.request

def load_mnist(path='../data'):
    """Download (if needed) and parse the four MNIST IDX archives.

    Args:
        path: Directory used as the download cache; created when missing.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``. Images are float32
        arrays of shape ``(n, rows * cols)`` scaled to ``[0, 1]``; labels are
        uint8 vectors.

    Raises:
        OSError: If an archive is missing locally and no mirror could serve it
            (the error from the last mirror tried is re-raised).
        ValueError: If an archive does not start with the expected IDX magic
            number or its payload does not match the sizes in its header.
    """
    # Mirrors are tried in order; the original host is kept as a fallback.
    base_urls = ['https://ossci-datasets.s3.amazonaws.com/mnist/',
                 'http://yann.lecun.com/exdb/mnist/']
    files = ['train-images-idx3-ubyte.gz', 'train-labels-idx1-ubyte.gz',
             't10k-images-idx3-ubyte.gz', 't10k-labels-idx1-ubyte.gz']
    os.makedirs(path, exist_ok=True)

    def download(name, dest):
        """Fetch one archive into ``dest``, trying each mirror in turn."""
        # Download to a side file and rename on success, so an interrupted
        # transfer is never mistaken for a complete cached archive.
        partial = dest + '.part'
        last_error = None
        for base_url in base_urls:
            try:
                urllib.request.urlretrieve(base_url + name, partial)
            except OSError as err:  # URLError / HTTPError are OSError subclasses
                last_error = err
                if os.path.exists(partial):
                    os.remove(partial)
            else:
                os.replace(partial, dest)
                return
        raise last_error

    def read_idx(fp, magic, n_dims):
        """Return ``(dims, flat uint8 payload)`` after validating the header."""
        header_len = 4 + 4 * n_dims
        with gzip.open(fp, 'rb') as f:
            header = f.read(header_len)
            payload = f.read()
        if len(header) != header_len or int.from_bytes(header[:4], 'big') != magic:
            raise ValueError(f"{fp}: not a valid IDX file (expected magic number {magic})")
        # Header layout: 4-byte magic followed by one big-endian int32 per dimension.
        dims = [int.from_bytes(header[k:k + 4], 'big') for k in range(4, header_len, 4)]
        return dims, np.frombuffer(payload, np.uint8)

    def load_img(fp):
        (n, rows, cols), pixels = read_idx(fp, 2051, 3)
        # reshape raises ValueError when the payload is truncated
        return pixels.reshape(n, rows * cols).astype(np.float32) / 255

    def load_lbl(fp):
        (n,), labels = read_idx(fp, 2049, 1)
        if labels.size != n:
            raise ValueError(f"{fp}: header announces {n} labels, found {labels.size}")
        return labels

    paths = [os.path.join(path, name) for name in files]
    for name, fp in zip(files, paths):
        if not os.path.exists(fp):
            download(name, fp)

    return (load_img(paths[0]), load_lbl(paths[1]),
            load_img(paths[2]), load_lbl(paths[3]))

X_train_full, y_train_full, X_test, y_test = load_mnist()

# Keep only the first 1000 samples so the network has room to overfit
X_train, y_train = X_train_full[:1000], y_train_full[:1000]

In [None]:
class RegularizedMLP:
    """Fully connected ReLU network with a softmax output, L2 penalty and dropout.

    Each entry of ``self.layers`` is a dict holding the weight matrix ``'W'``,
    the bias ``'b'``, a ``'cache'`` of forward-pass activations and the
    dropout ``'mask'`` used in the last training-mode forward pass.
    """

    def __init__(self, layer_sizes, l2_lambda=0.0, dropout_rate=0.0):
        """
        Args:
            layer_sizes: Unit counts from input to output, e.g. [784, 512, 10].
            l2_lambda: Coefficient of the L2 penalty on all weight matrices.
            dropout_rate: Probability of zeroing a hidden activation while
                ``self.training`` is True.

        Raises:
            ValueError: If ``dropout_rate`` is outside ``[0, 1)``; a rate of 1
                would make the inverted-dropout rescale divide by zero.
        """
        if not 0.0 <= dropout_rate < 1.0:
            raise ValueError(f"dropout_rate must be in [0, 1), got {dropout_rate}")

        self.l2_lambda = l2_lambda
        self.dropout_rate = dropout_rate
        self.training = True
        self.layers = []

        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            # He-style scaling: standard normal draws times sqrt(2 / fan_in)
            W = np.random.randn(fan_in, fan_out) * np.sqrt(2.0 / fan_in)
            b = np.zeros(fan_out)
            self.layers.append({'W': W, 'b': b, 'cache': {}, 'mask': None})

    def forward(self, X):
        """Run the network on ``X`` and return the softmax probabilities.

        Caches each layer's input (and hidden pre-activation) for ``backward``
        and stores the result in ``self.probs``.
        """
        use_dropout = self.training and self.dropout_rate > 0
        out = X
        for layer in self.layers[:-1]:
            layer['cache']['X'] = out
            out = out @ layer['W'] + layer['b']
            layer['cache']['Z'] = out
            out = np.maximum(0, out)  # ReLU

            if use_dropout:
                # Inverted dropout: rescale the kept units so that no
                # rescaling is needed at inference time.
                mask = (np.random.rand(*out.shape) > self.dropout_rate).astype(float)
                out = out * mask / (1 - self.dropout_rate)
                layer['mask'] = mask
            else:
                # Drop any mask left from an earlier training pass so that
                # backward never applies a mask this pass did not use.
                layer['mask'] = None

        self.layers[-1]['cache']['X'] = out
        out = out @ self.layers[-1]['W'] + self.layers[-1]['b']
        # Subtract the row max before exponentiating for numerical stability
        out_shifted = out - np.max(out, axis=1, keepdims=True)
        exp_out = np.exp(out_shifted)
        self.probs = exp_out / np.sum(exp_out, axis=1, keepdims=True)
        return self.probs

    def compute_loss(self, targets):
        """Mean cross-entropy of the last forward pass plus the L2 penalty.

        Args:
            targets: Integer class labels, one per row of the last ``forward`` input.
        """
        batch_size = len(targets)
        # 1e-10 guards against log(0)
        ce_loss = -np.mean(np.log(self.probs[np.arange(batch_size), targets] + 1e-10))
        l2_loss = 0.5 * self.l2_lambda * sum(np.sum(l['W']**2) for l in self.layers) if self.l2_lambda > 0 else 0
        return ce_loss + l2_loss

    def backward(self, targets, lr):
        """Backpropagate the loss of the last forward pass and take one SGD step.

        Args:
            targets: Integer class labels for the batch passed to ``forward``.
            lr: Learning rate applied to every weight and bias.
        """
        batch_size = len(targets)
        # Gradient of softmax + cross-entropy w.r.t. the logits: probs - one_hot
        grad = self.probs.copy()
        grad[np.arange(batch_size), targets] -= 1

        for i in range(len(self.layers) - 1, -1, -1):
            layer = self.layers[i]
            X = layer['cache']['X']

            dW = X.T @ grad / batch_size
            if self.l2_lambda > 0:
                dW += self.l2_lambda * layer['W']
            db = np.mean(grad, axis=0)

            grad_input = None
            if i > 0:
                # Propagate through the weights used in the forward pass,
                # i.e. BEFORE this layer's update is applied.
                grad_input = grad @ layer['W'].T
                prev = self.layers[i - 1]
                if prev['mask'] is not None:
                    grad_input = grad_input * prev['mask'] / (1 - self.dropout_rate)
                grad_input = grad_input * (prev['cache']['Z'] > 0)  # ReLU derivative

            layer['W'] -= lr * dW
            layer['b'] -= lr * db

            if grad_input is not None:
                grad = grad_input

    def predict(self, X):
        """Return the arg-max class for each row of ``X`` with dropout disabled.

        Note: ``self.training`` is set back to True on exit regardless of its
        value before the call.
        """
        self.training = False
        probs = self.forward(X)
        self.training = True
        return np.argmax(probs, axis=1)

In [None]:
# Grid search over L2 strength x dropout rate
print("Grid Search for Optimal Regularization")
print("=" * 70)

l2_values = [0.0, 0.0001, 0.001, 0.01]
dropout_values = [0.0, 0.1, 0.2, 0.3]

# Training / evaluation budget shared by every configuration
GRID_EPOCHS = 30
GRID_BATCH_SIZE = 32
GRID_LR = 0.1
GRID_N_EVAL = 2000  # number of test samples scored per configuration

results = {}

for l2 in l2_values:
    for dropout in dropout_values:
        # Re-seed so every configuration starts from the same initial weights
        # and sees the same shuffling order.
        np.random.seed(42)
        model = RegularizedMLP([784, 512, 256, 10], l2_lambda=l2, dropout_rate=dropout)

        # Train
        for epoch in range(GRID_EPOCHS):
            indices = np.random.permutation(len(X_train))
            for start in range(0, len(X_train), GRID_BATCH_SIZE):
                batch_idx = indices[start:start + GRID_BATCH_SIZE]
                model.forward(X_train[batch_idx])
                model.backward(y_train[batch_idx], GRID_LR)

        # Evaluate
        train_acc = np.mean(model.predict(X_train) == y_train)
        test_acc = np.mean(model.predict(X_test[:GRID_N_EVAL]) == y_test[:GRID_N_EVAL])
        gap = train_acc - test_acc

        results[(l2, dropout)] = {'train': train_acc, 'test': test_acc, 'gap': gap}

        print(f"L2={l2:.4f}, Dropout={dropout:.1f} | Train: {train_acc:.2%} | Test: {test_acc:.2%} | Gap: {gap:.2%}")

# Pick the winner from the recorded results; ties go to the first configuration
# evaluated, and best_config can never be left as None.
# NOTE(review): the winner is chosen by test accuracy, so the "best test
# accuracy" below is an optimistic estimate; use a held-out validation split
# for unbiased model selection.
best_config = max(results, key=lambda cfg: results[cfg]['test'])
best_acc = results[best_config]['test']

print("=" * 70)
print(f"\n✅ Best configuration: L2={best_config[0]}, Dropout={best_config[1]}")
print(f"   Best test accuracy: {best_acc:.2%}")

In [None]:
# Render the grid-search results as three annotated heatmaps
# (rows: L2 strength, columns: dropout rate)
metrics = ['train', 'test', 'gap']
titles = ['Training Accuracy', 'Test Accuracy', 'Generalization Gap']
cmaps = ['Blues', 'Greens', 'Reds']

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
n_rows, n_cols = len(l2_values), len(dropout_values)

for ax, metric, title, cmap in zip(axes, metrics, titles, cmaps):
    # Collect this metric into a (len(l2_values), len(dropout_values)) matrix
    grid = np.array(
        [[results[(l2_val, drop_val)][metric] for drop_val in dropout_values]
         for l2_val in l2_values],
        dtype=float,
    ).reshape(n_rows, n_cols)

    im = ax.imshow(grid, cmap=cmap, aspect='auto')
    ax.set_title(title)
    ax.set_xlabel('Dropout Rate')
    ax.set_ylabel('L2 Lambda')
    ax.set_xticks(range(n_cols))
    ax.set_yticks(range(n_rows))
    ax.set_xticklabels([f'{rate:.1f}' for rate in dropout_values])
    ax.set_yticklabels([f'{lam:.4f}' for lam in l2_values])

    # Print each cell's value on top of its colour patch
    for (row, col), value in np.ndenumerate(grid):
        ax.text(col, row, f'{value:.1%}', ha='center', va='center', fontsize=8)

    fig.colorbar(im, ax=ax)

plt.tight_layout()
plt.show()

## Exercise 2 Solution: Early Stopping Implementation

In [None]:
class EarlyStopping:
    """
    Patience-based early stopping on a validation loss.

    Remembers the lowest loss seen so far together with a snapshot of the
    weights that produced it, and signals a stop once the loss has failed to
    improve for ``patience`` consecutive checks.
    """

    def __init__(self, patience: int = 5, min_delta: float = 0.0):
        """
        Args:
            patience: How many non-improving checks to tolerate before stopping
            min_delta: Amount by which the loss must drop below the best value
                to count as an improvement
        """
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float('inf')
        self.counter = 0
        self.best_weights = None

    def __call__(self, val_loss, model_weights):
        """
        Record one validation result.

        Args:
            val_loss: Validation loss of the current epoch
            model_weights: Iterable of layer dicts with 'W' and 'b' arrays

        Returns:
            True if training should stop, False otherwise
        """
        improved = val_loss < self.best_loss - self.min_delta
        if not improved:
            self.counter += 1
            return True if self.counter >= self.patience else False

        # New best: reset the patience counter and snapshot the weights.
        self.best_loss = val_loss
        self.counter = 0
        self.best_weights = [
            (layer['W'].copy(), layer['b'].copy()) for layer in model_weights
        ]
        return False

    def restore_best_weights(self, model_weights):
        """Write the snapshotted best weights back into ``model_weights``.

        Does nothing if no snapshot has been taken yet.
        """
        if self.best_weights is None:
            return
        for layer, (best_W, best_b) in zip(model_weights, self.best_weights):
            # Hand out copies so later in-place updates leave the snapshot intact
            layer['W'] = best_W.copy()
            layer['b'] = best_b.copy()


def train_with_early_stopping(model, X_train, y_train, X_val, y_val,
                               max_epochs=100, patience=5, batch_size=32, lr=0.1,
                               min_delta=0.0):
    """
    Train ``model`` with mini-batch SGD and patience-based early stopping.

    The model always ends up holding the weights of the epoch with the best
    validation loss, whether training stopped early or ran for ``max_epochs``.

    Args:
        model: Object exposing ``forward``, ``backward``, ``compute_loss``,
            ``predict``, a ``training`` flag and a ``layers`` list of dicts
            with 'W' and 'b' arrays.
        X_train, y_train: Training inputs and integer labels.
        X_val, y_val: Validation inputs and integer labels.
        max_epochs: Upper bound on the number of epochs.
        patience: Non-improving epochs tolerated before stopping.
        batch_size: Mini-batch size.
        lr: Learning rate passed to ``model.backward``.
        min_delta: Minimum drop in validation loss that counts as improvement.

    Returns:
        history: Dict with per-epoch 'train_loss', 'val_loss', 'train_acc', 'val_acc'
        stopped_epoch: 1-based epoch at which early stopping fired, or
            ``max_epochs`` if it never fired

    Raises:
        ValueError: If the training set is empty.
    """
    if len(X_train) == 0:
        raise ValueError("X_train must contain at least one sample")

    early_stopping = EarlyStopping(patience=patience, min_delta=min_delta)
    history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}
    stopped_epoch = max_epochs

    for epoch in range(max_epochs):
        # Training
        model.training = True
        indices = np.random.permutation(len(X_train))
        train_loss = 0
        n_batches = 0

        for start in range(0, len(X_train), batch_size):
            batch_idx = indices[start:start+batch_size]
            model.forward(X_train[batch_idx])
            # The loss must be read before backward changes the weights
            train_loss += model.compute_loss(y_train[batch_idx])
            model.backward(y_train[batch_idx], lr)
            n_batches += 1

        train_loss /= n_batches

        # Validation: one eval-mode pass yields both the loss and the accuracy
        model.training = False
        val_probs = model.forward(X_val)
        val_loss = model.compute_loss(y_val)
        val_acc = np.mean(np.argmax(val_probs, axis=1) == y_val)

        train_acc = np.mean(model.predict(X_train) == y_train)

        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['train_acc'].append(train_acc)
        history['val_acc'].append(val_acc)

        print(f"Epoch {epoch+1:3d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | "
              f"Train Acc: {train_acc:.2%} | Val Acc: {val_acc:.2%}")

        # Check early stopping
        if early_stopping(val_loss, model.layers):
            print(f"\n⚡ Early stopping triggered at epoch {epoch+1}!")
            print(f"   Best validation loss: {early_stopping.best_loss:.4f}")
            stopped_epoch = epoch + 1
            break

    # Restore the best snapshot on both exit paths: running out of epochs must
    # not leave the model with worse final-epoch weights.
    early_stopping.restore_best_weights(model.layers)
    return history, stopped_epoch

In [None]:
# Exercise the early-stopping loop on an unregularized model
print("Training with Early Stopping")
print("=" * 70)

# Hold out everything after the first 800 training samples for validation
X_train_split, X_val_split = X_train[:800], X_train[800:]
y_train_split, y_val_split = y_train[:800], y_train[800:]

# Seed right before building the model so initialization and shuffling are repeatable
np.random.seed(42)
model = RegularizedMLP(
    [784, 512, 256, 10],
    l2_lambda=0.0,     # no weight penalty ...
    dropout_rate=0.0,  # ... and no dropout, so overfitting is visible
)

history, stopped_epoch = train_with_early_stopping(
    model,
    X_train_split, y_train_split,
    X_val_split, y_val_split,
    max_epochs=50, patience=5, batch_size=32, lr=0.1,
)

print(f"\n✅ Training stopped at epoch {stopped_epoch}")
print(f"   Final test accuracy: {np.mean(model.predict(X_test) == y_test):.2%}")

In [None]:
# Plot early stopping effect
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

epochs = range(1, len(history['train_loss']) + 1)

# Locate the best checkpoint from the recorded validation curve rather than
# assuming "stopped_epoch - patience": that shortcut is wrong when patience is
# not 5 or when training ran to max_epochs without triggering early stopping.
# argmin returns the first minimum, i.e. the first epoch to reach the lowest
# validation loss.
best_epoch = int(np.argmin(history['val_loss'])) + 1 if len(history['val_loss']) else None

# (history-key suffix, legend suffix, y-axis label, panel title)
panels = [
    ('loss', 'Loss', 'Loss', 'Loss Curves with Early Stopping'),
    ('acc', 'Acc', 'Accuracy', 'Accuracy Curves with Early Stopping'),
]

for ax, (key, short, ylabel, title) in zip(axes, panels):
    ax.plot(epochs, history[f'train_{key}'], 'b-', label=f'Train {short}', linewidth=2)
    ax.plot(epochs, history[f'val_{key}'], 'r-', label=f'Val {short}', linewidth=2)
    if best_epoch is not None:
        ax.axvline(best_epoch, color='green', linestyle='--', label='Best Model')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

---

## Key Takeaways

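1. **Grid search** helps find good hyperparameters systematically. Pick the winner on a held-out validation split: selecting by test accuracy (as the grid above does) makes the reported test score optimistic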
2. **L2 + Dropout together** often work better than either alone
3. **Early stopping** is a simple but effective regularization technique
4. Always **save the best model** during training, not just the final one