# RNN Regularization: Preventing Overfitting

In this notebook, we'll explore four powerful regularization techniques:
1. **Dropout** - Randomly disable neurons
2. **Layer Normalization** - Stabilize training
3. **Weight Decay** - Penalize large weights
4. **Early Stopping** - Stop before overfitting

Let's see them in action!

## Setup

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from implementation import (
    dropout_forward, dropout_backward,
    layer_norm_forward, layer_norm_backward,
    RegularizedLSTM, EarlyStoppingMonitor,
    compute_l2_penalty, regularized_loss
)
from visualization import (
    plot_learning_curves,
    plot_regularization_comparison,
    plot_weight_distributions
)
from train_minimal import generate_simple_dataset

# Seed NumPy's legacy global RNG so the random demos below repeat on a fresh kernel.
np.random.seed(42)
# The check mark had been stored as mojibake; print the intended U+2713 character.
print("✓ All modules loaded!")

## Part 1: Understanding Overfitting

### The Problem

A model can memorize training data without learning generalizable patterns.

In [None]:
# Illustrative, hand-crafted loss curves (nothing is trained in this cell).
epochs = range(30)
epoch_axis = np.array(epochs)

# Unregularized run: training loss keeps shrinking while validation loss
# gets an extra upward drift from epoch 15 onward.
train_loss_overfit = 2.0 / (1 + epoch_axis * 0.1) + np.random.normal(0, 0.05, len(epochs))
val_loss_overfit = 2.0 / (1 + epoch_axis * 0.05) + np.random.normal(0, 0.08, len(epochs))
val_loss_overfit[15:] += epoch_axis[15:] * 0.02

# Regularized run: both curves decay at nearly the same rate.
train_loss_regular = 2.0 / (1 + epoch_axis * 0.08) + np.random.normal(0, 0.04, len(epochs))
val_loss_regular = 2.0 / (1 + epoch_axis * 0.07) + np.random.normal(0, 0.05, len(epochs))

overfit_fig, (ax_overfit, ax_regular) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: the gap between the curves is shaded from the point it opens up.
ax_overfit.plot(epochs, train_loss_overfit, 'b-o', label='Training Loss', linewidth=2)
ax_overfit.plot(epochs, val_loss_overfit, 'r-s', label='Validation Loss', linewidth=2)
ax_overfit.axvline(x=15, color='green', linestyle='--', alpha=0.7, label='Overfitting starts')
ax_overfit.fill_between(epochs[15:], train_loss_overfit[15:], val_loss_overfit[15:],
                        alpha=0.2, color='red')
ax_overfit.set_title('WITHOUT Regularization\n(Overfitting Gap Grows)')

# Right panel: the whole (small) gap is shaded.
ax_regular.plot(epochs, train_loss_regular, 'b-o', label='Training Loss', linewidth=2)
ax_regular.plot(epochs, val_loss_regular, 'r-s', label='Validation Loss', linewidth=2)
ax_regular.fill_between(epochs, train_loss_regular, val_loss_regular, alpha=0.2, color='green')
ax_regular.set_title('WITH Regularization\n(Controlled, Stable)')

# Decorations shared by both panels.
for panel in (ax_overfit, ax_regular):
    panel.set_xlabel('Epoch')
    panel.set_ylabel('Loss')
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

for caption in (
    "Left: Without regularization - BIG GAP = overfitting",
    "Right: With regularization - SMALL GAP = good generalization",
):
    print(caption)

## Part 2: Dropout - Random Deactivation

In [None]:
# Dropout on a small vector: compare training-mode and inference-mode outputs.
activations = np.random.randn(10)

print("Original activations:", np.round(activations, 2), sep="\n")

# Training mode: dropout_forward returns the output together with the mask it used.
output_train, mask = dropout_forward(activations, keep_prob=0.8, training=True)
print("\nWith dropout (keep_prob=0.8):")
print(output_train.round(2))
print("Mask: {}".format(mask.round(2)))
print("Notice: Some values are 0, others are scaled up")

# Inference mode: same call with training=False; the second return value is ignored.
output_test, _ = dropout_forward(activations, keep_prob=0.8, training=False)
print("\nDuring testing (training=False):")
print(output_test.round(2))
print("All values pass through unchanged!")

In [None]:
# Visualize dropout on a grid of random activations (rows are plotted as
# time steps, columns as hidden units).
activations_matrix = np.random.randn(20, 50)

# Apply dropout row by row, so every row gets its own call to dropout_forward.
dropped_matrix = np.zeros_like(activations_matrix)
for i in range(activations_matrix.shape[0]):
    dropped_matrix[i], _ = dropout_forward(activations_matrix[i], keep_prob=0.7, training=True)

# Mask the dropped cells so they can be drawn in a colour that is not on the
# colormap. On 'RdYlGn' a plain 0.0 renders as mid-scale yellow, which made the
# "Black areas" message below untrue.
# Assumes dropped units come back as exactly 0.0 — confirm against dropout_forward.
dropped_view = np.ma.masked_equal(dropped_matrix, 0.0)

plt.figure(figsize=(14, 5))

# Without dropout
plt.subplot(1, 2, 1)
plt.imshow(activations_matrix, cmap='RdYlGn', aspect='auto')
plt.colorbar(label='Activation value')
plt.title('WITHOUT Dropout\n(All neurons active)', fontsize=12, fontweight='bold')
plt.xlabel('Hidden units')
plt.ylabel('Time steps')

# With dropout: masked cells are left unpainted by imshow (the colormap's
# default "bad" colour is transparent), so the black axes background shows through.
ax_dropped = plt.subplot(1, 2, 2)
ax_dropped.set_facecolor('black')
plt.imshow(dropped_view, cmap='RdYlGn', aspect='auto')
plt.colorbar(label='Activation value')
plt.title('WITH Dropout (keep_prob=0.7)\n(30% randomly deactivated)', fontsize=12, fontweight='bold')
plt.xlabel('Hidden units')
plt.ylabel('Time steps')

plt.tight_layout()
plt.show()

print("Black areas = deactivated neurons")
print("Dropout forces network to learn from different subsets of neurons")
print("This prevents co-adaptation and improves generalization")

## Part 3: Layer Normalization - Stabilizing Training

In [None]:
# Compare summary statistics of deliberately wide-ranging random activations
# before and after layer_norm_forward.
batch_size = 32
hidden_size = 64

# Standard-normal draws scaled by 10, giving a large spread.
activations_raw = np.random.randn(batch_size, hidden_size) * 10

# Mean/Std are computed per row (axis=1) and then averaged over the rows;
# Min/Max are taken over the whole array.
print("Raw activations statistics:")
print(f"  Mean: {activations_raw.mean(axis=1).mean():.4f}")
print(f"  Std: {activations_raw.std(axis=1).mean():.4f}")
print(f"  Min: {activations_raw.min():.4f}")
print(f"  Max: {activations_raw.max():.4f}")

# Identity scale and zero shift, so any change in the statistics comes from
# the normalization step itself.
gamma = np.ones((hidden_size,))
beta = np.zeros((hidden_size,))

# NOTE(review): presumably normalizes each row across its hidden units —
# confirm against layer_norm_forward. `cache` is not used in this cell.
activations_norm, cache = layer_norm_forward(activations_raw, gamma, beta)

print("\nAfter layer normalization:")
print(f"  Mean: {activations_norm.mean(axis=1).mean():.4f}")
print(f"  Std: {activations_norm.std(axis=1).mean():.4f}")
print(f"  Min: {activations_norm.min():.4f}")
print(f"  Max: {activations_norm.max():.4f}")
# The check mark and "approximately equal" signs had been stored as mojibake.
print("\n✓ Much more stable! Mean ≈ 0, Std ≈ 1")

In [None]:
# Side-by-side histograms of the activations before and after layer normalization.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

histogram_panels = [
    (axes[0], activations_raw, 'blue', 'Raw Activations\n(Unstable: large range)'),
    (axes[1], activations_norm, 'green', 'After Layer Norm\n(Stable: centered at 0, std=1)'),
]
for ax, values, bar_color, panel_title in histogram_panels:
    ax.hist(values.flatten(), bins=50, alpha=0.7, color=bar_color, edgecolor='black')
    ax.set_title(panel_title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Activation value')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)
    # Dashed reference line at zero.
    ax.axvline(x=0, color='red', linestyle='--', linewidth=2, label='Zero')
    ax.legend()

# Only the normalized panel gets a fixed x-range.
axes[1].set_xlim([-4, 4])

plt.tight_layout()
plt.show()

print("Layer Normalization benefits:")
for benefit in (
    "  1. Prevents gradient explosion (gradients stay in reasonable range)",
    "  2. Speeds up convergence (stable gradient flow)",
    "  3. Allows higher learning rates",
    "  4. Reduces dependence on weight initialization",
):
    print(benefit)

## Part 4: Weight Decay - Pulling Weights to Zero

In [None]:
# Toy simulation: 1000 weights receive 50 rounds of random "gradient" noise,
# once without and once with a multiplicative pull toward zero.
np.random.seed(42)
initial_weights = np.random.randn(1000) * 0.01

# No weight decay: the noisy updates simply accumulate.
weights_no_decay = initial_weights.copy()
for epoch in range(50):
    weights_no_decay = weights_no_decay + np.random.randn(1000) * 0.05

# With weight decay: after each noisy update, subtract a fraction of the weight itself.
weights_with_decay = initial_weights.copy()
weight_decay_coeff = 0.01
for epoch in range(50):
    noisy_step = np.random.randn(1000) * 0.05
    weights_with_decay = weights_with_decay + noisy_step
    weights_with_decay = weights_with_decay - weight_decay_coeff * weights_with_decay

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

histogram_specs = [
    (initial_weights, 'blue', 'Initial Weights\n(Small random)'),
    (weights_no_decay, 'orange', 'After Training\n(NO Weight Decay - Large spread)'),
    (weights_with_decay, 'green', 'After Training\n(WITH Weight Decay - Concentrated near 0)'),
]
for ax, (weight_snapshot, bar_color, panel_title) in zip(axes, histogram_specs):
    ax.hist(weight_snapshot, bins=50, alpha=0.7, color=bar_color, edgecolor='black')
    ax.set_title(panel_title, fontsize=11, fontweight='bold')
    ax.set_xlabel('Weight value')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)
    # Dashed reference line at zero.
    ax.axvline(x=0, color='red', linestyle='--', linewidth=2)

plt.tight_layout()
plt.show()

print("Weight Decay Statistics:")
for row_label, weight_snapshot in (
    ("Without decay:", weights_no_decay),
    ("With decay:", weights_with_decay),
):
    # Labels are left-padded to 15 characters so the two rows line up.
    print(f"{row_label:<15}mean={weight_snapshot.mean():.4f}, std={weight_snapshot.std():.4f}")
print("\nWeight decay encourages simple models (Occam's Razor)")

In [None]:
# Print the L2 weight-decay formula and evaluate it on a small random example.
# The Greek letters and the superscript two had been stored as mojibake; they
# are restored here so the formula is readable.
print("Weight Decay (L2 Regularization) Formula:")
print()
print("  Loss_total = Loss_model + λ * (1/2) * Σ(w²)")
print()
print("Where:")
print("  Loss_model = cross-entropy or other model loss")
print("  λ (lambda) = weight decay coefficient (e.g., 0.0001)")
print("  Σ(w²) = sum of squared weights")
print()
print("Typical values:")
print("  Light:   λ = 0.00001 (very gentle)")
print("  Normal:  λ = 0.0001  (standard)")
print("  Strong:  λ = 0.001   (heavy regularization)")

# Worked example: two random weight vectors (10 and 20 entries) and a fixed model loss.
print("\nExample:")
model_loss = 0.5
weights_example = [np.random.randn(10), np.random.randn(20)]
weight_decay = 0.0001

# NOTE(review): whether compute_l2_penalty applies the 1/2 factor shown in the
# formula above is not visible from this notebook — confirm in implementation.py.
l2_penalty = compute_l2_penalty(weights_example, weight_decay)
total_loss = regularized_loss(model_loss, weights_example, weight_decay)

print(f"  Model loss: {model_loss:.4f}")
print(f"  L2 penalty: {l2_penalty:.6f}")
print(f"  Total loss: {total_loss:.4f}")

## Part 5: Early Stopping - Know When to Stop

In [None]:
# Hand-written (epoch, train_loss, val_loss) records: training loss keeps
# falling while validation loss bottoms out at epoch 3 and then rises.
epochs_data = [
    (0, 2.50, 2.51),
    (1, 2.00, 2.10),
    (2, 1.50, 1.80),
    (3, 1.00, 1.70),
    (4, 0.70, 1.75),  # Validation stops improving
    (5, 0.50, 1.85),
    (6, 0.35, 2.00),
    (7, 0.25, 2.15),
]

epochs, train_losses, val_losses = zip(*epochs_data)

patience = 3
monitor = EarlyStoppingMonitor(patience=patience, verbose=False)

# Replay the validation curve through the monitor instead of hard-coding the
# stop epoch, so the plot always agrees with the monitor's own patience rule.
# check() is treated as returning False when training should stop, matching
# how the training loop later in this notebook uses it.
stop_epoch = epochs[-1]  # fallback if the monitor never asks to stop
for epoch, val_loss in zip(epochs, val_losses):
    if not monitor.check(val_loss, epoch):
        stop_epoch = epoch
        break

# Epoch with the lowest validation loss.
best_index = int(np.argmin(val_losses))
best_epoch = epochs[best_index]
best_val_loss = val_losses[best_index]

plt.figure(figsize=(10, 6))
plt.plot(epochs, train_losses, 'b-o', linewidth=2.5, markersize=8, label='Training Loss')
plt.plot(epochs, val_losses, 'r-s', linewidth=2.5, markersize=8, label='Validation Loss')

# Mark best epoch
plt.scatter([best_epoch], [best_val_loss], color='green', s=300, marker='*',
            zorder=5, label=f'Best (epoch {best_epoch})')

# Mark early stopping
plt.axvline(x=stop_epoch, color='orange', linestyle='--', linewidth=2.5,
            label=f'Early Stop (patience={patience}, epoch {stop_epoch})')

# Shade regions
plt.axvspan(-0.5, best_epoch + 0.5, alpha=0.1, color='green', label='Improving')
plt.axvspan(best_epoch + 0.5, stop_epoch + 0.5, alpha=0.1, color='red', label='No improvement')

plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Early Stopping: Stop Before Overfitting', fontsize=14, fontweight='bold')
plt.legend(fontsize=11, loc='best')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("Early Stopping Rules:")
print(f"1. Best validation loss: epoch {best_epoch} (loss={best_val_loss:.2f})")
print(f"2. Validation stopped improving after epoch {best_epoch}")
print(f"3. With patience={patience}, we wait {patience} epochs with no improvement")
print(f"4. Stop at epoch {stop_epoch} to save best model state")
# The old message hard-coded 7 and called the post-best epochs "saved"; they
# are really the price paid for patience, so report them as such.
print(f"\nBenefit: only {stop_epoch - best_epoch} extra epochs past the best one, "
      f"and we keep the best model! 🎯")

## Part 6: Complete Example - Training with All Techniques

In [None]:
print("Creating regularized LSTM model...")

# Build the model with dropout (keep_prob=0.8) and layer normalization enabled.
# Input vocabulary and output size are both 50.
model = RegularizedLSTM(
    vocab_size=50,
    hidden_size=32,
    output_size=50,
    dropout_keep_prob=0.8,
    use_layer_norm=True
)

# Summary of the regularization settings. Weight decay and patience are only
# announced here; they are configured where the training loop is set up.
print(f"Model created: {model.parameter_count():,} parameters")
print("  - Dropout: keep_prob=0.8")
print("  - Layer Normalization: enabled")
# The lambda had been stored as mojibake; print the intended Greek letter.
print("  - Weight Decay: will use λ=0.0001")
print("  - Early Stopping: patience=5")

In [None]:
print("Generating training data...")

# Sequence settings shared by the training and validation splits.
dataset_shape = {'seq_length': 15, 'vocab_size': 50}

# 200 training samples and 50 validation samples from the same generator.
train_data, train_targets = generate_simple_dataset(**dataset_shape, num_samples=200)
val_data, val_targets = generate_simple_dataset(**dataset_shape, num_samples=50)

print("Training samples: {}".format(len(train_data)))
print("Validation samples: {}".format(len(val_data)))
print("Sequence length: 15")
print("Vocabulary size: 50")

In [None]:
print("Training with all regularization techniques...\n")

# Configuration
config_dict = {
    'dropout_keep_prob': 0.8,
    'weight_decay': 0.0001,
    'patience': 5
}

early_stop = EarlyStoppingMonitor(patience=config_dict['patience'], verbose=False)

# Per-epoch average losses, consumed by the plotting cell that follows.
train_losses = []
val_losses = []

print(f"{'Epoch':<6} {'Train Loss':<12} {'Val Loss':<12} {'Status':<15}")
print("-" * 50)

# NOTE(review): only model.forward is called below — no backward pass or
# parameter update is visible in this cell. Unless forward() updates the
# weights internally, the model is not actually being trained; confirm
# against RegularizedLSTM.
for epoch in range(15):
    # Training pass. Hidden/cell state starts from zeros every epoch, mirroring
    # the validation pass, so each epoch's loss is measured from the same start.
    h_state = np.zeros((model.hidden_size, 1))
    c_state = np.zeros((model.hidden_size, 1))
    train_loss_total = 0
    for x, y in zip(train_data, train_targets):
        loss, h_state, c_state = model.forward(x, y, h_state, c_state, training=True)
        # Rebuilt on every step so it reflects the model's current weight attributes.
        weights = [model.Wf, model.Wi, model.Wc, model.Wo, model.Why]
        train_loss_total += regularized_loss(loss, weights, config_dict['weight_decay'])

    train_loss_avg = train_loss_total / len(train_data)
    train_losses.append(train_loss_avg)

    # Validation pass (training=False), also starting from a zero state.
    val_loss_total = 0
    h_val = np.zeros((model.hidden_size, 1))
    c_val = np.zeros((model.hidden_size, 1))
    for x, y in zip(val_data, val_targets):
        loss, h_val, c_val = model.forward(x, y, h_val, c_val, training=False)
        weights = [model.Wf, model.Wi, model.Wc, model.Wo, model.Why]
        val_loss_total += regularized_loss(loss, weights, config_dict['weight_decay'])

    val_loss_avg = val_loss_total / len(val_data)
    val_losses.append(val_loss_avg)

    # Capture the best loss BEFORE check() runs. Comparing against
    # early_stop.best_loss afterwards (as the old code did) reported every
    # epoch as "No improve": once check() records a new best, best_loss equals
    # the current loss and the strict `<` is never true.
    previous_best = early_stop.best_loss
    should_continue = early_stop.check(val_loss_avg, epoch)

    # `is None` covers a monitor that has no best loss recorded yet.
    improved = previous_best is None or val_loss_avg < previous_best
    status = "✓ IMPROVED" if improved else "✗ No improve"

    print(f"{epoch:<6} {train_loss_avg:<12.4f} {val_loss_avg:<12.4f} {status:<15}")

    if not should_continue:
        print(f"\nEarly stopping at epoch {epoch}!")
        break

print("\n✓ Training complete!")
print(f"  Best validation loss: {early_stop.best_loss:.4f} (epoch {early_stop.best_epoch})")
print(f"  Final train loss: {train_losses[-1]:.4f}")
print(f"  Final val loss: {val_losses[-1]:.4f}")

In [None]:
# Draw the per-epoch training and validation losses collected by the training loop.
curve_title = 'Learning Curves with All Regularization Techniques'
plot_learning_curves(train_losses, val_losses, title=curve_title)
plt.show()

# Talking points printed under the figure.
print("\nKey observations:")
for observation in (
    "1. Training loss decreases steadily",
    "2. Validation loss decreases and then stabilizes",
    "3. Gap between train and val stays small (no overfitting!)",
    "4. Early stopping prevents wasted training",
):
    print(observation)

## Summary: When to Use Each Technique

| Technique | When to use | Typical value | Cost |
|-----------|-----------|---------------|------|
| **Dropout** | Large networks, high variance | keep_prob=0.8 | Medium (slower training) |
| **Layer Norm** | Deep RNNs, unstable training | Always on | Low (tiny computation) |
| **Weight Decay** | All models, overfitting | λ=0.0001 | Negligible (tiny overhead) |
| **Early Stopping** | All models, prevent waste | patience=5 | Negative! (saves time) |

**Pro Tip:** Use all four together for best results! 🚀

In [None]:
# Closing recap of the topics covered. The emoji and check marks had been
# stored as mojibake and are restored to the intended characters.
print("🎓 You've learned:")
print("  ✓ What overfitting is and why it happens")
print("  ✓ How dropout prevents co-adaptation")
print("  ✓ How layer norm stabilizes training")
print("  ✓ How weight decay encourages simplicity")
print("  ✓ How early stopping prevents memorization")
print("\n📚 Next: Check out the exercises to practice!")