# Lab 1.3.3 Solutions: Loss Landscape Visualization

This notebook contains solutions for the exercises in the Loss Landscape Visualization lab.

---

> **📝 Note:** This solution notebook is designed to be self-contained and includes all necessary
> helper functions. However, if you prefer to use your own implementations from the main notebook,
> ensure those cells have been run first.
>
> You can also import production-ready implementations from the scripts:
> ```python
> from scripts.math_utils import sigmoid, relu, Adam, SGD
> from scripts.visualization_utils import plot_loss_landscape, plot_training_curve
> ```

In [None]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

# Seed both NumPy's and PyTorch's global RNGs so the generated data and the
# network initializations below are reproducible from run to run.
np.random.seed(42)
torch.manual_seed(42)

# Banner printed at the top of the notebook output.
print("Loss Landscape Solutions")
print("=" * 50)

## Setup: Recreate the Dataset and Network

First, we'll recreate the two moons dataset and the simple network from the main notebook.

In [None]:
# Create two moons dataset
def create_moons_dataset(n_samples=200, noise=0.1):
    """Create a two-moons classification dataset.

    Two half-circles of ``n_samples // 2`` points each are generated, one per
    class, and Gaussian noise drawn from NumPy's global RNG is added.

    Args:
        n_samples: Requested number of points. The classes are kept balanced,
            so an odd value yields ``n_samples - 1`` points.
        noise: Standard deviation of the Gaussian noise added to every
            coordinate.

    Returns:
        X: Array of shape ``(2 * (n_samples // 2), 2)`` with the coordinates.
        y: Integer label array (0 for the first moon, 1 for the second).
    """
    n_samples_per_class = n_samples // 2

    # Both moons sweep the same half-circle of angles.
    theta = np.linspace(0, np.pi, n_samples_per_class)
    X1 = np.column_stack([np.cos(theta), np.sin(theta)])
    X2 = np.column_stack([1 - np.cos(theta), 1 - np.sin(theta) - 0.5])

    X = np.vstack([X1, X2])
    # Size the noise from the data actually generated: with an odd n_samples
    # the stacked array has n_samples - 1 rows, and a (n_samples, 2) noise
    # array would not broadcast against it.
    X = X + np.random.randn(*X.shape) * noise
    y = np.array([0] * n_samples_per_class + [1] * n_samples_per_class)

    return X, y

# Build the dataset once (re-seeding first so the noise is reproducible) and
# keep both the NumPy arrays and float32 tensor versions around.
np.random.seed(42)
X_np, y_np = create_moons_dataset(n_samples=200, noise=0.15)

X_data = torch.FloatTensor(X_np)
# Labels become a column vector so they line up with the network's (N, 1) output.
y_data = torch.FloatTensor(y_np)[:, None]

print(f"Dataset: {X_np.shape[0]} samples")

In [None]:
# Define the network
class SimpleNet(nn.Module):
    """Small two-layer network used for loss landscape visualization.

    Maps 2 input features through one hidden ReLU layer of ``hidden_size``
    units to a single sigmoid output.
    """

    def __init__(self, hidden_size=4):
        super().__init__()
        self.fc1 = nn.Linear(in_features=2, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return the sigmoid output of the network for input ``x``."""
        hidden = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))

# Helper functions
def get_params_as_vector(model):
    """Return every parameter of ``model`` concatenated into one 1-D tensor.

    Parameters are taken in ``model.parameters()`` order and read through
    ``.data``, so the result carries no autograd history.
    """
    flat_chunks = []
    for param in model.parameters():
        flat_chunks.append(param.data.view(-1))
    return torch.cat(flat_chunks)

def set_params_from_vector(model, params_vector):
    """Set model parameters from a flattened vector.

    The vector is consumed in ``model.parameters()`` order, i.e. the layout
    produced by ``get_params_as_vector``. Values are *copied* into the existing
    parameter tensors, so the model keeps its own storage and dtype and is not
    affected by later changes to ``params_vector``.

    Args:
        model: Module whose parameters are overwritten in place.
        params_vector: 1-D tensor holding at least as many elements as the
            model has parameters.
    """
    idx = 0
    for p in model.parameters():
        numel = p.numel()
        chunk = params_vector[idx:idx + numel].view(p.shape)
        # Copy rather than rebind p.data: rebinding would make the parameter a
        # view into params_vector (aliasing) and adopt the vector's dtype.
        p.data.copy_(chunk.detach())
        idx += numel

def compute_loss(model, X, y, criterion):
    """Evaluate ``criterion(model(X), y)`` with autograd disabled.

    Returns:
        The loss as a plain Python float.
    """
    with torch.no_grad():
        loss_value = criterion(model(X), y)
    return loss_value.item()

print("Network and helper functions defined!")

---

## Exercise Solution: Compare Optimizers on the Same Landscape

### 🧒 ELI5: What We're Doing

> **Imagine three hikers trying to reach the bottom of a valley...**
>
> - **SGD Hiker:** Takes small careful steps, directly downhill
> - **Momentum Hiker:** Builds up speed as they go, like a ball rolling downhill
> - **Adam Hiker:** Uses a smart compass that adapts based on terrain
>
> We'll start all three at the SAME spot and watch their paths!

### The Task

1. Train 3 models with SGD, SGD+Momentum, and Adam
2. Record their parameter trajectories
3. Project to 2D using PCA
4. Visualize on the same loss landscape

In [None]:
# Step 1: Define optimizers to compare
# Each entry is (display name, optimizer class, keyword arguments for it).
optimizers_to_compare = [
    ('SGD', torch.optim.SGD, dict(lr=0.5)),
    ('SGD + Momentum', torch.optim.SGD, dict(lr=0.5, momentum=0.9)),
    ('Adam', torch.optim.Adam, dict(lr=0.05)),
]

print("Optimizers to compare:")
for name, _, kwargs in optimizers_to_compare:
    print(f"  - {name}: {kwargs}")

In [None]:
# Step 2: Train each optimizer and record trajectory

def train_and_record(model, optimizer, X_data, y_data, n_epochs=200):
    """
    Train model and record parameter trajectory.

    Runs ``n_epochs`` full-batch updates with binary cross-entropy
    (``nn.BCELoss``), snapshotting the flattened parameters and the loss at
    those parameters along the way.

    Args:
        model: Network to train (updated in place).
        optimizer: Optimizer already constructed over ``model.parameters()``.
        X_data: Input tensor passed to the model.
        y_data: Target tensor passed to the loss.
        n_epochs: Number of optimizer steps to take.

    Returns:
        trajectory: List of ``n_epochs + 1`` parameter vectors; entry 0 is the
            initial point and entry k the parameters after k steps.
        losses: List of ``n_epochs + 1`` loss values, where ``losses[k]`` is
            the loss evaluated at ``trajectory[k]``.
    """
    criterion = nn.BCELoss()

    trajectory = []
    losses = []

    for _ in range(n_epochs):
        # Snapshot the parameters before the step: the loss computed in this
        # iteration belongs to these parameters, which keeps losses[k] paired
        # with trajectory[k].
        trajectory.append(get_params_as_vector(model).clone())

        optimizer.zero_grad()
        outputs = model(X_data)
        loss = criterion(outputs, y_data)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    # Record the end point together with the loss actually reached there;
    # otherwise the result of the last update would never be measured.
    trajectory.append(get_params_as_vector(model).clone())
    losses.append(compute_loss(model, X_data, y_data, criterion))

    return trajectory, losses

# Train all optimizers from the SAME starting point
trajectories = {}   # optimizer name -> (n_steps + 1, n_params) array of parameter vectors
all_losses = {}     # optimizer name -> list of recorded loss values

for name, opt_class, opt_kwargs in optimizers_to_compare:
    # Re-seeding before building the model gives every optimizer the
    # identical initial weights, so differences come from the optimizer alone.
    torch.manual_seed(42)
    model = SimpleNet(hidden_size=8)
    optimizer = opt_class(model.parameters(), **opt_kwargs)

    traj, losses = train_and_record(model, optimizer, X_data, y_data, n_epochs=200)
    trajectories[name] = torch.stack(traj).numpy()
    all_losses[name] = losses

    print(f"{name}: Loss {losses[0]:.4f} → {losses[-1]:.4f}")

In [None]:
# Step 3: Project all trajectories to 2D using PCA

# One PCA is fitted on the points of every optimizer's trajectory stacked
# together, so all paths share the same pair of axes.
all_points = np.vstack(list(trajectories.values()))
pca = PCA(n_components=2).fit(all_points)

# Express each trajectory in the shared 2D PCA coordinates
trajectories_2d = {name: pca.transform(traj) for name, traj in trajectories.items()}

print(f"Variance explained by 2 PCs: {pca.explained_variance_ratio_.sum()*100:.1f}%")

In [None]:
# Step 4: Create loss landscape along PCA directions

# Slice the landscape with the plane spanned by the two principal directions
# that passes through the final point of Adam (usually the best minimum).
center = trajectories['Adam'][-1]
pc1 = pca.components_[0]
pc2 = pca.components_[1]

# pca.transform() measures coordinates relative to pca.mean_, not relative to
# `center`. Find where `center` lands in those coordinates and shift the plane
# origin accordingly, so that grid point (x, y) is evaluated at the parameters
# that the trajectories plotted at (x, y) are projected onto.
center_2d = pca.transform(center.reshape(1, -1))[0]
plane_origin = center - center_2d[0] * pc1 - center_2d[1] * pc2

# Determine range from all trajectories
all_2d = np.vstack(list(trajectories_2d.values()))
margin = 0.5
x_min, x_max = all_2d[:, 0].min() - margin, all_2d[:, 0].max() + margin
y_min, y_max = all_2d[:, 1].min() - margin, all_2d[:, 1].max() + margin

# Create grid
x_range = np.linspace(x_min, x_max, 50)
y_range = np.linspace(y_min, y_max, 50)

# Compute loss surface
print("Computing loss surface...")
model_temp = SimpleNet(hidden_size=8)  # scratch model; its weights are overwritten per grid point
criterion = nn.BCELoss()
# Rows follow y_range and columns follow x_range, matching np.meshgrid(x_range, y_range).
loss_surface = np.zeros((len(y_range), len(x_range)))

for i, y_val in enumerate(y_range):
    for j, x_val in enumerate(x_range):
        # Reconstruct parameters in original space
        params = plane_origin + x_val * pc1 + y_val * pc2
        set_params_from_vector(model_temp, torch.FloatTensor(params))
        loss_surface[i, j] = compute_loss(model_temp, X_data, y_data, criterion)

print(f"Loss surface computed! Range: [{loss_surface.min():.3f}, {loss_surface.max():.3f}]")

In [None]:
# Step 5: Visualize all trajectories on the landscape

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Colors for each optimizer (keys must match the names in trajectories_2d / all_losses)
colors = {
    'SGD': 'red',
    'SGD + Momentum': 'blue',
    'Adam': 'green'
}

# Left plot: Trajectories on contour
X_grid, Y_grid = np.meshgrid(x_range, y_range)
contour = axes[0].contourf(X_grid, Y_grid, loss_surface, levels=50, cmap='viridis')
axes[0].contour(X_grid, Y_grid, loss_surface, levels=15, colors='white', alpha=0.3, linewidths=0.5)

for name, traj_2d in trajectories_2d.items():
    color = colors[name]

    # Plot trajectory line (subsample for clarity). The last index is always
    # kept so the drawn path ends exactly at the end marker.
    step = max(1, len(traj_2d) // 50)
    idx = list(range(0, len(traj_2d), step))
    if idx[-1] != len(traj_2d) - 1:
        idx.append(len(traj_2d) - 1)
    axes[0].plot(traj_2d[idx, 0], traj_2d[idx, 1], '-',
                 color=color, linewidth=2, alpha=0.8, label=name)

    # Start point (circle)
    axes[0].scatter(traj_2d[0, 0], traj_2d[0, 1], color=color, s=150,
                    marker='o', edgecolors='white', linewidth=2, zorder=5)

    # End point (star)
    axes[0].scatter(traj_2d[-1, 0], traj_2d[-1, 1], color=color, s=200,
                    marker='*', edgecolors='black', linewidth=1, zorder=5)

axes[0].set_xlabel('PC1', fontsize=12)
axes[0].set_ylabel('PC2', fontsize=12)
axes[0].set_title('Optimization Trajectories on Loss Landscape\n(○ = Start, ★ = End)', fontsize=14)
axes[0].legend(loc='upper right')
plt.colorbar(contour, ax=axes[0], label='Loss')

# Right plot: Loss curves
for name, losses in all_losses.items():
    color = colors[name]
    axes[1].semilogy(losses, color=color, linewidth=2, label=name, alpha=0.8)

axes[1].set_xlabel('Epoch', fontsize=12)
axes[1].set_ylabel('Loss (log scale)', fontsize=12)
axes[1].set_title('Convergence Comparison', fontsize=14)
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 🔍 Analysis of the Trajectories

Let's analyze what we observe from each optimizer:

In [None]:
# Compute statistics for each optimizer
print("Optimizer Comparison Summary")
print("=" * 60)
print(f"{'Optimizer':<20} {'Initial Loss':<15} {'Final Loss':<15} {'Improvement':<15}")
print("-" * 60)

for name, losses in all_losses.items():
    initial = losses[0]
    final = losses[-1]
    # Relative reduction of the loss, as a percentage of the initial loss
    improvement = (initial - final) / initial * 100
    print(f"{name:<20} {initial:<15.4f} {final:<15.4f} {improvement:<14.1f}%")

print("\n" + "=" * 60)
print("\n📊 Key Observations:")
print("  1. All optimizers start at the SAME point (same seed)")
print("  2. SGD: Takes a direct but slow path")
print("  3. Momentum: Builds speed, may overshoot but recovers")
print("  4. Adam: Adaptive steps, often reaches good minimum fastest")

In [None]:
# Measure how far each optimizer traveled in the full parameter space
print("\nPath Analysis:")
print("=" * 50)

for name, traj in trajectories.items():
    # Step vectors between consecutive snapshots; the path length is the sum
    # of their Euclidean norms.
    diffs = traj[1:] - traj[:-1]
    path_length = np.linalg.norm(diffs, axis=1).sum()

    # Straight-line distance from the first snapshot to the last
    direct_dist = np.linalg.norm(traj[-1] - traj[0])

    # Ratio of straight-line distance to distance traveled, in percent
    efficiency = direct_dist / path_length * 100

    print(f"\n{name}:")
    print(f"  Path length:    {path_length:.2f}")
    print(f"  Direct distance: {direct_dist:.2f}")
    print(f"  Efficiency:      {efficiency:.1f}% (100% = perfectly direct)")

---

## Bonus: 3D Visualization with Trajectories

In [None]:
# Create a beautiful 3D visualization
# Draws loss_surface over the (PC1, PC2) grid as a translucent surface and
# overlays each optimizer's 2D-projected trajectory, lifted to its recorded loss.
fig = plt.figure(figsize=(14, 10))
ax = fig.add_subplot(111, projection='3d')

# Plot the loss surface
X_grid, Y_grid = np.meshgrid(x_range, y_range)
surf = ax.plot_surface(X_grid, Y_grid, loss_surface, cmap='viridis', 
                       alpha=0.6, linewidth=0)

# Plot each trajectory in 3D
# The names are hard-coded here and must exist as keys in trajectories_2d,
# all_losses and colors.
for name in ['SGD', 'SGD + Momentum', 'Adam']:
    traj_2d = trajectories_2d[name]
    losses = all_losses[name]
    color = colors[name]
    
    # Subsample for clarity
    step = max(1, len(traj_2d) // 50)
    
    # The z-values are the losses recorded during training (all_losses), not
    # values read from loss_surface, so the paths are not guaranteed to lie
    # exactly on the drawn surface.
    ax.plot(traj_2d[::step, 0], traj_2d[::step, 1], losses[::step], 
           '-', color=color, linewidth=2.5, label=name, alpha=0.9)
    
    # Start and end markers
    ax.scatter([traj_2d[0, 0]], [traj_2d[0, 1]], [losses[0]], 
              color=color, s=100, marker='o')
    ax.scatter([traj_2d[-1, 0]], [traj_2d[-1, 1]], [losses[-1]], 
              color=color, s=150, marker='*')

ax.set_xlabel('PC1', fontsize=12)
ax.set_ylabel('PC2', fontsize=12)
ax.set_zlabel('Loss', fontsize=12)
ax.set_title('3D Optimization Trajectories', fontsize=14)
ax.legend(loc='upper right')

# Set viewing angle
ax.view_init(elev=25, azim=45)

plt.tight_layout()
plt.show()

---

## Bonus 2: Comparing Learning Rate Sensitivity

In [None]:
# Sweep plain SGD over several learning rates, always from the same seed
learning_rates = [0.1, 0.5, 1.0, 2.0]

lr_trajectories = {}
lr_losses = {}

for lr in learning_rates:
    label = f'LR={lr}'

    # Identical initialization for every run
    torch.manual_seed(42)
    model = SimpleNet(hidden_size=8)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    traj, losses = train_and_record(model, optimizer, X_data, y_data, n_epochs=100)
    lr_trajectories[label] = torch.stack(traj).numpy()
    lr_losses[label] = losses

# Reuse the PCA fitted on the optimizer comparison so these paths can be
# drawn in the same 2D coordinates.
lr_trajectories_2d = {name: pca.transform(traj) for name, traj in lr_trajectories.items()}

In [None]:
# Visualize learning rate comparison
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# One color per learning rate, sampled along the coolwarm colormap
lr_colors = plt.cm.coolwarm(np.linspace(0.2, 0.8, len(learning_rates)))

# Left: Trajectories (drawn over the loss surface computed earlier)
contour = axes[0].contourf(X_grid, Y_grid, loss_surface, levels=50, cmap='viridis')
axes[0].contour(X_grid, Y_grid, loss_surface, levels=15, colors='white', alpha=0.3, linewidths=0.5)

for (name, traj_2d), color in zip(lr_trajectories_2d.items(), lr_colors):
    # Subsample for clarity, but always keep the last point: with a stride
    # that does not divide the length, plain slicing would stop the line
    # short of the end marker.
    step = max(1, len(traj_2d) // 30)
    idx = list(range(0, len(traj_2d), step))
    if idx[-1] != len(traj_2d) - 1:
        idx.append(len(traj_2d) - 1)
    axes[0].plot(traj_2d[idx, 0], traj_2d[idx, 1], 'o-',
                 color=color, linewidth=2, markersize=3, alpha=0.8, label=name)
    axes[0].scatter(traj_2d[0, 0], traj_2d[0, 1], color=color, s=100, marker='o', zorder=5)
    axes[0].scatter(traj_2d[-1, 0], traj_2d[-1, 1], color=color, s=150, marker='*', zorder=5)

axes[0].set_xlabel('PC1', fontsize=12)
axes[0].set_ylabel('PC2', fontsize=12)
axes[0].set_title('SGD with Different Learning Rates', fontsize=14)
axes[0].legend(loc='upper right')
plt.colorbar(contour, ax=axes[0], label='Loss')

# Right: Loss curves
for (name, losses), color in zip(lr_losses.items(), lr_colors):
    axes[1].semilogy(losses, color=color, linewidth=2, label=name, alpha=0.8)

axes[1].set_xlabel('Epoch', fontsize=12)
axes[1].set_ylabel('Loss (log scale)', fontsize=12)
axes[1].set_title('Learning Rate Impact on Convergence', fontsize=14)
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n📊 Learning Rate Observations:")
print("  - Small LR (0.1): Slow but stable")
print("  - Medium LR (0.5): Good balance")
print("  - Large LR (1.0): Fast but may oscillate")
print("  - Very Large LR (2.0): May diverge or bounce around!")

---

## Key Takeaways

1. **Loss landscapes** give us visual insight into the optimization process

2. **Different optimizers** take different paths:
   - SGD: Steady, predictable, but can be slow
   - Momentum: Faster, but may overshoot
   - Adam: Adaptive, often best for deep learning

3. **Learning rate matters!**
   - Too small: Slow convergence
   - Too large: Oscillation or divergence
   - Just right: Fast and stable

4. **PCA projection** lets us visualize high-dimensional trajectories in 2D

5. **Path efficiency** varies by optimizer - shorter path doesn't always mean faster convergence!

In [None]:
import gc

# Force a garbage-collection pass at the end of the notebook
gc.collect()
print("\n✅ Solution notebook complete!")