# Task 3.1: Manual Backpropagation - SOLUTIONS

This notebook contains complete solutions to all exercises in the manual backpropagation lab.

---

> **📝 Note:** This solution notebook is designed to be self-contained and includes all necessary
> helper functions. However, if you prefer to use your own implementations from the main notebook,
> ensure those cells have been run first.
>
> You can also import production-ready implementations from the scripts:
> ```python
> from scripts.math_utils import sigmoid, relu, Adam, SGD
> from scripts.visualization_utils import plot_loss_landscape, plot_training_curve
> ```

In [None]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

# Pin both random number generators to one shared seed so that every
# Restart & Run All produces the same numbers.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Title line followed by a 50-character rule.
print("Solutions Notebook - Manual Backpropagation", "=" * 50, sep="\n")

## Solution 1: ReLU Backward Pass

**Exercise:** Implement backward pass with ReLU activation.

In [None]:
# SOLUTION: ReLU activation and backward pass

def relu(z):
    """Rectified linear unit: positive values pass through, the rest become 0."""
    rectified = np.maximum(0, z)
    return rectified

def relu_derivative(z):
    """ReLU derivative: 1 where z > 0, else 0.

    The derivative at exactly z == 0 is taken to be 0.

    Args:
        z: pre-activation value(s); a Python scalar, NumPy scalar, or array.

    Returns:
        Float 0/1 mask with the same shape as ``z`` (a NumPy float scalar
        when ``z`` is a scalar).
    """
    # Comparing a plain Python float with 0 yields a builtin bool, which has
    # no .astype(); going through np.asarray keeps the comparison in NumPy so
    # scalar inputs work as well as arrays.
    return (np.asarray(z) > 0).astype(float)

# Single neuron with ReLU: one input, one target, one weight, one bias.
x, y = 2.0, 1.0
w, b = 0.5, 0.1

# ---- Forward pass ----
z = w * x + b
print(f"z = w*x + b = {w}*{x} + {b} = {z}")

y_hat = relu(z)
print(f"ŷ = ReLU(z) = ReLU({z}) = {y_hat}")

# The residual (ŷ - y) is shared by the loss and by its derivative below.
residual = y_hat - y
loss = residual ** 2
print(f"Loss = (ŷ - y)² = ({y_hat} - {y})² = {loss}")

# ---- Backward pass ----
print("\nBackward pass:")

# Loss w.r.t. the prediction: ∂L/∂ŷ = 2(ŷ - y)
dL_dy_hat = 2 * residual
print(f"∂L/∂ŷ = 2(ŷ - y) = 2({y_hat} - {y}) = {dL_dy_hat}")

# Prediction w.r.t. the pre-activation: ∂ŷ/∂z = ReLU'(z)
dy_hat_dz = relu_derivative(z)
print(f"∂ŷ/∂z = ReLU'(z) = ReLU'({z}) = {dy_hat_dz}")

# Pre-activation w.r.t. the weight: ∂z/∂w = x
dz_dw = x
print(f"∂z/∂w = x = {dz_dw}")

# Pre-activation w.r.t. the bias: ∂z/∂b = 1
dz_db = 1
print("∂z/∂b = 1")

# Chain rule: both parameter gradients share the factor ∂L/∂ŷ × ∂ŷ/∂z.
upstream = dL_dy_hat * dy_hat_dz
dL_dw = upstream * dz_dw
dL_db = upstream * dz_db

print(f"\n∂L/∂w = {dL_dy_hat} × {dy_hat_dz} × {dz_dw} = {dL_dw}")
print(f"∂L/∂b = {dL_dy_hat} × {dy_hat_dz} × {dz_db} = {dL_db}")

In [None]:
# Verify with PyTorch

# Rebuild the single-neuron computation with autograd tracking w and b.
x_t = torch.tensor(x)
y_t = torch.tensor(y)
w_t = torch.tensor(w, requires_grad=True)
b_t = torch.tensor(b, requires_grad=True)

z_t = w_t * x_t + b_t
y_hat_t = torch.relu(z_t)
loss_t = (y_hat_t - y_t) ** 2
loss_t.backward()

print("Verification with PyTorch:")
print(f"Manual ∂L/∂w:   {dL_dw}")
print(f"PyTorch ∂L/∂w:  {w_t.grad.item()}")
print(f"Manual ∂L/∂b:   {dL_db}")
print(f"PyTorch ∂L/∂b:  {b_t.grad.item()}")

# Actually compare the numbers before claiming success. The tensors above use
# torch's default dtype (float32 unless changed) while the manual values are
# float64, so a relative tolerance of 1e-5 is used.
np.testing.assert_allclose(dL_dw, w_t.grad.item(), rtol=1e-5)
np.testing.assert_allclose(dL_db, b_t.grad.item(), rtol=1e-5)
print("\n✅ Gradients match!")

## Solution 2: 4-Layer MLP

**Exercise:** Extend the MLP to 4 layers.

In [None]:
# SOLUTION: 4-layer MLP with manual backprop

class ManualMLP4Layer:
    """
    4-layer MLP: Input(2) → Hidden1(4) → Hidden2(3) → Hidden3(2) → Output(1)

    Hidden layers use ReLU, the output layer uses a sigmoid, and the loss is
    mean squared error. forward() stores every pre-activation and activation
    in ``self.cache`` so that backward() can apply the chain rule layer by
    layer.
    """

    def __init__(self):
        # He initialization: scale = sqrt(2 / fan_in), the usual choice for
        # ReLU layers. Weights are drawn in the order W1..W4 so results under
        # a given np.random.seed stay reproducible.
        self.W1 = np.random.randn(2, 4) * np.sqrt(2.0 / 2)
        self.b1 = np.zeros((1, 4))

        self.W2 = np.random.randn(4, 3) * np.sqrt(2.0 / 4)
        self.b2 = np.zeros((1, 3))

        self.W3 = np.random.randn(3, 2) * np.sqrt(2.0 / 3)
        self.b3 = np.zeros((1, 2))

        self.W4 = np.random.randn(2, 1) * np.sqrt(2.0 / 2)
        self.b4 = np.zeros((1, 1))

        # Filled by forward(); read by backward().
        self.cache = {}

    def relu(self, z):
        """ReLU activation: max(0, z), element-wise."""
        return np.maximum(0, z)

    def relu_derivative(self, z):
        """ReLU derivative: 1 where z > 0, else 0 (0 is used at z == 0)."""
        # np.asarray lets plain Python scalars through as well as arrays.
        return (np.asarray(z) > 0).astype(float)

    def sigmoid(self, z):
        """Logistic sigmoid; z is clipped to [-500, 500] to keep exp() finite."""
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def sigmoid_derivative(self, z):
        """Sigmoid derivative σ(z)(1 - σ(z)), evaluated at pre-activation z."""
        s = self.sigmoid(z)
        return s * (1 - s)

    def forward(self, X):
        """Run the network on a batch and cache all intermediates.

        Args:
            X: array of shape (batch, 2).

        Returns:
            Predictions of shape (batch, 1), each in (0, 1).
        """
        self.cache['X'] = X

        # Layer 1
        self.cache['z1'] = X @ self.W1 + self.b1
        self.cache['h1'] = self.relu(self.cache['z1'])

        # Layer 2
        self.cache['z2'] = self.cache['h1'] @ self.W2 + self.b2
        self.cache['h2'] = self.relu(self.cache['z2'])

        # Layer 3
        self.cache['z3'] = self.cache['h2'] @ self.W3 + self.b3
        self.cache['h3'] = self.relu(self.cache['z3'])

        # Layer 4 (output)
        self.cache['z4'] = self.cache['h3'] @ self.W4 + self.b4
        self.cache['y_hat'] = self.sigmoid(self.cache['z4'])

        return self.cache['y_hat']

    def compute_loss(self, y_hat, y):
        """Mean squared error between predictions and targets.

        ``y`` is reshaped to ``y_hat``'s shape first: a 1-D target of length N
        subtracted from an (N, 1) prediction would otherwise broadcast to an
        (N, N) matrix and silently give the wrong loss.
        """
        y_hat = np.asarray(y_hat, dtype=float)
        y = np.asarray(y, dtype=float).reshape(y_hat.shape)
        return np.mean((y_hat - y) ** 2)

    def backward(self, y):
        """Backpropagate the MSE loss through the most recent forward() pass.

        Args:
            y: targets for the batch last given to forward(); any shape that
               can be reshaped to the cached prediction shape (batch, 1).

        Returns:
            Dict with keys 'dW1'..'dW4' and 'db1'..'db4'; each gradient has
            the same shape as the corresponding parameter.

        Raises:
            RuntimeError: if forward() has not been called yet.
            ValueError: if ``y`` cannot be reshaped to the prediction shape.
        """
        if 'y_hat' not in self.cache:
            raise RuntimeError(
                "backward() called before forward(): no cached activations"
            )

        y_hat = self.cache['y_hat']
        # Same broadcasting guard as in compute_loss.
        y = np.asarray(y, dtype=float).reshape(y_hat.shape)
        batch_size = y_hat.shape[0]

        # Layer 4 (output): the 1/batch_size factor comes from the mean in MSE.
        dL_dy_hat = 2 * (y_hat - y) / batch_size
        dy_hat_dz4 = self.sigmoid_derivative(self.cache['z4'])
        dL_dz4 = dL_dy_hat * dy_hat_dz4
        dL_dW4 = self.cache['h3'].T @ dL_dz4
        dL_db4 = np.sum(dL_dz4, axis=0, keepdims=True)

        # Layer 3
        dL_dh3 = dL_dz4 @ self.W4.T
        dh3_dz3 = self.relu_derivative(self.cache['z3'])
        dL_dz3 = dL_dh3 * dh3_dz3
        dL_dW3 = self.cache['h2'].T @ dL_dz3
        dL_db3 = np.sum(dL_dz3, axis=0, keepdims=True)

        # Layer 2
        dL_dh2 = dL_dz3 @ self.W3.T
        dh2_dz2 = self.relu_derivative(self.cache['z2'])
        dL_dz2 = dL_dh2 * dh2_dz2
        dL_dW2 = self.cache['h1'].T @ dL_dz2
        dL_db2 = np.sum(dL_dz2, axis=0, keepdims=True)

        # Layer 1
        dL_dh1 = dL_dz2 @ self.W2.T
        dh1_dz1 = self.relu_derivative(self.cache['z1'])
        dL_dz1 = dL_dh1 * dh1_dz1
        dL_dW1 = self.cache['X'].T @ dL_dz1
        dL_db1 = np.sum(dL_dz1, axis=0, keepdims=True)

        return {
            'dW1': dL_dW1, 'db1': dL_db1,
            'dW2': dL_dW2, 'db2': dL_db2,
            'dW3': dL_dW3, 'db3': dL_db3,
            'dW4': dL_dW4, 'db4': dL_db4
        }

# Test on XOR
# Enumerate the four binary input pairs; the target is their exclusive-or.
xor_cases = [(p, q) for p in (0, 1) for q in (0, 1)]
X = np.array([[p, q] for p, q in xor_cases], dtype=np.float64)
y = np.array([[p ^ q] for p, q in xor_cases], dtype=np.float64)

# Re-seed immediately before construction so the initial weights are reproducible.
np.random.seed(42)
mlp4 = ManualMLP4Layer()

# One forward pass with the untrained weights.
y_hat = mlp4.forward(X)
initial_loss = mlp4.compute_loss(y_hat, y)
print(f"Initial predictions: {y_hat.flatten().round(4)}")
print(f"Initial loss: {initial_loss:.6f}")

In [None]:
# Verify 4-layer MLP gradients with PyTorch

class PyTorchMLP4(nn.Module):
    """PyTorch twin of the manual 2→4→3→2→1 MLP, used to check gradients with autograd.

    Args:
        W1..W4: weight matrices laid out as (in_features, out_features), i.e.
            the layout used with ``X @ W``. They are transposed on copy
            because ``nn.Linear`` stores weights as (out_features, in_features).
        b1..b4: biases with ``out_features`` elements in any shape
            (e.g. (1, out_features)); they are flattened on copy.

    All parameters are float64, and the values are copied, so later changes
    to the source arrays do not affect the model.
    """

    def __init__(self, W1, b1, W2, b2, W3, b3, W4, b4):
        super().__init__()
        self.fc1 = nn.Linear(2, 4)
        self.fc2 = nn.Linear(4, 3)
        self.fc3 = nn.Linear(3, 2)
        self.fc4 = nn.Linear(2, 1)

        # Convert the parameters to float64 first so the copies below keep
        # the full precision of the NumPy weights.
        self.double()

        # Copy values in place under no_grad rather than rebinding `.data`:
        # the parameters keep their identity and dtype, and copy_ raises if a
        # shape does not fit the layer.
        with torch.no_grad():
            for layer, W, b in (
                (self.fc1, W1, b1),
                (self.fc2, W2, b2),
                (self.fc3, W3, b3),
                (self.fc4, W4, b4),
            ):
                layer.weight.copy_(torch.as_tensor(W, dtype=torch.float64).T)
                layer.bias.copy_(torch.as_tensor(b, dtype=torch.float64).reshape(-1))

    def forward(self, x):
        """ReLU on the three hidden layers, sigmoid on the output layer."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.sigmoid(self.fc4(x))
        return x

# Create PyTorch model from the manual network's current parameters
torch_mlp4 = PyTorchMLP4(
    mlp4.W1, mlp4.b1, mlp4.W2, mlp4.b2,
    mlp4.W3, mlp4.b3, mlp4.W4, mlp4.b4
).double()

X_t = torch.tensor(X, dtype=torch.float64)
y_t = torch.tensor(y, dtype=torch.float64)

# Autograd gradients of the same mean-squared-error loss
y_hat_t = torch_mlp4(X_t)
loss_t = torch.mean((y_hat_t - y_t) ** 2)
loss_t.backward()

# Get manual gradients. Run forward() here so the cache used by backward()
# belongs to this X, whatever was executed before this cell.
mlp4.forward(X)
grads = mlp4.backward(y)

# Compare
print("Gradient Verification for 4-Layer MLP")
print("=" * 50)

torch_layers = (torch_mlp4.fc1, torch_mlp4.fc2, torch_mlp4.fc3, torch_mlp4.fc4)

# nn.Linear stores weights as (out, in), hence the transpose; its biases are
# 1-D, hence the reshape to the manual (1, out) layout.
weight_checks = [
    (f'W{i}', grads[f'dW{i}'], layer.weight.grad.numpy().T)
    for i, layer in enumerate(torch_layers, start=1)
]
bias_checks = [
    (f'b{i}', grads[f'db{i}'], layer.bias.grad.numpy().reshape(1, -1))
    for i, layer in enumerate(torch_layers, start=1)
]
comparisons = weight_checks + bias_checks

all_match = True
for name, manual, pytorch in comparisons:
    max_diff = np.abs(manual - pytorch).max()
    match = max_diff < 1e-6
    all_match = all_match and match
    status = "✅" if match else "❌"
    print(f"{status} {name}: max diff = {max_diff:.2e}")

if all_match:
    print("\n🎉 All 4-layer gradients match!")
else:
    # Do not let a failed verification pass silently.
    raise AssertionError(
        "Manual and autograd gradients disagree - see the ❌ rows above"
    )

## Solution 3: Binary Cross-Entropy Loss

**Challenge:** Implement BCE loss and show it equals Bernoulli NLL.

In [None]:
# SOLUTION: Binary Cross-Entropy implementation

def binary_cross_entropy(y_true, y_pred, eps=1e-10):
    """
    Element-wise binary cross-entropy between labels and probabilities.

    BCE = -[y*log(p) + (1-y)*log(1-p)]

    Predictions are clipped to [eps, 1 - eps] so neither logarithm sees 0.
    """
    p = np.clip(y_pred, eps, 1 - eps)
    positive_term = y_true * np.log(p)
    negative_term = (1 - y_true) * np.log(1 - p)
    return -(positive_term + negative_term)

def bce_derivative(y_true, y_pred, eps=1e-10):
    """
    Gradient of the binary cross-entropy with respect to the prediction.

    ∂BCE/∂ŷ = -y/ŷ + (1-y)/(1-ŷ)

    Predictions are clipped to [eps, 1 - eps] so neither denominator is 0.
    """
    p = np.clip(y_pred, eps, 1 - eps)
    from_positive = -y_true / p
    from_negative = (1 - y_true) / (1 - p)
    return from_positive + from_negative

# When combined with sigmoid, the gradient simplifies!
def bce_sigmoid_gradient(y_true, y_pred):
    """
    Gradient of BCE with respect to the logit z when ŷ = sigmoid(z).

    ∂BCE/∂z = ŷ - y  (the sigmoid derivative cancels the BCE denominators)
    """
    residual = y_pred - y_true
    return residual

# Demonstrate
y_true = np.array([1, 0, 1, 1, 0])
z = np.array([2.0, -1.5, 1.0, 0.5, -2.0])  # Logits

# Forward
y_pred = 1 / (1 + np.exp(-z))  # Sigmoid
bce_losses = binary_cross_entropy(y_true, y_pred)

print("Binary Cross-Entropy Example")
print("=" * 50)
print(f"True labels: {y_true}")
print(f"Logits:      {z}")
print(f"Predictions: {y_pred.round(4)}")
print(f"BCE losses:  {bce_losses.round(4)}")
print(f"Mean BCE:    {bce_losses.mean():.4f}")

# Show gradient simplification
print("\nGradient with BCE + Sigmoid:")
grad_full = bce_derivative(y_true, y_pred) * (y_pred * (1 - y_pred))  # Chain rule
grad_simple = bce_sigmoid_gradient(y_true, y_pred)
print(f"Full chain rule: {grad_full.round(4)}")
print(f"Simplified (ŷ-y): {grad_simple.round(4)}")

# Check the claim numerically before announcing it; the two forms differ only
# by floating-point rounding.
np.testing.assert_allclose(grad_full, grad_simple, rtol=1e-9, atol=1e-12)
print("\nThey match! This is why sigmoid + BCE is elegant.")

In [None]:
# DERIVATION: Why ∂BCE/∂z = ŷ - y for sigmoid

# The derivation as a list of output lines; an empty string is a blank line.
derivation_lines = [
    "Mathematical Derivation",
    "=" * 50,
    "",
    "BCE = -[y·log(ŷ) + (1-y)·log(1-ŷ)]",
    "",
    "Where ŷ = σ(z) = 1/(1+e^(-z))",
    "",
    "Step 1: ∂BCE/∂ŷ = -y/ŷ + (1-y)/(1-ŷ)",
    "",
    "Step 2: ∂ŷ/∂z = ŷ(1-ŷ)  (sigmoid derivative)",
    "",
    "Step 3: Chain rule",
    "  ∂BCE/∂z = ∂BCE/∂ŷ × ∂ŷ/∂z",
    "         = [-y/ŷ + (1-y)/(1-ŷ)] × ŷ(1-ŷ)",
    "         = -y(1-ŷ) + (1-y)ŷ",
    "         = -y + yŷ + ŷ - yŷ",
    "         = ŷ - y",
    "",
    "✨ The gradient is simply: prediction - target!",
]

for line in derivation_lines:
    print(line)

---

## Key Takeaways

1. **ReLU derivative** is 1 for positive inputs and 0 otherwise (this notebook uses 0 at exactly z = 0)
2. **Adding layers** follows the same pattern: propagate the gradient back through each layer in turn
3. **BCE + sigmoid** gives elegant gradient: ŷ - y
4. **Always verify** your gradients with numerical approximation or autograd