# Task 4.2 Solution: Activation Function Study

This notebook contains solutions to the exercises from Task 4.2.

---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(42)
%matplotlib inline

## Exercise 1 Solution: Implement PReLU

In [None]:
class PReLU:
    """
    Parametric ReLU activation with a trainable negative-side slope.

        PReLU(x) = x          if x > 0
                 = alpha * x  otherwise

    Unlike LeakyReLU, ``alpha`` is not fixed: every call to ``backward``
    performs one gradient-descent step on it.

    Attributes:
        alpha: Current slope applied to non-positive inputs.
        lr: Step size used when updating ``alpha``.
        cache: Holds the most recent forward input under the key ``'x'``.
        name: Human-readable label built from the *initial* alpha.
    """

    def __init__(self, alpha_init: float = 0.01, lr: float = 0.01):
        self.name = f'PReLU(alpha={alpha_init})'
        self.cache = {}
        self.lr = lr            # step size for the alpha update
        self.alpha = alpha_init  # trainable slope

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Apply the activation and remember ``x`` for the backward pass."""
        self.cache['x'] = x
        is_positive = x > 0
        return np.where(is_positive, x, self.alpha * x)

    def backward(self, grad_output: np.ndarray) -> np.ndarray:
        """
        Back-propagate through the activation and take one step on alpha.

        Args:
            grad_output: Gradient of the loss w.r.t. this layer's output.

        Returns:
            Gradient of the loss w.r.t. the input of the last ``forward`` call.

        Side effects:
            ``self.alpha`` is updated in place by ``lr * dalpha``.
        """
        inputs = self.cache['x']

        # d(out)/d(x) is 1 on the positive side and alpha elsewhere.
        local_slope = np.where(inputs > 0, 1.0, self.alpha)
        grad_input = local_slope * grad_output

        # d(out)/d(alpha) is x on the negative side and 0 elsewhere.
        negative_mask = inputs < 0
        dalpha = np.sum(grad_output * inputs * negative_mask)

        # Plain gradient-descent step on the slope.
        self.alpha -= self.lr * dalpha

        return grad_input

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Shorthand for ``forward``."""
        return self.forward(x)

# Test PReLU
# Smoke test for the PReLU class: checks the forward values and verifies that
# repeated backward passes move alpha by the analytically expected amount.
print("Testing PReLU:")
print("=" * 50)

prelu = PReLU(alpha_init=0.01, lr=0.001)

# Test forward
x = np.array([[-2, -1, 0, 1, 2]], dtype=float)
out = prelu(x)
print(f"Input:  {x[0]}")
print(f"Output: {out[0]}")
print(f"Initial alpha: {prelu.alpha}")
# Negative inputs are scaled by alpha; everything else passes through.
np.testing.assert_allclose(out, [[-0.02, -0.01, 0.0, 1.0, 2.0]])

# Test backward (simulate learning)
grad = np.ones_like(x)
for i in range(100):
    prelu.forward(x)
    prelu.backward(grad)

print(f"\nAlpha after 100 updates: {prelu.alpha:.4f}")
print("(Alpha increased: the upstream gradient is +1 everywhere, so gradient descent "
      "pushes the outputs down, and for x < 0 that means a larger alpha)")
# dalpha = sum(grad * x over x < 0) = -3 on every step, independent of alpha,
# so alpha = 0.01 + 100 * 0.001 * 3 = 0.31.
np.testing.assert_allclose(prelu.alpha, 0.31)

In [None]:
# Visualize PReLU with different alphas
# Left panel: PReLU for several fixed slopes. Right panel: ReLU, LeakyReLU and
# PReLU drawn on the same axes for comparison.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
left_ax, right_ax = axes

x = np.linspace(-3, 3, 200)
alphas = [0.0, 0.1, 0.25, 0.5]

for alpha in alphas:
    prelu = PReLU(alpha_init=alpha)
    y = prelu(x.copy())
    left_ax.plot(x, y, linewidth=2, label=f'alpha={alpha}')

left_ax.set(xlabel='x', ylabel='PReLU(x)', title='PReLU with Different Alpha Values')

# Compare with ReLU and LeakyReLU
relu = np.maximum(0, x)
leaky_relu = np.where(x > 0, x, 0.1 * x)
prelu_out = PReLU(0.25)(x.copy())

comparison_curves = (
    (relu, 'ReLU'),
    (leaky_relu, 'LeakyReLU (0.1)'),
    (prelu_out, 'PReLU (learned)'),
)
for curve, label in comparison_curves:
    right_ax.plot(x, curve, linewidth=2, label=label)

right_ax.set(xlabel='x', ylabel='Activation', title='ReLU vs LeakyReLU vs PReLU')

# Decoration shared by both panels: zero lines, legend, light grid.
for ax in axes:
    ax.axhline(0, color='k', linewidth=0.5)
    ax.axvline(0, color='k', linewidth=0.5)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Exercise 2 Solution: Deep Sigmoid vs ReLU Network

In [None]:
class DeepMLP:
    """MLP with configurable activation for gradient analysis.

    Hidden layers use either ReLU or sigmoid; the output layer is a softmax.
    Every call to ``backward`` performs one SGD step on all weights and biases
    and records the per-layer weight-gradient norms so that gradient flow can
    be compared across depths.

    Attributes:
        layers: One dict per affine layer with keys ``'W'``, ``'b'`` and
            ``'cache'`` (the last forward input ``'X'`` and, for hidden
            layers, the pre-activation ``'Z'``).
        activation: ``'relu'`` selects ReLU; any other value selects sigmoid.
        gradient_norms: One entry per ``backward`` call; each entry lists
            ``||dW||`` per layer, ordered from the input side to the output.
        probs: Softmax output of the most recent ``forward`` call.
    """

    def __init__(self, layer_sizes, activation='relu'):
        """Create He-initialised weights and zero biases for each layer.

        Args:
            layer_sizes: Sequence of unit counts, input size first and number
                of classes last.
            activation: ``'relu'`` or ``'sigmoid'`` (anything that is not
                ``'relu'`` is treated as sigmoid).
        """
        self.layers = []
        self.activation = activation
        self.gradient_norms = []

        for i in range(len(layer_sizes) - 1):
            W = np.random.randn(layer_sizes[i], layer_sizes[i + 1]) * np.sqrt(2.0 / layer_sizes[i])
            b = np.zeros(layer_sizes[i + 1])
            self.layers.append({'W': W, 'b': b, 'cache': {}})

    @staticmethod
    def _sigmoid(x):
        """Logistic function; the clip keeps ``np.exp`` from overflowing."""
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def _activate(self, x):
        """Apply the configured hidden-layer activation."""
        if self.activation == 'relu':
            return np.maximum(0, x)
        else:  # sigmoid
            return self._sigmoid(x)

    def _activate_grad(self, x, grad):
        """Multiply ``grad`` by the activation derivative at pre-activation ``x``."""
        if self.activation == 'relu':
            return grad * (x > 0)
        else:  # sigmoid
            s = self._sigmoid(x)
            return grad * s * (1 - s)

    def forward(self, X):
        """Run a forward pass and return softmax class probabilities.

        Caches each layer's input (and hidden pre-activations) for ``backward``
        and stores the result in ``self.probs``.
        """
        out = X
        for layer in self.layers[:-1]:
            layer['cache']['X'] = out
            out = out @ layer['W'] + layer['b']
            layer['cache']['Z'] = out
            out = self._activate(out)

        self.layers[-1]['cache']['X'] = out
        out = out @ self.layers[-1]['W'] + self.layers[-1]['b']
        # Subtract the row maximum before exponentiating for numerical stability.
        out_shifted = out - np.max(out, axis=1, keepdims=True)
        exp_out = np.exp(out_shifted)
        self.probs = exp_out / np.sum(exp_out, axis=1, keepdims=True)
        return self.probs

    def backward(self, targets, lr):
        """Back-propagate softmax cross-entropy and take one SGD step.

        Must be called after ``forward`` on the same batch.

        Args:
            targets: Integer class labels, one per row of the last batch.
            lr: Learning rate applied to both weights and biases.

        Side effects:
            Updates ``W`` and ``b`` of every layer in place and appends the
            per-layer ``||dW||`` values (input side first) to
            ``self.gradient_norms``.
        """
        batch_size = len(targets)
        # Gradient of softmax + cross-entropy w.r.t. the logits: probs - one_hot.
        grad = self.probs.copy()
        grad[np.arange(batch_size), targets] -= 1

        layer_grads = []

        for i in range(len(self.layers) - 1, -1, -1):
            layer = self.layers[i]
            X = layer['cache']['X']

            dW = X.T @ grad / batch_size
            db = grad.sum(axis=0) / batch_size
            layer_grads.append(np.linalg.norm(dW))

            # Propagate with the pre-update weights; the first layer has no
            # predecessor, so its input gradient is never needed.
            if i > 0:
                grad = grad @ layer['W'].T
                Z = self.layers[i - 1]['cache']['Z']
                grad = self._activate_grad(Z, grad)

            layer['W'] -= lr * dW
            # Biases take part in the forward pass, so they must be trained too.
            layer['b'] -= lr * db

        self.gradient_norms.append(layer_grads[::-1])

    def predict(self, X):
        """Return the most probable class index for each row of ``X``."""
        return np.argmax(self.forward(X), axis=1)

# Load data
import gzip
import os
import urllib.request

def load_mnist(path='../data'):
    """Load MNIST from ``path``, downloading any missing archive first.

    Args:
        path: Directory that holds (or will receive) the four gzipped IDX
            files. It is created if it does not exist.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``. Images are float32
        arrays of shape ``(n, 784)`` scaled to ``[0, 1]``; labels are 1-D
        uint8 arrays.

    Raises:
        OSError: If a missing file cannot be downloaded from any mirror
            (the error from the last mirror tried is re-raised).
    """
    os.makedirs(path, exist_ok=True)
    # The original host is frequently unavailable, so fall back to a mirror.
    base_urls = ('http://yann.lecun.com/exdb/mnist/',
                 'https://ossci-datasets.s3.amazonaws.com/mnist/')
    files = ['train-images-idx3-ubyte.gz', 'train-labels-idx1-ubyte.gz',
             't10k-images-idx3-ubyte.gz', 't10k-labels-idx1-ubyte.gz']

    def download(name, dest):
        # Write to a temporary name and rename on success, so an interrupted
        # download never leaves a truncated file that looks complete.
        tmp = dest + '.part'
        last_error = None
        for base in base_urls:
            try:
                urllib.request.urlretrieve(base + name, tmp)
                with open(tmp, 'rb') as fh:
                    # Reject HTML error pages served with a 200 status.
                    if fh.read(2) != b'\x1f\x8b':
                        raise OSError(f'{base + name} did not return a gzip archive')
                os.replace(tmp, dest)
                return
            except OSError as exc:  # URLError / HTTPError are OSError subclasses
                last_error = exc
                if os.path.exists(tmp):
                    os.remove(tmp)
        raise last_error

    def load_img(fp):
        with gzip.open(fp) as f:
            f.read(16)  # skip the IDX3 header (magic, count, rows, cols)
            pixels = np.frombuffer(f.read(), np.uint8)
        return pixels.reshape(-1, 784).astype(np.float32) / 255

    def load_lbl(fp):
        with gzip.open(fp) as f:
            f.read(8)  # skip the IDX1 header (magic, count)
            return np.frombuffer(f.read(), np.uint8)

    paths = [os.path.join(path, name) for name in files]
    for name, fp in zip(files, paths):
        if not os.path.exists(fp):
            download(name, fp)

    return (load_img(paths[0]), load_lbl(paths[1]),
            load_img(paths[2]), load_lbl(paths[3]))

# Load the train/test splits once for the cells below; any archive missing from
# ../data is downloaded on the first run. Images come back flattened to 784
# float32 values scaled by 1/255, labels as uint8 class indices.
X_train, y_train, X_test, y_test = load_mnist()

In [None]:
# Compare 10-layer networks with Sigmoid vs ReLU
# Trains two identically initialised deep MLPs that differ only in their hidden
# activation, then compares test accuracy and per-layer gradient magnitudes.
print("10-Layer Network Comparison: Sigmoid vs ReLU")
print("=" * 60)

# Deep architecture (10 layers of units including input and output,
# i.e. 9 weight matrices)
arch = [784, 256, 256, 256, 256, 256, 256, 256, 128, 10]

# Experiment settings: kept small so the cell runs quickly.
SEED = 42
N_TRAIN = 1000
N_EVAL = 1000
N_EPOCHS = 3
BATCH_SIZE = 64
LEARNING_RATE = 0.1
N_RECENT_STEPS = 10  # gradient norms are averaged over the last few updates

X_subset = X_train[:N_TRAIN]
y_subset = y_train[:N_TRAIN]


def train_and_evaluate(activation):
    """Train a DeepMLP with ``activation`` and return ``(model, test_accuracy)``.

    The RNG is re-seeded first so that both networks start from the same
    weights and the comparison isolates the activation function.
    """
    np.random.seed(SEED)
    model = DeepMLP(arch, activation=activation)

    for _ in range(N_EPOCHS):
        for start in range(0, len(X_subset), BATCH_SIZE):
            stop = start + BATCH_SIZE
            model.forward(X_subset[start:stop])
            model.backward(y_subset[start:stop], LEARNING_RATE)

    accuracy = np.mean(model.predict(X_test[:N_EVAL]) == y_test[:N_EVAL])
    return model, accuracy


# Train Sigmoid network
model_sigmoid, acc_sigmoid = train_and_evaluate('sigmoid')

# Train ReLU network
model_relu, acc_relu = train_and_evaluate('relu')

print(f"Sigmoid Network Accuracy: {acc_sigmoid:.2%}")
print(f"ReLU Network Accuracy: {acc_relu:.2%}")

# Compare gradient norms
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Get final gradient norms per layer
sigmoid_grads = np.mean(model_sigmoid.gradient_norms[-N_RECENT_STEPS:], axis=0)
relu_grads = np.mean(model_relu.gradient_norms[-N_RECENT_STEPS:], axis=0)

x = np.arange(len(sigmoid_grads))
width = 0.35

axes[0].bar(x - width/2, sigmoid_grads, width, label='Sigmoid', color='red', alpha=0.7)
axes[0].bar(x + width/2, relu_grads, width, label='ReLU', color='green', alpha=0.7)
axes[0].set_xlabel('Layer (0 = closest to input)')
axes[0].set_ylabel('Gradient Norm')
axes[0].set_title('Gradient Magnitude by Layer')
axes[0].legend()
axes[0].set_yscale('log')  # norms span several orders of magnitude
axes[0].grid(True, alpha=0.3)

# Show ratio
# The small epsilon guards against division by zero for fully vanished gradients.
ratio = relu_grads / (sigmoid_grads + 1e-10)
axes[1].bar(x, ratio, color='blue', alpha=0.7)
axes[1].axhline(1, color='red', linestyle='--', label='Equal')
axes[1].set_xlabel('Layer')
axes[1].set_ylabel('ReLU / Sigmoid Gradient Ratio')
axes[1].set_title('How Much Stronger ReLU Gradients Are')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n💡 Key Insight:")
print(f"   ReLU gradients in early layers are {ratio[0]:.0f}x stronger than Sigmoid!")
print("   This explains why deep Sigmoid networks are hard to train.")

---

## Key Takeaways

1. **PReLU** learns the negative slope, potentially better than fixed LeakyReLU
2. **Sigmoid gradients vanish** exponentially in deep networks, because each sigmoid layer scales the backpropagated gradient by $\sigma'(z) \le 0.25$
3. **ReLU maintains gradient flow** allowing training of very deep networks
4. Modern activations (GELU, SiLU) combine smoothness with good gradient flow