# Lab 1.7.2 Solutions: Layer Implementation

This notebook contains solutions to the exercises from Lab 1.7.2.

---

## Exercise 1 Solution: Custom Activation Function (LeakyReLU)

Implement LeakyReLU: f(x) = x if x > 0, else alpha * x

In [None]:
import numpy as np
import sys
from pathlib import Path

def _find_module_root():
    """Return, as a string, the directory that holds the ``micrograd_plus`` package.

    The search starts at the current working directory and moves up through
    each ancestor; the first directory containing
    ``micrograd_plus/__init__.py`` wins. If no such directory exists, the
    parent of the current working directory is returned instead.
    """
    start = Path.cwd()
    marker = Path('micrograd_plus') / '__init__.py'
    match = next(
        (folder for folder in (start, *start.parents) if (folder / marker).exists()),
        None,
    )
    if match is None:
        # Nothing found on the way up: fall back to the parent of the cwd.
        return str(start.parent)
    return str(match)

sys.path.insert(0, _find_module_root())

from micrograd_plus import Tensor
from micrograd_plus.layers import Module

In [None]:
class LeakyReLU(Module):
    """
    LeakyReLU activation function.

    f(x) = x                    if x > 0
    f(x) = negative_slope * x   if x <= 0

    Unlike plain ReLU, the gradient for non-positive inputs is
    ``negative_slope`` rather than zero, which mitigates "dying ReLU".
    """

    def __init__(self, negative_slope=0.01):
        """Store the slope applied to non-positive inputs."""
        super().__init__()
        self.negative_slope = negative_slope

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply LeakyReLU element-wise.

        Args:
            x: Input tensor.

        Returns:
            A new Tensor holding the activation. When ``x.requires_grad`` is
            true, the result is linked to ``x`` through ``_prev`` and
            ``_backward`` so gradients accumulate into ``x.grad``.
        """
        # Capture the slope once so forward and backward always agree, even
        # if the attribute is changed between the two passes.
        slope = self.negative_slope
        data = x.data
        is_positive = data > 0

        # Select per element instead of multiplying by 0/1 masks: mask
        # arithmetic turns +/-inf inputs into NaN because inf * 0 == nan.
        out_data = np.where(is_positive, data, slope * data)
        out = Tensor(out_data, requires_grad=x.requires_grad)

        if x.requires_grad:
            out._prev = {x}
            out._op = 'leaky_relu'

            def _backward():
                # Local derivative: 1 where x > 0, slope where x <= 0.
                # float32 scalars keep the mask dtype the same as before.
                local_grad = np.where(is_positive, np.float32(1.0), np.float32(slope))
                contribution = out.grad * local_grad
                # Accumulate rather than overwrite: x may feed several ops.
                if x.grad is None:
                    x.grad = contribution
                else:
                    x.grad = x.grad + contribution

            out._backward = _backward

        return out

    def __repr__(self):
        return f"LeakyReLU(negative_slope={self.negative_slope})"

In [None]:
# Test LeakyReLU
# Smoke test: prints actual vs. expected values for manual comparison
# (nothing is asserted).
print("Testing LeakyReLU")
print("=" * 50)

# A slope of 0.1 (instead of the 0.01 default) keeps the expected values readable.
leaky_relu = LeakyReLU(negative_slope=0.1)

# Test forward pass
# The input covers both branches plus the boundary x == 0.
x = Tensor([-2.0, -1.0, 0.0, 1.0, 2.0], requires_grad=True)
y = leaky_relu(x)

print(f"Input:  {x.data}")
print(f"Output: {y.data}")
print(f"Expected: [-0.2, -0.1, 0.0, 1.0, 2.0]")

# Test backward pass
# x == 0 is handled by the `x <= 0` branch, so its expected gradient is the
# slope (0.1), not 1.
y.sum().backward()
print(f"\nGradient: {x.grad}")
print(f"Expected: [0.1, 0.1, 0.1, 1.0, 1.0]")

---

## Exercise 2 Solution: Implement ELU Activation

ELU: f(x) = x if x > 0, else alpha * (exp(x) - 1)

In [None]:
class ELU(Module):
    """
    Exponential Linear Unit.

    f(x) = x                      if x > 0
    f(x) = alpha * (exp(x) - 1)   if x <= 0

    Properties:
    - Continuous everywhere; the derivative is also continuous at 0 when
      alpha == 1 (unlike ReLU)
    - Outputs can be negative (unlike ReLU)
    - Helps push mean activations toward zero
    """

    def __init__(self, alpha=1.0):
        """Store the scale ``alpha`` of the negative branch."""
        super().__init__()
        self.alpha = alpha

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply ELU element-wise.

        Args:
            x: Input tensor.

        Returns:
            A new Tensor holding the activation. When ``x.requires_grad`` is
            true, the result is linked to ``x`` through ``_prev`` and
            ``_backward`` so gradients accumulate into ``x.grad``.
        """
        # Capture alpha once so forward and backward use the same value.
        alpha = self.alpha
        data = x.data
        is_positive = data > 0

        # Only the x <= 0 branch needs exp(x). Clamping the exponent to <= 0
        # keeps exp() from overflowing on large positive inputs; previously
        # that produced inf, and inf * 0 (the mask) became NaN.
        exp_neg = np.exp(np.minimum(data, 0))
        out_data = np.where(is_positive, data, alpha * (exp_neg - 1))

        out = Tensor(out_data, requires_grad=x.requires_grad)

        if x.requires_grad:
            out._prev = {x}
            out._op = 'elu'

            def _backward():
                # Local derivative: 1 for x > 0, alpha * exp(x) for x <= 0.
                # exp_neg equals exp(x) wherever x <= 0, so it is reused here.
                negative_grad = alpha * exp_neg
                local_grad = np.where(
                    is_positive, np.ones_like(negative_grad), negative_grad
                )
                contribution = out.grad * local_grad
                # Accumulate rather than overwrite: x may feed several ops.
                if x.grad is None:
                    x.grad = contribution
                else:
                    x.grad = x.grad + contribution

            out._backward = _backward

        return out

    def __repr__(self):
        return f"ELU(alpha={self.alpha})"

In [None]:
# Test ELU
# Smoke test: prints actual vs. expected forward values for manual comparison
# (nothing is asserted, and the backward pass is not exercised here).
print("Testing ELU")
print("=" * 50)

elu = ELU(alpha=1.0)

# The input covers both branches plus the boundary x == 0.
x = Tensor([-2.0, -1.0, 0.0, 1.0, 2.0], requires_grad=True)
y = elu(x)

print(f"Input:  {x.data}")
print(f"Output: {y.data}")

# Expected: [alpha*(exp(-2)-1), alpha*(exp(-1)-1), 0, 1, 2]
# The leading 1.0 factors are alpha, matching ELU(alpha=1.0) above.
expected = [1.0*(np.exp(-2)-1), 1.0*(np.exp(-1)-1), 0.0, 1.0, 2.0]
print(f"Expected: {expected}")

---

## Exercise 3 Solution: GELU Activation

GELU (Gaussian Error Linear Unit) is used in transformers like BERT and GPT.

In [None]:
class GELU(Module):
    """
    Gaussian Error Linear Unit.

    GELU(x) = x * Phi(x), with Phi the CDF of the standard normal distribution.

    Two modes are available:
    - approximate=True:  0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    - approximate=False: exact form via the error function (imports scipy lazily)

    Used in BERT, GPT-2, and other transformers.
    """

    def __init__(self, approximate=True):
        super().__init__()
        self.approximate = approximate

    def forward(self, x: Tensor) -> Tensor:
        """Apply GELU element-wise and, if ``x`` requires grad, wire up backward."""
        values = x.data
        if not self.approximate:
            # Exact CDF through erf; scipy is only needed on this path.
            from scipy.special import erf
            phi = 0.5 * (1.0 + erf(values / np.sqrt(2.0)))
        else:
            # tanh-based approximation of the CDF.
            coeff = np.sqrt(2.0 / np.pi)
            phi = 0.5 * (1.0 + np.tanh(coeff * (values + 0.044715 * values ** 3)))

        result = Tensor(values * phi, requires_grad=x.requires_grad)

        # No graph bookkeeping needed when the input does not track gradients.
        if not x.requires_grad:
            return result

        def _backward():
            v = x.data
            if not self.approximate:
                # d/dx [x * Phi(x)] = Phi(x) + x * pdf(x)
                from scipy.special import erf
                big_phi = 0.5 * (1.0 + erf(v / np.sqrt(2.0)))
                density = np.exp(-0.5 * v ** 2) / np.sqrt(2 * np.pi)
                local = big_phi + v * density
            else:
                # Product rule + chain rule on the tanh approximation.
                coeff = np.sqrt(2.0 / np.pi)
                t = np.tanh(coeff * (v + 0.044715 * v ** 3))
                sech_sq = 1.0 - t ** 2
                d_inner = coeff * (1 + 3 * 0.044715 * v ** 2)
                local = 0.5 * (1 + t) + 0.5 * v * sech_sq * d_inner

            upstream = result.grad * local
            # Accumulate into any gradient x already holds.
            x.grad = upstream if x.grad is None else x.grad + upstream

        result._prev = {x}
        result._op = 'gelu'
        result._backward = _backward
        return result

    def __repr__(self):
        return f"GELU(approximate={self.approximate})"

In [None]:
# Test GELU
# Smoke test: prints forward values next to reference numbers for manual
# comparison (nothing is asserted, and the backward pass is not exercised here).
print("Testing GELU")
print("=" * 50)

# Exercises the tanh approximation; the exact (erf) path is not covered here.
gelu = GELU(approximate=True)

x = Tensor([-2.0, -1.0, 0.0, 1.0, 2.0], requires_grad=True)
y = gelu(x)

print(f"Input:  {x.data}")
print(f"Output: {y.data}")

# Compare with expected values
# Indices 2, 3 and 1 pick the outputs for inputs 0.0, 1.0 and -1.0 respectively.
print(f"\nGELU(0) = {y.data[2]:.4f} (expected ~0)")
print(f"GELU(1) = {y.data[3]:.4f} (expected ~0.841)")
print(f"GELU(-1) = {y.data[1]:.4f} (expected ~-0.159)")

---

## Exercise 4 Solution: Implement a Custom Layer (Residual Block)

Implement a residual connection: output = F(x) + x. The solution below also applies a final ReLU to the sum, i.e. output = ReLU(F(x) + x).

In [None]:
from micrograd_plus import Linear, ReLU, Sequential

class ResidualBlock(Module):
    """
    Residual block: a small MLP wrapped in a skip connection.

    output = ReLU(F(x) + x)

    F is Linear(features -> features * hidden_factor), ReLU,
    Linear(features * hidden_factor -> features), so its output can be added
    to the input directly.

    Skip connections help with:
    - Gradient flow in deep networks
    - Training deeper models
    - Learning identity mappings when needed
    """

    def __init__(self, features, hidden_factor=4):
        super().__init__()
        width = features * hidden_factor

        # Expand to `width`, apply a non-linearity, project back to `features`.
        layers = (
            Linear(features, width),
            ReLU(),
            Linear(width, features),
        )
        self.block = Sequential(*layers)
        # Activation applied after the skip connection is added.
        self.relu = ReLU()

    def forward(self, x: Tensor) -> Tensor:
        """Return ReLU(F(x) + x) for input ``x``."""
        transformed = self.block(x)
        return self.relu(transformed + x)

    def parameters(self):
        """Return the parameters of the inner block."""
        return self.block.parameters()

    def __repr__(self):
        return f"ResidualBlock({self.block})"

In [None]:
# Test Residual Block
# Smoke test: prints shapes, the parameter count and whether a gradient
# reached the input (nothing is asserted).
print("Testing ResidualBlock")
print("=" * 50)

# Seed NumPy's global RNG so the random input below is reproducible.
np.random.seed(42)
block = ResidualBlock(features=8)

# Last dimension (8) matches `features`, as required for the skip addition.
x = Tensor(np.random.randn(4, 8).astype(np.float32), requires_grad=True)
y = block(x)

print(f"Input shape:  {x.shape}")
print(f"Output shape: {y.shape}")
# Total number of scalar values across all parameter arrays.
print(f"Parameters: {sum(p.data.size for p in block.parameters())}")

# Test gradient flow
y.sum().backward()
print(f"Input gradient exists: {x.grad is not None}")

---

## Challenge Solution: Multi-Head Attention Layer

Implement multi-head scaled dot-product attention, the mechanism used in transformers.

In [None]:
class MultiHeadAttention(Module):
    """
    Multi-Head Attention mechanism from "Attention Is All You Need".

    Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V

    Multi-head allows the model to jointly attend to information
    from different representation subspaces.
    """

    def __init__(self, embed_dim, num_heads):
        """
        Args:
            embed_dim: Size of the last dimension of the inputs and outputs.
            num_heads: Number of attention heads; must divide ``embed_dim``.
        """
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        # 1 / sqrt(d_k), applied to the raw attention scores.
        self.scale = self.head_dim ** -0.5

        # Linear projections for Q, K, V and the final output
        self.q_proj = Linear(embed_dim, embed_dim)
        self.k_proj = Linear(embed_dim, embed_dim)
        self.v_proj = Linear(embed_dim, embed_dim)
        self.out_proj = Linear(embed_dim, embed_dim)

    def _split_heads(self, tensor, batch_size, length):
        """Reshape (batch, length, embed) into (batch, heads, length, head_dim)."""
        tensor = tensor.reshape(batch_size, length, self.num_heads, self.head_dim)
        return tensor.transpose(0, 2, 1, 3)

    def forward(self, query: Tensor, key: Tensor = None, value: Tensor = None) -> Tensor:
        """
        Compute multi-head attention.

        Self-attention is used when ``key``/``value`` are not provided (each
        defaults to ``query``).

        Args:
            query: Tensor of shape (batch, target_len, embed_dim).
            key: Optional tensor of shape (batch, source_len, embed_dim).
            value: Optional tensor of shape (batch, source_len, embed_dim).

        Returns:
            Tensor of shape (batch, target_len, embed_dim).

        Raises:
            ValueError: If ``key`` and ``value`` have different sequence lengths.
        """
        if key is None:
            key = query
        if value is None:
            value = query

        batch_size, query_len, _ = query.shape
        # Key/value may have their own sequence length (cross-attention), so
        # it must not be assumed equal to the query length when reshaping.
        _, key_len, _ = key.shape
        _, value_len, _ = value.shape
        if key_len != value_len:
            raise ValueError(
                f"key and value must have the same sequence length, "
                f"got {key_len} and {value_len}"
            )

        # Project, then split into heads: (batch, heads, len, head_dim)
        Q = self._split_heads(self.q_proj(query), batch_size, query_len)
        K = self._split_heads(self.k_proj(key), batch_size, key_len)
        V = self._split_heads(self.v_proj(value), batch_size, value_len)

        # Scaled attention scores: (batch, heads, query_len, key_len)
        scores = (Q @ K.transpose(0, 1, 3, 2)) * self.scale

        # Normalize over the key dimension
        attn_weights = scores.softmax(axis=-1)

        # Weighted sum of values: (batch, heads, query_len, head_dim)
        attn_output = attn_weights @ V

        # Merge heads back: (batch, query_len, embed)
        attn_output = attn_output.transpose(0, 2, 1, 3)
        attn_output = attn_output.reshape(batch_size, query_len, self.embed_dim)

        # Final projection
        return self.out_proj(attn_output)

    def parameters(self):
        """Return the parameters of the Q, K, V and output projections, in that order."""
        projections = (self.q_proj, self.k_proj, self.v_proj, self.out_proj)
        return [param for proj in projections for param in proj.parameters()]

    def __repr__(self):
        return f"MultiHeadAttention(embed_dim={self.embed_dim}, num_heads={self.num_heads})"

In [None]:
# Test Multi-Head Attention
# Smoke test: prints shapes, the parameter count and whether a gradient
# reached the input (nothing is asserted). Only self-attention is exercised.
print("Testing MultiHeadAttention")
print("=" * 50)

# Seed NumPy's global RNG so the random input below is reproducible.
np.random.seed(42)
# 64 / 8 heads gives head_dim = 8.
mha = MultiHeadAttention(embed_dim=64, num_heads=8)

# Input: (batch, sequence_length, embed_dim)
x = Tensor(np.random.randn(2, 10, 64).astype(np.float32), requires_grad=True)
# key/value are omitted, so forward() uses x for all three (self-attention).
y = mha(x)

print(f"Input shape:  {x.shape}")
print(f"Output shape: {y.shape}")
# Total number of scalar values across all parameter arrays.
print(f"Parameters: {sum(p.data.size for p in mha.parameters())}")

# Test gradient flow
y.sum().backward()
print(f"Gradient computed: {x.grad is not None}")

---

## Key Takeaways

1. **Custom Activations**: Can be implemented by defining forward/backward with proper gradient computation

2. **LeakyReLU**: Allows small negative gradients (prevents dying ReLU)

3. **ELU**: Smooth activation with negative values to center activations

4. **GELU**: Gaussian-gated activation used in modern transformers

5. **Residual Blocks**: Skip connections enable training much deeper networks

6. **Multi-Head Attention**: Core building block of transformers