# Lab 1.7.3 Solutions: Loss Functions and Optimizers

This notebook contains solutions to the exercises from Lab 1.7.3.

---

In [None]:
import numpy as np
import sys
from pathlib import Path

def _find_module_root():
    """Locate the directory that contains the ``micrograd_plus`` package.

    Starting at the current working directory and moving up through its
    ancestors, the first directory that holds
    ``micrograd_plus/__init__.py`` wins. When no such directory exists,
    the parent of the current working directory is used instead.

    Returns:
        The selected directory as a string (ready for ``sys.path``).
    """
    here = Path.cwd()
    marker = Path('micrograd_plus') / '__init__.py'
    candidates = (here, *here.parents)
    match = next((c for c in candidates if (c / marker).exists()), None)
    return str(here.parent if match is None else match)

sys.path.insert(0, _find_module_root())

from micrograd_plus import Tensor, Linear, ReLU, Sequential
from micrograd_plus.losses import Loss
from micrograd_plus.optimizers import Optimizer

## Exercise 1 Solution: Focal Loss

Focal Loss addresses class imbalance by down-weighting easy examples.

FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t)

In [None]:
class FocalLoss(Loss):
    """
    Focal Loss for handling class imbalance.

    FL(p_t) = -alpha * (1 - p_t)^gamma * log(p_t)

    Where:
    - p_t is the softmax probability assigned to the correct class
    - alpha is a weighting factor multiplied into every sample's loss
    - gamma is the focusing parameter

    When gamma=0 (with alpha=1), this reduces to standard cross-entropy.
    Higher gamma focuses more on hard examples.
    """

    def __init__(self, alpha=1.0, gamma=2.0, reduction='mean'):
        """
        Args:
            alpha: Weighting factor applied to the loss.
            gamma: Focusing exponent applied to (1 - p_t).
            reduction: 'mean' or 'sum' to aggregate over the batch; any
                other value leaves the per-sample losses unreduced.
        """
        super().__init__(reduction)
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, predictions: Tensor, targets: Tensor) -> Tensor:
        """
        Compute the focal loss from raw logits.

        Args:
            predictions: Logits, indexed here as (batch, classes).
            targets: One class index per sample (cast to int32).

        Returns:
            A Tensor holding the reduced (or per-sample) loss. A backward
            closure is attached only when ``predictions.requires_grad``.
        """
        logits = predictions.data

        # Numerically stable softmax: subtracting the row max leaves the
        # result unchanged but keeps exp() from overflowing.
        logits_max = np.max(logits, axis=-1, keepdims=True)
        exp_logits = np.exp(logits - logits_max)
        probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)

        # Probability of the correct class for each sample
        batch_size = logits.shape[0]
        target_indices = targets.data.astype(np.int32)
        rows = np.arange(batch_size)
        p_t = probs[rows, target_indices]

        # Modulating factor (1 - p_t)^gamma and the underlying cross-entropy.
        # The 1e-8 keeps log() finite when p_t underflows to 0.
        focal_weight = (1 - p_t) ** self.gamma
        log_p_t = np.log(p_t + 1e-8)
        focal_loss = self.alpha * focal_weight * (-log_p_t)

        # Reduction
        if self.reduction == 'mean':
            loss_value = np.mean(focal_loss)
        elif self.reduction == 'sum':
            loss_value = np.sum(focal_loss)
        else:
            loss_value = focal_loss

        out = Tensor(loss_value, requires_grad=predictions.requires_grad)

        if predictions.requires_grad:
            out._prev = {predictions}
            out._op = 'focal_loss'

            def _backward():
                # Cross-entropy part: softmax - one_hot(target)
                grad = probs.copy()
                grad[rows, target_indices] -= 1

                # Chain rule through FL = -alpha * (1 - p_t)^gamma * log(p_t)
                # with dp_t/dz_j = p_t * (1[j == t] - p_j):
                #   dFL/dz_j = alpha * [(1 - p_t)^gamma
                #                       - gamma * p_t * (1 - p_t)^(gamma-1) * log(p_t)]
                #              * (p_j - 1[j == t])
                # log(p_t) <= 0, so the second term must be SUBTRACTED; it
                # then enlarges the per-sample scale rather than shrinking it.
                # Flooring (1 - p_t) avoids 0 ** negative when gamma < 1.
                one_minus = np.maximum(1 - p_t, 1e-8)
                scale = self.alpha * (
                    focal_weight
                    - self.gamma * one_minus ** (self.gamma - 1) * p_t * log_p_t
                )
                grad *= scale[:, np.newaxis]

                upstream = out.grad
                if self.reduction == 'mean':
                    grad /= batch_size
                elif self.reduction != 'sum':
                    # Unreduced output: assumes out.grad carries one value
                    # per sample, so line it up with the rows of grad.
                    upstream = np.reshape(upstream, (-1, 1))

                contribution = upstream * grad
                if predictions.grad is None:
                    predictions.grad = contribution
                else:
                    predictions.grad = predictions.grad + contribution

            out._backward = _backward

        return out

In [None]:
# Exercise FocalLoss on one confident and one uncertain prediction
print("Testing Focal Loss")
print("=" * 50)

# Both samples belong to class 0; only the model's confidence differs
targets = Tensor([0])
logits_easy = Tensor([[5.0, 0.0, 0.0]])  # strongly favours class 0
logits_hard = Tensor([[0.5, 0.3, 0.2]])  # close to uniform

# gamma=0 switches the focusing term off, leaving plain cross-entropy
ce = FocalLoss(gamma=0.0)
focal = FocalLoss(gamma=2.0)

for heading, logits in (
    ("Easy example (high confidence):", logits_easy),
    ("\nHard example (low confidence):", logits_hard),
):
    print(heading)
    print(f"  Cross-Entropy: {ce(logits, targets).data:.4f}")
    print(f"  Focal Loss:    {focal(logits, targets).data:.4f}")

print("\nFocal loss focuses more on hard examples!")

---

## Exercise 2 Solution: Smooth L1 Loss (Huber Loss)

Combines L1 and L2 loss: it is quadratic for small errors and linear for large ones, which makes it robust to outliers.

In [None]:
class SmoothL1Loss(Loss):
    """
    Smooth L1 Loss (Huber-style loss).

    With x = prediction - target:

    L(x) = 0.5 * x^2 / beta    if |x| < beta
    L(x) = |x| - 0.5 * beta    otherwise

    The two pieces agree in value and slope at |x| = beta. This is the
    Huber loss with delta = beta divided by beta, so the two coincide
    for beta = 1. With beta = 0 the loss degenerates to plain L1.

    Benefits:
    - Less sensitive to outliers than L2
    - Smoother than L1 near zero
    - Used in object detection (Faster R-CNN)
    """

    def __init__(self, beta=1.0, reduction='mean'):
        """
        Args:
            beta: Non-negative threshold between the quadratic and the
                linear region.
            reduction: 'mean' or 'sum' to aggregate; any other value
                leaves the element-wise losses unreduced.

        Raises:
            ValueError: If ``beta`` is negative.
        """
        super().__init__(reduction)
        if beta < 0:
            raise ValueError(f"beta must be non-negative, got {beta}")
        self.beta = beta

    def forward(self, predictions: Tensor, targets: Tensor) -> Tensor:
        """
        Compute the smooth L1 loss between predictions and targets.

        Returns:
            A Tensor holding the reduced (or element-wise) loss. A backward
            closure is attached only when ``predictions.requires_grad``.
        """
        diff = predictions.data - targets.data
        abs_diff = np.abs(diff)

        # Elements inside the quadratic region
        small_mask = abs_diff < self.beta

        if self.beta > 0:
            loss = np.where(
                small_mask,
                0.5 * diff ** 2 / self.beta,
                abs_diff - 0.5 * self.beta
            )
        else:
            # beta == 0: the quadratic region is empty, so skip the
            # division by zero that np.where would still evaluate.
            loss = abs_diff

        # Reduction
        if self.reduction == 'mean':
            loss_value = np.mean(loss)
        elif self.reduction == 'sum':
            loss_value = np.sum(loss)
        else:
            loss_value = loss

        out = Tensor(loss_value, requires_grad=predictions.requires_grad)

        if predictions.requires_grad:
            out._prev = {predictions}
            out._op = 'smooth_l1'

            def _backward():
                # Gradient: x/beta if |x| < beta, else sign(x)
                if self.beta > 0:
                    grad = np.where(
                        small_mask,
                        diff / self.beta,
                        np.sign(diff)
                    )
                else:
                    grad = np.sign(diff)

                if self.reduction == 'mean':
                    grad = grad / predictions.data.size

                contribution = out.grad * grad
                if predictions.grad is None:
                    predictions.grad = contribution
                else:
                    predictions.grad = predictions.grad + contribution

            out._backward = _backward

        return out

In [None]:
# Compare SmoothL1Loss with MSELoss on data that contains an outlier
print("Testing Smooth L1 Loss")
print("=" * 50)

from micrograd_plus import MSELoss

target = Tensor([0.0, 0.0, 0.0, 0.0])
pred = Tensor([0.0, 1.0, 5.0, 10.0], requires_grad=True)  # 10.0 is the outlier

smooth_l1, mse = SmoothL1Loss(beta=1.0), MSELoss()
loss_smooth = smooth_l1(pred, target)
loss_mse = mse(pred, target)

# Assemble the report first, then emit it in a single print
report = [
    f"Predictions: {pred.data}",
    f"Targets:     {target.data}",
    f"\nSmooth L1 Loss: {loss_smooth.data:.4f}",
    f"MSE Loss:       {loss_mse.data:.4f}",
    "\nSmooth L1 is less affected by the outlier (10.0)!",
]
print("\n".join(report))

---

## Exercise 3 Solution: Contrastive Loss

Used for learning embeddings (Siamese networks).

In [None]:
class ContrastiveLoss(Loss):
    """
    Contrastive Loss for learning embeddings.

    L = (1-Y) * 0.5 * D^2 + Y * 0.5 * max(0, margin - D)^2

    Where:
    - D is the Euclidean distance between the two embeddings
    - Y is 1 for dissimilar pairs, 0 for similar pairs
    - margin is the minimum distance for dissimilar pairs

    Used in:
    - Face verification (Siamese networks)
    - Learning similarity metrics
    """

    def __init__(self, margin=1.0, reduction='mean'):
        """
        Args:
            margin: Distance below which dissimilar pairs are penalised.
            reduction: 'mean' or 'sum' to aggregate over pairs; any other
                value leaves the per-pair losses unreduced.
        """
        super().__init__(reduction)
        self.margin = margin

    def forward(self, embedding1: Tensor, embedding2: Tensor, labels: Tensor) -> Tensor:
        """
        Args:
            embedding1: First set of embeddings (batch, dim)
            embedding2: Second set of embeddings (batch, dim)
            labels: 0 for similar, 1 for dissimilar pairs

        Returns:
            A Tensor holding the reduced (or per-pair) loss. A backward
            closure is attached when either embedding requires gradients.
        """
        # Euclidean distance; the 1e-8 keeps sqrt() away from exactly 0
        diff = embedding1.data - embedding2.data
        distance_sq = np.sum(diff ** 2, axis=-1)
        distance = np.sqrt(distance_sq + 1e-8)

        # Contrastive loss
        y = labels.data
        margin_distance = np.maximum(0, self.margin - distance)

        loss = (1 - y) * 0.5 * distance_sq + y * 0.5 * margin_distance ** 2

        # Reduction
        if self.reduction == 'mean':
            loss_value = np.mean(loss)
        elif self.reduction == 'sum':
            loss_value = np.sum(loss)
        else:
            loss_value = loss

        # Track gradients if EITHER input needs them, not only the first
        needs_grad = embedding1.requires_grad or embedding2.requires_grad
        out = Tensor(loss_value, requires_grad=needs_grad)

        if needs_grad:
            out._prev = {embedding1, embedding2}
            out._op = 'contrastive'

            def _backward():
                # Similar pairs: d(0.5 * D^2)/d(e1) = diff
                similar_grad = (1 - y)[:, np.newaxis] * diff

                # Dissimilar pairs: d(0.5 * (margin - D)^2)/d(e1)
                #   = -(margin - D) * diff / D
                # margin_distance is already 0 once D >= margin, so pairs
                # beyond the margin contribute nothing.
                dissimilar_grad = (
                    -margin_distance[:, np.newaxis] * diff
                    / (distance[:, np.newaxis] + 1e-8)
                )
                # Out-of-place arithmetic so integer inputs cannot trip an
                # in-place casting error.
                grad1 = similar_grad + y[:, np.newaxis] * dissimilar_grad

                upstream = out.grad
                if self.reduction == 'mean':
                    # Same element count that np.mean(loss) averaged over
                    grad1 = grad1 / loss.size
                elif self.reduction != 'sum':
                    # Unreduced output: assumes out.grad carries one value
                    # per pair, so line it up with the rows of grad1.
                    upstream = np.reshape(upstream, (-1, 1))

                contribution = upstream * grad1

                # The loss depends on e1 - e2, so e2 receives the negated
                # gradient. Only touch tensors that asked for gradients.
                if embedding1.requires_grad:
                    if embedding1.grad is None:
                        embedding1.grad = contribution
                    else:
                        embedding1.grad = embedding1.grad + contribution
                if embedding2.requires_grad:
                    if embedding2.grad is None:
                        embedding2.grad = -contribution
                    else:
                        embedding2.grad = embedding2.grad - contribution

            out._backward = _backward

        return out

In [None]:
# Exercise ContrastiveLoss on a near-duplicate batch and an unrelated batch
print("Testing Contrastive Loss")
print("=" * 50)

np.random.seed(42)

batch, dim = 4, 8

# Draw order matters for reproducibility: anchor, perturbation, unrelated
e1 = Tensor(np.random.randn(batch, dim).astype(np.float32), requires_grad=True)
noise = 0.1 * np.random.randn(batch, dim).astype(np.float32)
e2_similar = Tensor(e1.data + noise)  # small perturbation of e1
e2_dissimilar = Tensor(np.random.randn(batch, dim).astype(np.float32))  # independent draw

labels_similar = Tensor(np.zeros(batch))  # 0 = similar
labels_dissimilar = Tensor(np.ones(batch))  # 1 = dissimilar

contrastive = ContrastiveLoss(margin=1.0)

loss_similar = contrastive(e1, e2_similar, labels_similar)
loss_dissimilar = contrastive(e1, e2_dissimilar, labels_dissimilar)

print(
    f"Similar pair loss:    {loss_similar.data:.4f} (should be small)",
    f"Dissimilar pair loss: {loss_dissimilar.data:.4f}",
    sep="\n",
)

---

## Exercise 4 Solution: Implement AdaGrad Optimizer

AdaGrad adapts learning rates based on historical gradients.

In [None]:
class Adagrad(Optimizer):
    """
    Adagrad optimizer.

    Keeps a running sum of squared gradients for every parameter and
    scales each update by the inverse square root of that sum.

    Update rule:
    G_t = G_{t-1} + grad^2
    param = param - lr * grad / (sqrt(G_t) + eps)

    Pros:
    - Adapts learning rate per parameter
    - Good for sparse data

    Cons:
    - Learning rate decays too aggressively over time
    """

    def __init__(self, params, lr=0.01, eps=1e-8, weight_decay=0.0):
        """
        Args:
            params: Parameters to optimise (forwarded to the base class).
            lr: Base learning rate (forwarded to the base class).
            eps: Constant added to the denominator for numerical stability.
            weight_decay: L2 penalty coefficient; 0 disables it.
        """
        super().__init__(params, lr)
        self.eps = eps
        self.weight_decay = weight_decay

        # One accumulator of squared gradients per parameter
        self.sum_sq = [np.zeros_like(p.data) for p in self.params]

    def step(self):
        """Apply one Adagrad update to every parameter that has a gradient."""
        for param, accumulator in zip(self.params, self.sum_sq):
            grad = param.grad
            if grad is None:
                continue

            # L2 regularisation enters as an extra gradient term
            if self.weight_decay != 0:
                grad = grad + self.weight_decay * param.data

            # In-place add updates the array stored in self.sum_sq
            accumulator += grad ** 2

            scaled_step = self.lr * grad / (np.sqrt(accumulator) + self.eps)
            param.data -= scaled_step

In [None]:
# Minimise a simple quadratic bowl with Adagrad
print("Testing Adagrad Optimizer")
print("=" * 50)

np.random.seed(42)

# Objective: f(x, y) = (x - 3)^2 + (y - 4)^2, minimised at (3, 4)
x = Tensor([0.0], requires_grad=True)
y = Tensor([0.0], requires_grad=True)

optimizer = Adagrad([x, y], lr=1.0)
num_steps = 100

print("Optimizing f(x,y) = (x-3)^2 + (y-4)^2")
print("Initial: x={:.2f}, y={:.2f}".format(x.item(), y.item()))

for _step in range(num_steps):
    loss = (x - 3) ** 2 + (y - 4) ** 2
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print("Final:   x={:.2f}, y={:.2f}".format(x.item(), y.item()))
print("Expected: x=3.00, y=4.00")

---

## Challenge Solution: Learning Rate Warmup + Decay

Common in training transformers: linear warmup followed by cosine decay.

In [None]:
class WarmupCosineScheduler:
    """
    Learning rate scheduler with linear warmup and cosine decay.

    Used in training transformers (BERT, GPT, etc.).

    Schedule:
    1. Linear warmup: the rate rises linearly and reaches max_lr at step
       warmup_steps (step k of the warmup yields max_lr * k / warmup_steps)
    2. Cosine decay: max_lr -> min_lr over the remaining steps
    3. After total_steps the rate stays at min_lr

    max_lr is read from ``optimizer.lr`` at construction time. The
    optimizer's rate is only changed when ``step()`` is called.
    """

    def __init__(self, optimizer, warmup_steps, total_steps, min_lr=0.0):
        """
        Args:
            optimizer: Any object with a writable ``lr`` attribute.
            warmup_steps: Number of warmup steps; 0 skips the warmup.
            total_steps: Step at which the decay reaches min_lr.
            min_lr: Floor of the cosine decay.
        """
        self.optimizer = optimizer
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.min_lr = min_lr
        self.max_lr = optimizer.lr
        self.current_step = 0

    def step(self):
        """Advance one step, write the new rate to the optimizer, return it."""
        self.current_step += 1

        if self.current_step <= self.warmup_steps:
            # Linear warmup
            lr = self.max_lr * (self.current_step / self.warmup_steps)
        else:
            decay_steps = self.total_steps - self.warmup_steps
            if decay_steps > 0:
                # Clamp at 1 so stepping past total_steps holds min_lr
                # instead of riding the cosine back up.
                progress = min(
                    1.0, (self.current_step - self.warmup_steps) / decay_steps
                )
            else:
                # No room for a decay phase: go straight to the floor
                # rather than dividing by zero.
                progress = 1.0
            lr = self.min_lr + 0.5 * (self.max_lr - self.min_lr) * (1 + np.cos(np.pi * progress))

        self.optimizer.lr = lr
        return lr

    def get_lr(self):
        """Return the learning rate currently set on the optimizer."""
        return self.optimizer.lr

In [None]:
# Plot the learning rate produced by the warmup + cosine schedule
import matplotlib.pyplot as plt

x = Tensor([0.0], requires_grad=True)
opt = Adagrad([x], lr=0.001)

scheduler = WarmupCosineScheduler(opt, warmup_steps=100, total_steps=1000, min_lr=1e-6)

# Each call advances the schedule by one step and returns the new rate
lrs = [scheduler.step() for _ in range(1000)]

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(lrs)
ax.set_xlabel('Step')
ax.set_ylabel('Learning Rate')
ax.set_title('Warmup + Cosine Decay Schedule')
ax.axvline(x=100, color='r', linestyle='--', label='End of warmup')
ax.legend()
ax.grid(True)
plt.show()

---

## Key Takeaways

1. **Focal Loss**: Down-weights easy examples, focuses on hard ones. Great for class imbalance.

2. **Smooth L1 (Huber)**: Combines L1/L2, robust to outliers.

3. **Contrastive Loss**: Learns embeddings where similar items are close together and dissimilar items are far apart.

4. **Adagrad**: Adapts learning rate per parameter based on gradient history.

5. **Learning Rate Schedules**: Warmup prevents early training instability, decay improves convergence.