# Module 02: Supervised Learning Fundamentals

This notebook covers the core concepts of supervised learning that form the foundation for training neural networks.

## Learning Objectives
- Understand loss functions and their role in training
- Implement gradient descent from scratch
- Learn about backpropagation
- Explore common optimizers (SGD, Adam, AdamW)
- Build a complete training loop

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset

# Seed NumPy and PyTorch up front so the stochastic cells below produce the
# same numbers on every Restart Kernel -> Run All.
np.random.seed(42)
torch.manual_seed(42)

print("[OK] Libraries loaded")

## 1. Loss Functions

Loss functions measure how far our predictions are from the targets. They provide the signal that guides learning.

In [None]:
# Mean Squared Error (MSE) for regression
def mse_loss(predictions, targets):
    """Return the mean of the element-wise squared differences.

    Args:
        predictions: Model outputs.
        targets: Ground-truth values. They are combined with ``predictions``
            by subtraction, so the usual broadcasting rules apply.

    Returns:
        The result of ``.mean()`` over the squared differences.
    """
    squared_error = (predictions - targets) ** 2
    return squared_error.mean()

# Cross-Entropy for classification
def cross_entropy_loss(logits, targets):
    """Compute the mean cross-entropy between raw logits and class indices.

    Equivalent to ``nn.CrossEntropyLoss()`` with default settings: a
    log-softmax over the class dimension followed by the negative
    log-likelihood loss.

    Args:
        logits: Unnormalized scores shaped ``(C,)``, ``(N, C)`` or
            ``(N, C, d1, ...)``, where ``C`` is the number of classes.
        targets: Integer class indices shaped ``()``, ``(N,)`` or
            ``(N, d1, ...)`` respectively.

    Returns:
        A scalar tensor holding the loss averaged over all target elements.
    """
    # F.nll_loss reads classes from dim 1 for batched input (dim 0 for an
    # unbatched 1-D input). Normalizing over the last dim is only correct when
    # the class dim happens to be last, so select it explicitly.
    class_dim = 0 if logits.dim() == 1 else 1
    log_probs = F.log_softmax(logits, dim=class_dim)
    return F.nll_loss(log_probs, targets)

# Sanity check: the hand-written loss should agree with PyTorch's built-in.
predictions = torch.randn(10, 5)      # raw scores for 10 samples over 5 classes
targets = torch.randint(0, 5, (10,))  # integer class labels in [0, 5)

# Both losses see the same tensors; neither call draws random numbers,
# so the evaluation order does not matter.
torch_loss = F.cross_entropy(predictions, targets)
our_loss = cross_entropy_loss(predictions, targets)

print(f"Our CE Loss: {our_loss.item():.4f}")
print(f"PyTorch CE Loss: {torch_loss.item():.4f}")
print(f"[OK] Difference: {abs(our_loss - torch_loss).item():.6f}")

## 2. Gradient Descent

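Gradient descent iteratively moves the parameters in the direction of the negative gradient, which locally reduces the loss: $\theta \leftarrow \theta - \eta \, \nabla_\theta L(\theta)$, where $\eta$ is the learning rate.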

In [None]:
# Visualize gradient descent on a simple 1D quadratic function
def quadratic(x):
    """Evaluate f(x) = x^2 + 2x + 1, i.e. (x + 1)^2, whose minimum is at x = -1.

    Works on anything supporting ``**``, ``*`` and ``+`` (scalars or arrays).
    """
    square_term = x**2
    linear_term = 2*x
    return square_term + linear_term + 1

def gradient(x):
    """Return 2x + 2, the derivative of ``x**2 + 2*x + 1`` with respect to x."""
    doubled = 2*x
    return doubled + 2

# Gradient descent: step against the slope 20 times, recording every iterate
learning_rate = 0.1
x = 3.0  # starting point
history = [x]

for _ in range(20):
    grad = gradient(x)
    x -= learning_rate * grad
    history.append(x)

# Left panel: iterates drawn on the curve (darker red = later step).
# Right panel: x value per iteration against the known optimum.
x_range = np.linspace(-4, 4, 100)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

ax1.plot(x_range, quadratic(x_range), 'b-', label='f(x) = (x+1)^2')
ax1.scatter(
    history,
    list(map(quadratic, history)),
    c=range(len(history)),
    cmap='Reds',
    s=50,
    zorder=5,
)
ax1.set(xlabel='x', ylabel='f(x)', title='Gradient Descent Path')
ax1.legend()

ax2.plot(history, 'o-')
ax2.axhline(-1, color='g', linestyle='--', label='Optimum (x=-1)')
ax2.set(xlabel='Iteration', ylabel='x value', title='Convergence to Minimum')
ax2.legend()

plt.tight_layout()
plt.show()
print(f"[OK] Converged to x = {history[-1]:.4f} (target: -1.0)")

## 3. Backpropagation

Backpropagation computes gradients by applying the chain rule through the network.

In [None]:
# Simple 2-layer network for demonstration
class SimpleMLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output units.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names double as state_dict keys; creation order fixes the
        # order in which the layers draw their initial weights.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw (unnormalized) outputs of the second linear layer."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

# Tiny model plus one random batch: 4 samples, 10 features, 3 classes
model = SimpleMLP(10, 32, 3)
x = torch.randn(4, 10)
y = torch.randint(0, 3, (4,))

# Forward pass: inputs -> logits -> scalar loss
output = model(x)
loss = F.cross_entropy(output, y)

# Backward pass: autograd populates .grad on the parameters
loss.backward()

# Report the shape of every gradient that was populated
print("Gradient shapes:")
for name, param in model.named_parameters():
    if param.grad is None:
        continue
    print(f"  {name}: {param.grad.shape}")

print(f"\n[OK] Loss: {loss.item():.4f}")

## 4. Optimizers

Different optimizers use gradients in different ways to update parameters.

In [None]:
# Compare optimizers on a simple problem
def train_with_optimizer(optimizer_class, lr=0.01, epochs=100, seed=42):
    """Fit a linear model to synthetic regression data and return the loss curve.

    The data (100 samples, 5 features, Gaussian noise scaled by 0.1) and the
    model's initial weights are regenerated on every call from ``seed``, so
    runs that share a seed differ only in the optimizer being used.

    Note:
        This calls ``torch.manual_seed(seed)``, which resets PyTorch's
        *global* RNG as a side effect.

    Args:
        optimizer_class: Optimizer constructor accepting
            ``(params, lr=...)``, e.g. ``torch.optim.SGD``.
        lr: Learning rate passed to the optimizer.
        epochs: Number of full-batch update steps.
        seed: Seed for data generation and weight initialization.

    Returns:
        A list of ``epochs`` floats: the MSE measured before each update.
    """
    torch.manual_seed(seed)

    # Synthetic linear regression problem: y = X @ true_w + noise
    X = torch.randn(100, 5)
    true_w = torch.randn(5, 1)
    y = X @ true_w + 0.1 * torch.randn(100, 1)

    model = nn.Linear(5, 1)
    optimizer = optimizer_class(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    losses = []
    for _ in range(epochs):
        optimizer.zero_grad()  # gradients accumulate unless cleared
        loss = criterion(model(X), y)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

    return losses

# One run per optimizer, all at the same learning rate
sgd_losses = train_with_optimizer(torch.optim.SGD, lr=0.01)
adam_losses = train_with_optimizer(torch.optim.Adam, lr=0.01)
adamw_losses = train_with_optimizer(torch.optim.AdamW, lr=0.01)

# Overlay the three loss curves; the log scale keeps late-epoch differences visible
plt.figure(figsize=(10, 5))
for _label, _curve in (('SGD', sgd_losses), ('Adam', adam_losses), ('AdamW', adamw_losses)):
    plt.plot(_curve, label=_label)
plt.yscale('log')
plt.title('Optimizer Comparison')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

print(f"[OK] Final losses - SGD: {sgd_losses[-1]:.4f}, Adam: {adam_losses[-1]:.4f}, AdamW: {adamw_losses[-1]:.4f}")

## 5. Complete Training Loop

Putting it all together with a proper train/validation split.

In [None]:
# Generate synthetic classification data
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_classes=3,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the NumPy arrays to tensors: float32 features, int64 class labels
X_train = torch.tensor(X_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_val = torch.tensor(y_val, dtype=torch.long)

# Wrap in datasets/loaders; only the training loader is shuffled
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)

print(f"[OK] Data loaded: {len(X_train)} train, {len(X_val)} validation samples")

In [None]:
# Training loop: 50 epochs of mini-batch AdamW, with a validation pass after each
model = SimpleMLP(20, 64, 3)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Per-epoch history, consumed by the plotting cell
train_losses = []
val_losses = []
val_accuracies = []

for epoch in range(50):
    # Training
    model.train()
    epoch_loss = 0.0
    seen = 0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        # criterion returns the batch *mean*; weight it by the batch size so a
        # short final batch does not count as much as a full one.
        epoch_loss += loss.item() * batch_y.size(0)
        seen += batch_y.size(0)
    train_losses.append(epoch_loss / seen)

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():  # no gradients needed for evaluation
        for batch_x, batch_y in val_loader:
            output = model(batch_x)
            val_loss += criterion(output, batch_y).item() * batch_y.size(0)
            predicted = output.argmax(dim=1)
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

    # Per-sample averages over the whole validation set
    val_losses.append(val_loss / total)
    val_accuracies.append(100 * correct / total)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}: Train Loss={train_losses[-1]:.4f}, Val Loss={val_losses[-1]:.4f}, Val Acc={val_accuracies[-1]:.1f}%")

print(f"\n[OK] Training complete. Final validation accuracy: {val_accuracies[-1]:.1f}%")

In [None]:
# Training curves: losses on the left, validation accuracy on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Train and validation loss share one axis so divergence is easy to spot
ax1.plot(train_losses, label='Train')
ax1.plot(val_losses, label='Validation')
ax1.set(xlabel='Epoch', ylabel='Loss', title='Loss Curves')
ax1.legend()

ax2.plot(val_accuracies)
ax2.set(xlabel='Epoch', ylabel='Accuracy (%)', title='Validation Accuracy')

plt.tight_layout()
plt.show()

## 6. Regularization

Regularization techniques help prevent overfitting.

In [None]:
# Dropout example
class RegularizedMLP(nn.Module):
    """Two-layer perceptron with dropout: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output units.
        dropout: Probability passed to ``nn.Dropout`` (default 0.5).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.5):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw outputs of the second linear layer."""
        hidden = F.relu(self.fc1(x))
        # nn.Dropout only masks activations in train() mode; in eval() mode
        # it passes its input through unchanged.
        return self.fc2(self.dropout(hidden))

# Weight decay via the optimizer. AdamW applies *decoupled* weight decay: the
# weights are shrunk directly at each step instead of adding an L2 term to the
# gradient. The two coincide (up to scaling) for plain SGD, but not for
# adaptive optimizers such as Adam.
model = RegularizedMLP(20, 64, 3, dropout=0.3)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)  # decoupled weight decay

# Read the reported values back from the objects so the text cannot drift
# away from the configuration above.
print("[OK] Regularization techniques:")
print(f"  - Dropout: {model.dropout.p:.0%} (randomly zeros activations during training)")
print(f"  - Weight Decay: {optimizer.defaults['weight_decay']} (decoupled decay applied to the weights by AdamW)")

## Summary

In this notebook, we covered:

1. **Loss Functions**: MSE for regression, Cross-Entropy for classification
2. **Gradient Descent**: Iteratively updating parameters to minimize loss
3. **Backpropagation**: Computing gradients through the chain rule
4. **Optimizers**: SGD, Adam, AdamW and their differences
5. **Training Loop**: Combining everything with train/val split
6. **Regularization**: Dropout and weight decay to prevent overfitting

## Next Steps

- [->] Module 03: Graph Theory Basics
- [->] Module 04: Small-World Networks
- [->] Module 05: Sparse Neural Networks