
# MLF Week 4: Neural Networks Part 2 - Training (Solutions)


Reference version with the exercise code filled in.


## 0. Setup

In [None]:
from utils import *

import math, random, os, sys, time
from typing import Tuple, List

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

import matplotlib.pyplot as plt

# Seed every random number generator the notebook uses (PyTorch, Python's
# `random`, NumPy) so a fresh Restart & Run All repeats the same numbers.
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device



## 1. Backpropagation (Intuition)


Autograd records the computation graph during the forward pass. Calling `loss.backward()` applies the chain rule through that graph and populates gradients for leaf tensors.


### 1.1 Autograd mini demo

In [None]:
# requires_grad=True makes x a leaf tensor whose gradient autograd will fill in.
x = torch.tensor([2.0], requires_grad=True)
# y = (3x + 2)^2 / 2, so analytically dy/dx = 3 * (3x + 2) = 24 at x = 2.
y = (3 * x + 2).pow(2) / 2
# Backpropagate from y; the derivative lands in x.grad.
y.backward()
print("dy/dx at x=2:", x.grad.item())



## 2. Data (two moons)

In [None]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

def make_toy_data(n_samples=1200, noise=0.2, test_size=0.2, seed=0):
    """Generate a two-moons dataset and split it into train/validation tensors.

    Args:
        n_samples: Number of points requested from ``make_moons``.
        noise: Noise level forwarded to ``make_moons``.
        test_size: Forwarded to ``train_test_split`` as the held-out portion.
        seed: ``random_state`` used for both the generation and the split.

    Returns:
        ``(X_train, y_train, X_val, y_val)`` where the feature tensors are
        ``float32`` and the label tensors are ``long``.
    """
    features, labels = make_moons(n_samples=n_samples, noise=noise, random_state=seed)
    # train_test_split yields features first (train, val), then labels (train, val).
    feat_tr, feat_va, lab_tr, lab_va = train_test_split(
        features, labels, test_size=test_size, random_state=seed
    )
    x_train_t, x_val_t = (torch.tensor(arr, dtype=torch.float32) for arr in (feat_tr, feat_va))
    y_train_t, y_val_t = (torch.tensor(arr, dtype=torch.long) for arr in (lab_tr, lab_va))
    return x_train_t, y_train_t, x_val_t, y_val_t

# Build the toy dataset with default settings and pair each split's features
# with its labels so a DataLoader can batch them together.
X_train, y_train, X_val, y_val = make_toy_data()
train_ds, val_ds = TensorDataset(X_train, y_train), TensorDataset(X_val, y_val)

BATCH_SIZE = 64
# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=BATCH_SIZE)
# Show the split sizes as the cell output.
(len(train_ds), len(val_ds))



## 3. Model (MLP)

In [None]:
class MLP(nn.Module):
    """Fully connected network: ``Linear -> ReLU`` per hidden layer, then one ``Linear``.

    Args:
        in_dim: Number of input features.
        hidden_sizes: Width of each hidden layer, in order; may be empty.
        out_dim: Number of outputs of the final linear layer.
    """

    def __init__(self, in_dim=2, hidden_sizes=(32, 32), out_dim=2):
        super().__init__()
        # Consecutive entries of `widths` are the (input, output) sizes of each
        # hidden Linear layer.
        widths = [in_dim, *hidden_sizes]
        blocks = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            blocks.append(nn.Linear(fan_in, fan_out))
            blocks.append(nn.ReLU())
        # The output layer has no activation after it.
        blocks.append(nn.Linear(widths[-1], out_dim))
        self.net = nn.Sequential(*blocks)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the result."""
        return self.net(x)

# Build the default MLP, then move its parameters to the selected device.
model = MLP()
model = model.to(device)
model



## 4. Training loop

In [None]:
def accuracy(model, loader, device=None):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    Args:
        model: Module mapping a batch of inputs to per-class scores; the
            prediction is the argmax over dimension 1.
        loader: Iterable of ``(inputs, targets)`` batches.
        device: Device each batch is moved to before the forward pass. When
            ``None`` (the default) the device of the model's first parameter is
            used; a model without parameters leaves the batches where they are.
            Resolving this at call time avoids freezing whatever the notebook's
            global ``device`` was when this cell was executed.

    Returns:
        ``correct / total`` as a float, or ``0.0`` when ``loader`` yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else None

    # Remember the current mode so computing a metric does not silently leave
    # the model in eval mode.
    was_training = model.training
    model.eval()
    correct, total = 0, 0
    try:
        with torch.no_grad():
            for xb, yb in loader:
                if device is not None:
                    xb, yb = xb.to(device), yb.to(device)
                pred = model(xb).argmax(dim=1)
                correct += (pred == yb).sum().item()
                total += yb.numel()
    finally:
        model.train(was_training)
    # max(1, ...) guards against division by zero for an empty loader.
    return correct / max(1, total)

def train_model(model, train_loader, val_loader, epochs=50, lr=0.05, optimizer_name='sgd', device=None):
    """Train ``model`` with cross-entropy loss and record per-epoch metrics.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable of ``(inputs, targets)`` batches used for the updates.
        val_loader: Iterable of ``(inputs, targets)`` batches used for validation.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate passed to the optimizer.
        optimizer_name: ``'adam'`` (case-insensitive) selects ``torch.optim.Adam``;
            any other value selects ``torch.optim.SGD``.
        device: Device each batch is moved to. When ``None`` (the default) the
            device of the model's first parameter is used, so the function does
            not depend on the notebook's global ``device`` at definition time.

    Returns:
        Dict with keys ``"train_losses"``, ``"val_losses"`` and ``"val_accs"``,
        each a list with one float per epoch. Losses are averaged per sample,
        so a shorter final batch is not over-weighted.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else None

    if optimizer_name.lower() == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    train_losses, val_losses, val_accs = [], [], []
    for _ in range(epochs):
        model.train()
        loss_sum, seen = 0.0, 0
        for xb, yb in train_loader:
            if device is not None:
                xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()
            # `loss` is the batch mean; scale by the batch size so the epoch
            # figure is a true per-sample average.
            batch_n = yb.numel()
            loss_sum += loss.item() * batch_n
            seen += batch_n
        # max(1, ...) keeps an empty loader from raising ZeroDivisionError.
        train_losses.append(loss_sum / max(1, seen))

        vloss, vacc = _validation_pass(model, val_loader, criterion, device)
        val_losses.append(vloss)
        val_accs.append(vacc)

    return {"train_losses": train_losses, "val_losses": val_losses, "val_accs": val_accs}


def _validation_pass(model, loader, criterion, device):
    """Return ``(mean per-sample loss, accuracy)`` of ``model`` over ``loader`` in one pass."""
    model.eval()
    loss_sum, correct, seen = 0.0, 0, 0
    with torch.no_grad():
        for xb, yb in loader:
            if device is not None:
                xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            batch_n = yb.numel()
            loss_sum += criterion(logits, yb).item() * batch_n
            correct += (logits.argmax(dim=1) == yb).sum().item()
            seen += batch_n
    denom = max(1, seen)
    return loss_sum / denom, correct / denom



## 5. Baseline run (SGD)

In [None]:
# Hyperparameters for the baseline experiment.
EPOCHS, LR = 50, 0.05
OPT = 'sgd'  # or 'adam'

# Train a freshly initialised network rather than reusing the earlier instance.
model = MLP().to(device)
out = train_model(
    model,
    train_loader,
    val_loader,
    epochs=EPOCHS,
    lr=LR,
    optimizer_name=OPT,
    device=device,
)

# Training vs. validation loss, one point per epoch.
plt.figure()
plt.plot(out["train_losses"], label="train")
plt.plot(out["val_losses"], label="val")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title(f"Loss ({OPT}, lr={LR})")
plt.legend()
plt.show()
print("Validation accuracy:", round(out["val_accs"][-1], 4))



## 6. Optimizers (comparison)

In [None]:
# Each entry is (optimizer name, learning rate, number of epochs).
settings = [
    ('sgd', 0.01, 30),
    ('sgd', 0.05, 30),
    ('adam', 0.001, 30),
    ('adam', 0.01, 30),
]

plt.figure()
for opt, lr, epochs in settings:
    # Reseed before every run so each setting starts from the same initial
    # weights and sees the same shuffled batch order. Without this the RNG
    # state carries over from the previous run, and differences between curves
    # mix optimizer effects with initialisation luck.
    torch.manual_seed(0)
    model = MLP().to(device)
    out = train_model(model, train_loader, val_loader, epochs=epochs, lr=lr, optimizer_name=opt, device=device)
    plt.plot(out["val_losses"], label=f"{opt}, lr={lr}")
plt.xlabel("Epoch")
plt.ylabel("Val Loss")
plt.title("Val Loss across settings")
plt.legend()
plt.show()
