# Day 5
## Pytorch
* Build an MLP in Pytorch
* Train MNIST/Fashion-MNIST (CPU)
* Add weight decay, run with SGD vs Adam, add LR scheduler. 

### Check: 
* MNIST: test accuracy > 97% 
* MLP < 10 epochs (Adam helps)

### Interview drill 
Differences between weight decay and L2 in Adam (decoupled vs classical)

In [1]:
import os
import random

import torch
import torch.nn.functional as F  # noqa: N812
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms


In [3]:
# ---- Configuration ----
DATASET = datasets.FashionMNIST
BATCH_SIZE = 256
EPOCHS = 10
LR = 2e-3
WD = 1e-4
H = 42
COSINE_LR = True # Use cosine learning rate schedule
SEED = 42
NUM_WORKERS = 2
DEVICE = "mps" if torch.mps.is_available() else "cpu"

print(f"[DEVICE] Using device: {DEVICE}")


[DEVICE] Using device: mps


## Fashion MNIST data preprocessing

**Fashion MNIST** stores pixel values as unsigned 8-bit integers in the range 0...255. Dividing by 255 converts this to floats in $[0,1]$. Necessary before mean/std normalization. 

In [6]:
import pandas as pd

raw_train = datasets.FashionMNIST(root="./data", train=True,  download=True)
data = raw_train.data.float().div_(255)
mean = data.mean().item()
std = data.std().item()

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((mean,), (std,)),
])

# --- Reload datasets with transforms (standard) ---
train_ds = datasets.FashionMNIST(root="./data", train=True, download=True, transform=transform)
test_ds  = datasets.FashionMNIST(root="./data", train=False, download=True, transform=transform)

# --- Data loaders ---
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)


print(f"[Data] Mean: {mean}, std: {std}")
print("[Data] Train size:", len(train_ds))
print("[Data] Test size:", len(test_ds))

[Data] Mean: 0.28604060411453247, std: 0.3530242443084717


## MLP Implementation

*view* - reshapes tensors without copying memory when possible. Pass the new shape, -1: infer dimension automatically. 

In [None]:
from nets_numpy import MLP

model = MLP(droupout_probability=0.1,
            use_batchnorm=False).to(device=DEVICE)

In [12]:
import torch.optim as optim
from utils import train_epoch, evaluate

# ---- Optimizer/Loss ---- 
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS) if COSINE_LR else None

# --- Training loop ---
best_accuracy = 0.0
epochs = 10

for epoch in range(1, epochs+1):
    train_loss, train_accuracy = train_epoch(model, train_loader, optimizer, loss)
    test_loss, test_accuracy = evaluate(model, test_loader, loss)

    if scheduler: scheduler.step()

    print(f"[Training] Epoch {epoch:2d}/{epochs} | "
          f"Train loss: {train_loss:.4f}, accuracy: {train_accuracy:.4f} | "
          f"Test loss: {test_loss:.4f}, accuracy: {test_accuracy:.4f} | "
          f"Learning rate: {optimizer.param_groups[0]['lr']:.6f}")

    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        torch.save(model.state_dict(), "fashion_mnist_mlp_best.pth")
        print(f"[Model] New best model saved with accuracy: {best_accuracy:.4f}")

print(f"[Model] Best accuracy: {best_accuracy:.4f}")

[Training] Epoch  1/10 | Train loss: 0.2701, accuracy: 0.8987 | Test loss: 0.3326, accuracy: 0.8849 | Learning rate: 0.001951
[Model] New best model saved with accuracy: 0.8849
[Training] Epoch  2/10 | Train loss: 0.2536, accuracy: 0.9070 | Test loss: 0.3476, accuracy: 0.8771 | Learning rate: 0.001809
[Training] Epoch  3/10 | Train loss: 0.2448, accuracy: 0.9078 | Test loss: 0.3247, accuracy: 0.8853 | Learning rate: 0.001588
[Model] New best model saved with accuracy: 0.8853
[Training] Epoch  4/10 | Train loss: 0.2247, accuracy: 0.9149 | Test loss: 0.3235, accuracy: 0.8870 | Learning rate: 0.001309
[Model] New best model saved with accuracy: 0.8870
[Training] Epoch  5/10 | Train loss: 0.2086, accuracy: 0.9208 | Test loss: 0.3148, accuracy: 0.8909 | Learning rate: 0.001000
[Model] New best model saved with accuracy: 0.8909
[Training] Epoch  6/10 | Train loss: 0.1916, accuracy: 0.9260 | Test loss: 0.3096, accuracy: 0.8925 | Learning rate: 0.000691
[Model] New best model saved with accura

In [13]:
# Recompute once for final confusion matrix/readout
_, test_acc, c, per_class_acc = evaluate(model, test_loader, loss, compute_confmat=True)
print(f"Final test acc: {test_acc*100:.2f}%")

# Pretty-print per-class accuracy
print("\nPer-class accuracy:")
CLASS_NAMES = ["T-shirt/top","Trouser","Pullover","Dress","Coat","Sandal","Shirt","Sneaker","Bag","Ankle boot"]
for i, acc in enumerate(per_class_acc):
    print(f"  {i:>2} ({CLASS_NAMES[i]:12s}): {acc*100:5.2f}%")

print("\nConfusion matrix (rows=true, cols=pred):")
print(c.numpy())

Final test acc: 90.06%

Per-class accuracy:
   0 (T-shirt/top ): 86.40%
   1 (Trouser     ): 98.00%
   2 (Pullover    ): 82.40%
   3 (Dress       ): 90.30%
   4 (Coat        ): 85.40%
   5 (Sandal      ): 96.60%
   6 (Shirt       ): 71.10%
   7 (Sneaker     ): 96.90%
   8 (Bag         ): 97.30%
   9 (Ankle boot  ): 96.20%

Confusion matrix (rows=true, cols=pred):
[[864   2  10  15   5   2  97   0   5   0]
 [  3 980   1  12   2   0   2   0   0   0]
 [ 20   1 824  10  77   0  67   0   1   0]
 [ 15   8  14 903  33   1  22   0   4   0]
 [  1   0  72  28 854   0  44   0   1   0]
 [  0   0   0   0   0 966   0  17   0  17]
 [111   0  80  25  67   0 711   0   6   0]
 [  0   0   0   0   0  11   0 969   1  19]
 [  5   0   1   5   1   4   5   5 973   1]
 [  0   1   0   0   0   4   1  32   0 962]]
