In [1]:
import os
import torch
from torch.utils.data import DataLoader, random_split, Subset
from torchvision import datasets, transforms
import numpy as np

def compute_fashion_mnist_mean_std(root="./data"):
    """
    Compute the global pixel mean and standard deviation of FashionMNIST.

    The official training split is loaded without a transform so the raw
    image tensor can be read directly. Pixels are rescaled by 1/255 (the same
    scaling ToTensor applies) and reduced over every pixel of every image.

    Args:
        root: Directory the dataset is stored in / downloaded to.

    Returns:
        Tuple (mean, std) of Python floats.
    """
    # No transform: we want the stored image tensor, not per-sample PIL images
    raw_pixels = datasets.FashionMNIST(
        root=root, train=True, download=True, transform=None
    ).data

    # Rescale to floats so the statistics match what ToTensor will produce
    scaled_pixels = raw_pixels.float() / 255.0

    return scaled_pixels.mean().item(), scaled_pixels.std().item()


def get_fashion_mnist_datasets(root="./data", val_ratio=0.2, seed=551):
    """
    Build normalized train / validation / test datasets for FashionMNIST.

    Normalization statistics come from compute_fashion_mnist_mean_std, i.e.
    they are computed over the whole official training split before it is
    divided into train and validation parts.

    Args:
        root: Directory the dataset is stored in / downloaded to.
        val_ratio: Fraction of the official training split held out for
            validation.
        seed: Seed for the generator that drives the random split.

    Returns:
        (train_dataset, val_dataset, test_dataset, mean, std)
    """
    mean, std = compute_fashion_mnist_mean_std(root)

    def make_transform():
        # ToTensor rescales to [0, 1]; Normalize then centres and rescales
        return transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((mean,), (std,)),
        ])

    train_transform = make_transform()
    test_transform = make_transform()

    full_train_dataset = datasets.FashionMNIST(
        root=root, train=True, download=True, transform=train_transform
    )
    test_dataset = datasets.FashionMNIST(
        root=root, train=False, download=True, transform=test_transform
    )

    # Carve the validation part out of the official training split
    n_total = len(full_train_dataset)
    n_val = int(val_ratio * n_total)
    split_sizes = [n_total - n_val, n_val]

    train_dataset, val_dataset = random_split(
        full_train_dataset,
        split_sizes,
        generator=torch.Generator().manual_seed(seed),
    )

    return train_dataset, val_dataset, test_dataset, mean, std


def get_fashion_mnist_loaders(
    root="./data",
    val_ratio=0.2,
    batch_size=128,
    num_workers=2,
    seed=551
):
    """
    Wrap get_fashion_mnist_datasets and return one DataLoader per split.

    Only the training loader shuffles; all three loaders share the same
    batch size, worker count and pin_memory=True.

    Returns:
        (train_loader, val_loader, test_loader, mean, std)
    """
    train_dataset, val_dataset, test_dataset, mean, std = get_fashion_mnist_datasets(
        root=root, val_ratio=val_ratio, seed=seed
    )

    # Settings common to every split
    shared_kwargs = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=True,
    )

    train_loader = DataLoader(train_dataset, shuffle=True, **shared_kwargs)
    val_loader = DataLoader(val_dataset, shuffle=False, **shared_kwargs)
    test_loader = DataLoader(test_dataset, shuffle=False, **shared_kwargs)

    return train_loader, val_loader, test_loader, mean, std


# Smoke test of the data pipeline. Note that this binds train_loader,
# val_loader, test_loader, mean and std at notebook scope.
if __name__ == "__main__":
    train_loader, val_loader, test_loader, mean, std = get_fashion_mnist_loaders()

    # len(loader) is the number of batches, not the number of samples
    print(f"Train batches: {len(train_loader)}")
    print(f"Validation batches: {len(val_loader)}")
    print(f"Test batches: {len(test_loader)}")
    print(f"Computed mean: {mean:.4f}, std: {std:.4f}")

    # Inspect one batch shape
    images, labels = next(iter(train_loader))
    # images shape: [batch_size, 1, 28, 28]
    print(f"Batch image tensor shape: {images.shape}")
    print(f"Batch labels tensor shape: {labels.shape}")


100%|██████████| 26.4M/26.4M [00:01<00:00, 20.0MB/s]
100%|██████████| 29.5k/29.5k [00:00<00:00, 339kB/s]
100%|██████████| 4.42M/4.42M [00:00<00:00, 6.32MB/s]
100%|██████████| 5.15k/5.15k [00:00<00:00, 17.2MB/s]


Train batches: 375
Validation batches: 94
Test batches: 79
Computed mean: 0.2860, std: 0.3530




Batch image tensor shape: torch.Size([128, 1, 28, 28])
Batch labels tensor shape: torch.Size([128])


In [2]:
import numpy as np

def l2_loss(y, yh):
  """Element-wise half squared error between target y and prediction yh."""
  residual = yh - y
  return 0.5 * residual**2

def l2_loss_grad(y, yh):
  """Derivative of the half squared error with respect to the prediction yh."""
  residual = yh - y
  return residual

def cross_entropy(y, yh):
  """
  Cross-entropy between target distribution y and predicted probabilities yh.

  A small constant (1e-12) is added inside the log so a predicted
  probability of exactly 0 does not produce -inf.
  """
  log_probs = np.log(yh + 1e-12)
  return -np.sum(y * log_probs)

# Valid only as dL/dz for L = cross_entropy(y, softmax(z)): it is the gradient
# with respect to the pre-softmax logits z, not with respect to yh itself.
def cross_entropy_grad(y, yh):
  """Gradient of softmax + cross-entropy with respect to the logits."""
  delta = yh - y
  return delta

def relu(x):
  """Rectified linear unit: element-wise max(0, x)."""
  floor = 0
  return np.maximum(floor, x)
   
def relu_grad(x):
    """
    Derivative of ReLU: 1.0 where x > 0, otherwise 0.0 (including at x == 0).

    Accepts arrays, lists and plain Python scalars.
    """
    # np.asarray first: comparing a plain Python number yields a built-in
    # bool, which has no .astype method.
    return (np.asarray(x) > 0).astype(float)

def leaky_relu(x, alpha=0.1):
   """Leaky ReLU: element-wise maximum of alpha * x and x."""
   scaled = alpha * x
   return np.maximum(scaled, x)

def leaky_relu_grad(x, alpha=0.1):
    """
    Derivative of leaky ReLU: alpha where x < 0, otherwise 1 (including x == 0).

    The result is always floating point. Floating inputs keep their dtype;
    any other dtype (e.g. integers) is promoted to float64 so that alpha is
    not truncated to 0.
    """
    x = np.asarray(x)
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
    return np.where(x < 0, alpha, 1.0).astype(out_dtype)

def sigmoid(x):
    """
    Logistic sigmoid 1 / (1 + exp(-x)), evaluated without overflow.

    Only exp(-|x|) is ever computed, so large-magnitude inputs of either
    sign do not overflow. For x >= 0 the usual form is used; for x < 0 the
    algebraically equal form exp(x) / (1 + exp(x)) is used.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always in (0, 1]
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # n-d arrays as arrays.
    return result[()]

def sigmoid_grad(x):
    """Derivative of the sigmoid at x, expressed through the sigmoid value itself."""
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement
  
def tanh(x):
  """Hyperbolic tangent activation (thin wrapper around np.tanh)."""
  activated = np.tanh(x)
  return activated

def tanh_grad(x):
  """Derivative of tanh at x: 1 - tanh(x)^2."""
  activated = np.tanh(x)
  return 1 - np.square(activated)

def linear(x):
  """Identity activation: the input is returned unchanged (same object)."""
  passthrough = x
  return passthrough

def linear_grad(x):
  """Derivative of the identity activation: ones with the shape and dtype of x."""
  return np.full_like(x, 1)

def softmax(x, axis=-1):
  """
  Numerically stable softmax along `axis`.

  The maximum along `axis` is subtracted before exponentiating, which leaves
  the result unchanged but keeps exp() from overflowing.

  Args:
    x: Scores; a 1-D vector or an n-D batch of vectors.
    axis: Axis that is normalised to sum to 1. With the default (-1) a 1-D
      input behaves exactly as before, and each row of a 2-D input is
      normalised independently.

  Returns:
    Array of the same shape as x whose entries along `axis` sum to 1.
  """
  x = np.asarray(x)
  shifted = x - np.max(x, axis=axis, keepdims=True)
  exps = np.exp(shifted)
  return exps / np.sum(exps, axis=axis, keepdims=True)

# MLP object:
# dims: list[int] - how many neurons in input, {hidden layers}, output
# activation_fns: list[relu|leaky_relu|sigmoid|tanh|linear|softmax] - one activation per weight
#   layer, i.e. for every hidden layer AND the output layer (len(dims) - 1 entries)
# W: list[np.array(n_in x n_out)] - Random init with standard Normal dist.
# b: list[np.array(n_out)] - Initialised to zeros
# seed: int | None - Seeds NumPy's global RNG for training reproducibility
# loss_history: list[(epoch, loss)] - loss sampled by fit() on the epochs it logs
class MLP:
    """Fully connected network trained with per-sample backprop and mini-batch SGD."""

    def __init__(self, dims, activation_fns, seed=None):
        """
        Validate the layer specification and initialise the parameters.

        Args:
            dims: Layer widths [input, hidden..., output].
            activation_fns: One callable per weight layer (len(dims) - 1).
            seed: If not None, passed to np.random.seed before the weights
                are drawn. Note that this seeds NumPy's *global* RNG.

        Raises:
            RuntimeError: If len(activation_fns) != len(dims) - 1.
        """
        self.dims = dims
        self.seed = seed
        # `is not None` rather than truthiness so that seed=0 is honoured
        if seed is not None:
            np.random.seed(seed)

        dims_len = len(dims)
        activation_fns_len = len(activation_fns)
        if dims_len - 1 != activation_fns_len:
            raise RuntimeError(f"Length {dims_len} of dims does not match length {activation_fns_len} of activation_fns")

        self.activation_fns = activation_fns

        # One (n_in x n_out) weight matrix and one zero bias vector per layer
        self.W = []
        self.b = []
        for n_in, n_out in zip(dims[:-1], dims[1:]):
            self.W.append(np.random.normal(loc=0.0, scale=1.0, size=(n_in, n_out)))
            self.b.append(np.zeros(n_out))

        # (epoch, loss) pairs appended by fit() on the epochs it logs
        self.loss_history = []

    def feed_forward(self, x):
        """
        Run a single sample through the network.

        Args:
            x: One input vector (1-D).

        Returns:
            (Z, A) lists of length len(W) + 1:
              Z[0] = x, Z[l] = pre-activation of layer l
              A[0] = x, A[l] = activation of layer l (A[-1] is the output)
        """
        a = np.array(x)  # "input activation"
        Z = [x]
        A = [x]
        for W, b, fn in zip(self.W, self.b, self.activation_fns):
            z = a.T @ W + b
            Z.append(z)
            a = fn(z)
            A.append(a)

        return Z, A

    def l1_penalty(self):
        """Sum of absolute values across all weight arrays."""
        total = 0.0
        for W in self.W:
            total += np.sum(np.abs(W))
        return total

    def l2_penalty(self):
        """Half the squared L2 norm across all weight arrays (0.5 * ||W||^2)."""
        total = 0.0
        for W in self.W:
            total += 0.5 * np.sum(W * W)
        return total

    def l1_grad(self, weights):
        """Gradient of L1 regularization: sign(W) for each array in `weights`."""
        return [np.sign(W) for W in weights]

    def l2_grad(self, weights):
        """Gradient of L2 regularization: W. Returns `weights` itself, not a copy."""
        return weights  # Since d/dW (0.5 * W^2) = W

    def fit(self, X, Y, batch_size, lr, epochs, l1_lambda=0.0, l2_lambda=0.0):
        """
        Train with mini-batch gradient descent.

        - Supports SGD/minibatch/full-batch via batch_size.
        - Softmax+CE: delta_L = y_hat - y (no extra f').
        - MSE (or any other): delta_L = (y_hat - y) * f'(z_L).
        Assumes feed_forward(x) -> (Z, A) with:
          A[0] = x, Z[1..L], A[L] = y_hat

        Args:
            X: Inputs, indexed along axis 0 (one sample per row).
            Y: Targets, indexed along axis 0 in step with X.
            batch_size: Samples per parameter update.
            lr: Learning rate.
            epochs: Number of passes over X.
            l1_lambda: Coefficient of the sign(W) term added to the gradient.
            l2_lambda: Coefficient of the W term added to the gradient.

        Side effects:
            Updates self.W / self.b in place, prints a progress line every
            10th epoch, and on those epochs appends (epoch, loss) to
            self.loss_history, where loss is measured on the first 100
            shuffled samples.
        """
        activation_to_grad_map = {
            relu: relu_grad,
            sigmoid: sigmoid_grad,
            tanh: tanh_grad,
            linear: linear_grad,
            leaky_relu: leaky_relu_grad,
        }

        N = X.shape[0]
        L = len(self.dims) - 1  # number of weight layers
        last_act = self.activation_fns[-1]

        for epoch in range(epochs):
            log_epoch = epoch % 10 == 0
            if log_epoch:
                print(f"Processing epoch {epoch + 1}...")

            # shuffle each epoch
            perm = np.random.permutation(N)
            Xs, Ys = X[perm], Y[perm]

            for start in range(0, N, batch_size):
                end = min(start + batch_size, N)
                m = end - start

                # gradient accumulators
                grad_W = [np.zeros_like(W) for W in self.W]
                grad_b = [np.zeros_like(b) for b in self.b]

                # per-sample backprop, sum then average
                for x, y in zip(Xs[start:end], Ys[start:end]):
                    Z, A = self.feed_forward(x)

                    # delta_layers[l] is the delta of layer l for l = 1..L;
                    # index 0 (the input) stays None so indices match layers.
                    delta_layers = [None] * (L + 1)

                    # Output layer delta
                    if last_act is softmax:
                        delta_layers[L] = A[-1] - y
                    else:
                        dz_L = activation_to_grad_map[last_act](Z[-1])  # f'(z_L)
                        delta_layers[L] = (A[-1] - y) * dz_L

                    # Hidden layers: l = L-1 .. 1
                    for l in range(L - 1, 0, -1):
                        act = self.activation_fns[l - 1]        # activation after W[l-1]
                        dz = activation_to_grad_map[act](Z[l])   # f'(z_l)
                        delta_layers[l] = (self.W[l] @ delta_layers[l + 1]) * dz

                    # Gradients for each weight layer i = 0..L-1 (maps layer i -> i+1)
                    for i in range(L):
                        grad_W[i] += np.outer(A[i], delta_layers[i + 1])
                        grad_b[i] += delta_layers[i + 1]

                # Apply averaged batch gradients
                inv_m = 1.0 / m
                for i in range(L):
                    data_grad_W = grad_W[i] * inv_m
                    data_grad_b = grad_b[i] * inv_m
                    # Add regularization gradients (weights only, not biases)
                    if l1_lambda > 0:
                        data_grad_W += l1_lambda * np.sign(self.W[i])
                    if l2_lambda > 0:
                        data_grad_W += l2_lambda * self.W[i]

                    self.W[i] -= lr * data_grad_W
                    self.b[i] -= lr * data_grad_b

            # Monitoring loss: evaluated once per logged epoch instead of after
            # every mini-batch, and kept instead of being thrown away.
            if log_epoch:
                avg_loss = self._compute_batch_loss(Xs[:100], Ys[:100], l1_lambda, l2_lambda)
                self.loss_history.append((epoch + 1, avg_loss))

    def _compute_batch_loss(self, X_batch, Y_batch, l1_lambda=0.0, l2_lambda=0.0):
        """
        Average per-sample loss over a batch plus the regularization terms.

        The regularization terms match the gradients applied in fit():
        l1_lambda * sum|W| and l2_lambda * 0.5 * sum(W^2).
        """
        total_loss = 0.0
        for x, y in zip(X_batch, Y_batch):
            total_loss += self.loss(x, y)

        avg_data_loss = total_loss / len(X_batch)

        reg_loss = 0.0
        if l1_lambda > 0:
            reg_loss += l1_lambda * self.l1_penalty()
        if l2_lambda > 0:
            # l2_penalty() already carries the 0.5 factor whose gradient is
            # l2_lambda * W, so no extra factor of 2 here.
            reg_loss += l2_lambda * self.l2_penalty()

        return avg_data_loss + reg_loss

    def predict(self, x):
        """Return the index of the largest output for a single sample x."""
        Z, A = self.feed_forward(x)
        return np.argmax(A[-1])  # for softmax

    def loss(self, x, y, reg=None):
        """
        Unregularized loss of one sample.

        Cross-entropy when the output activation is softmax, otherwise the
        summed half squared error. `reg` is accepted but not used.
        """
        last_act = self.activation_fns[-1]
        Z, A = self.feed_forward(x)

        if last_act is softmax:
            return cross_entropy(y, A[-1])
        return np.sum(l2_loss(y, A[-1]))

    def evaluate_acc(self, X, y):
        """Fraction of samples in X whose predicted class equals the label in y."""
        correct = 0
        for i in range(len(X)):
            if self.predict(X[i]) == y[i]:
                correct += 1
        return correct / len(X)
     

# Smoke test: one 5-feature sample with a one-hot target for the 3-way softmax
# output. Y must be 2-D (n_samples, n_outputs): fit() indexes Y by sample, so a
# flat length-3 vector would be read as a single scalar target.
mlp = MLP([5,10,10,3], [sigmoid, linear, softmax])
mlp.fit(np.array([[1, 2, 3, 4, 5]]), np.array([[0, 1, 0]]), 1, 0.001, 100)




Processing epoch 1...
Processing epoch 11...
Processing epoch 21...
Processing epoch 31...
Processing epoch 41...
Processing epoch 51...
Processing epoch 61...
Processing epoch 71...
Processing epoch 81...
Processing epoch 91...


In [4]:
# 3.1

def convert_loader_to_np(loader):
    """
    Materialise a loader of (images, labels) tensor batches as NumPy arrays.

    The loader is iterated exactly once. Every image batch is flattened to
    (batch, features) before the batches are concatenated.

    Returns:
        (X, y): X has one flattened image per row; y holds the labels in the
        same order.
    """
    # Single pass so images and labels stay aligned even if the loader shuffles
    batches = [
        (images.view(images.size(0), -1).numpy(), labels.numpy())
        for images, labels in loader
    ]

    X = np.concatenate([flat for flat, _ in batches], axis=0)
    y = np.concatenate([lbl for _, lbl in batches], axis=0)

    return X, y

def labels_to_onehot(y, num_classes=10):
    """
    One-hot encode integer class labels.

    Returns a float array of shape (len(y), num_classes) in which row i has
    a 1 in column y[i] and 0 elsewhere.
    """
    n_samples = len(y)
    encoded = np.zeros((n_samples, num_classes))
    row_ids = np.arange(n_samples)
    encoded[row_ids, y] = 1
    return encoded

# Rebuild the normalized loaders and materialise every split as NumPy arrays
# (flattened images plus integer labels); the MLP trains on arrays, not loaders.
train_loader, val_loader, test_loader, mean, std = get_fashion_mnist_loaders(
        batch_size=128, 
        num_workers=2
    )

X_train, y_train = convert_loader_to_np(train_loader)
y_train_onehot = labels_to_onehot(y_train)

X_val, y_val = convert_loader_to_np(val_loader)
y_val_onehot = labels_to_onehot(y_val)

X_test, y_test = convert_loader_to_np(test_loader)
y_test_onehot = labels_to_onehot(y_test)

# Model 1: no hidden layer (784 -> 10 softmax). fit() takes one-hot targets,
# evaluate_acc() takes the integer labels.
mlp_no_layers = MLP([784, 10], [softmax])
mlp_no_layers.fit(X_train, y_train_onehot, 64, 0.001, 10)

train_acc1 = mlp_no_layers.evaluate_acc(X_train, y_train)
test_acc1 = mlp_no_layers.evaluate_acc(X_test, y_test)
print(f"Model 1 - Train Accuracy: {train_acc1:.4f}, Test Accuracy: {test_acc1:.4f}")

# Models 2 and 3: one and two hidden ReLU layers of 256 units. Both are
# constructed before either is trained, so their initial weights are drawn
# from the global NumPy RNG back to back.
mlp_1_layer = MLP([784, 256, 10], [relu, softmax])
mlp_2_layer = MLP([784, 256, 256, 10], [relu, relu, softmax])

mlp_1_layer.fit(X_train, y_train_onehot, 64, 0.0001, 10)
train_acc2 = mlp_1_layer.evaluate_acc(X_train, y_train)
test_acc2 = mlp_1_layer.evaluate_acc(X_test, y_test)

print(f"Model 2 - Train Accuracy: {train_acc2:.4f}, Test Accuracy: {test_acc2:.4f}")

mlp_2_layer.fit(X_train, y_train_onehot, 64, 0.0001, 10)
train_acc3 = mlp_2_layer.evaluate_acc(X_train, y_train)
# test_acc3 is read again by the 3.4 and 3.6 cells as the MLP baseline
test_acc3 = mlp_2_layer.evaluate_acc(X_test, y_test)
print(f"Model 3 - Train Accuracy: {train_acc3:.4f}, Test Accuracy: {test_acc3:.4f}")



Processing epoch 1...
Model 1 - Train Accuracy: 0.6255, Test Accuracy: 0.6173
Processing epoch 1...
Model 2 - Train Accuracy: 0.6538, Test Accuracy: 0.6492
Processing epoch 1...
Model 3 - Train Accuracy: 0.8033, Test Accuracy: 0.7818


In [5]:
#3.2

# Same 2-hidden-layer architecture as Model 3 in 3.1 with the hidden activation
# swapped. Both models are constructed before either is trained so the global
# NumPy RNG is consumed in the same order as before.
last_model_redo1 = MLP([784, 256, 256, 10], [tanh, tanh, softmax])
last_model_redo2 = MLP([784, 256, 256, 10], [leaky_relu, leaky_relu, softmax])
last_model_redo1.fit(X_train, y_train_onehot, 64, 0.001, 10)
last_model_redo2.fit(X_train, y_train_onehot, 64, 0.001, 10)
# Dedicated names: reusing test_acc1 / test_acc2 would silently overwrite the
# Model 1 / Model 2 results computed in 3.1.
test_acc_tanh = last_model_redo1.evaluate_acc(X_test, y_test)
test_acc_leaky_relu = last_model_redo2.evaluate_acc(X_test, y_test)
print(f"New model with tanh - Test accuracy: {test_acc_tanh:.4f}")
print(f"New model with leaky ReLU - Test accuracy: {test_acc_leaky_relu:.4f}")

Processing epoch 1...
Processing epoch 1...
New model with tanh - Test accuracy: 0.5598
New model with leaky ReLU - Test accuracy: 0.8092


3.2 Analysis:
With a low learning rate (0.0001), the new tanh model performed worse than the original model, reaching an accuracy of only 0.589. This is likely because tanh saturates: once its inputs are large in magnitude its gradient is close to zero, so each update shifts the weights very little.
We increase the learning rate to 0.001 to test our assumption that the learning rate should be increased when using a network with sigmoid or tanh activations, since their gradients become very small when the units saturate.

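Our intuition is confirmed: with the higher learning rate of 0.001, the test accuracy jumps to 0.7294! (Note: the cell output shown above reports 0.5598 for the tanh model at a learning rate of 0.001, so the figures quoted here appear to come from a different run and should be re-checked.)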




In [11]:
# 3.3

# L1-regularized run: same architecture and hyperparameters as Model 3 in 3.1,
# plus l1_lambda.
model1 = MLP([784, 256, 256, 10], [relu, relu, softmax])
model1.fit(X_train, y_train_onehot, 64, 0.0001, 10, l1_lambda=0.001)
test_acc = model1.evaluate_acc(X_test, y_test)
print(f"L1 - Test accuracy: {test_acc:.4f}")

# L2-regularized run. test_acc is overwritten here, so after this cell it holds
# the L2 result; the 3.5 cell prints it as the "3.3" baseline.
model2 = MLP([784, 256, 256, 10], [relu, relu, softmax])
model2.fit(X_train, y_train_onehot, 64, 0.0001, 10, l2_lambda=0.001)
test_acc = model2.evaluate_acc(X_test, y_test)
print(f"L2 - Test accuracy: {test_acc:.4f}")


Processing epoch 1...
L1 - Test accuracy: 0.7700
Processing epoch 1...
L2 - Test accuracy: 0.7693


3.3 analysis
With regularization, our model performed similarly to the unregularized model. This is likely because the unregularized model does not overfit.
If, on the other hand, the model had performed better on the test set with regularization, that would indicate that the model without regularization had overfitted the training data.
Conclusion: regularization decreases accuracy very slightly if the model does not overfit. However, if the model does overfit, regularization can increase test accuracy greatly.

In [12]:
#3.4

# 3.4 - Train on unnormalized images
import torch
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, random_split

def get_unnormalized_fashion_mnist_loaders(batch_size=128, val_ratio=0.2, seed=551,
                                           root="./data", num_workers=2):
    """
    Load FashionMNIST without normalization for question 3.4.

    Images only go through ToTensor, so no mean/std normalization is applied.
    The train/validation split uses the same val_ratio and seed defaults as
    get_fashion_mnist_loaders.

    Args:
        batch_size: Batch size shared by all three loaders.
        val_ratio: Fraction of the official training split used for validation.
        seed: Seed for the generator that drives the random split.
        root: Directory the dataset is stored in / downloaded to.
        num_workers: Worker processes per DataLoader.

    Returns:
        (train_loader, val_loader, test_loader)
    """
    # Only converts to a tensor; deliberately no Normalize step
    basic_transform = transforms.Compose([
        transforms.ToTensor()
    ])

    full_train_dataset = datasets.FashionMNIST(
        root=root,
        train=True,
        download=True,
        transform=basic_transform
    )

    test_dataset = datasets.FashionMNIST(
        root=root,
        train=False,
        download=True,
        transform=basic_transform
    )

    # Split train into train/val
    total_train = len(full_train_dataset)
    val_size = int(val_ratio * total_train)
    train_size = total_train - val_size

    generator = torch.Generator().manual_seed(seed)
    train_dataset, val_dataset = random_split(
        full_train_dataset,
        [train_size, val_size],
        generator=generator
    )

    # Only the training loader shuffles
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers
    )

    return train_loader, val_loader, test_loader

# Unnormalized data: same split settings as before, but without Normalize
train_loader_unnorm, val_loader_unnorm, test_loader_unnorm = get_unnormalized_fashion_mnist_loaders()

# Convert to numpy arrays
X_train_unnorm, y_train_unnorm = convert_loader_to_np(train_loader_unnorm)
y_train_onehot_unnorm = labels_to_onehot(y_train_unnorm)

# NOTE: these two names use the suffix "_unreg" although they hold the
# unnormalized test split.
X_test_unreg, y_test_unreg = convert_loader_to_np(test_loader_unnorm)

# Train model on unnormalized data (same architecture/hyperparameters as Model 3 in 3.1)
model_unnorm = MLP([784, 256, 256, 10], [relu, relu, softmax])
model_unnorm.fit(X_train_unnorm, y_train_onehot_unnorm, 64, 0.0001, 10)

# Evaluate (train_acc_unnorm is computed but not printed below)
train_acc_unnorm = model_unnorm.evaluate_acc(X_train_unnorm, y_train_unnorm)
test_acc_unnorm = model_unnorm.evaluate_acc(X_test_unreg, y_test_unreg)

print(f"Unnormalized Model - Test Accuracy: {test_acc_unnorm:.4f}")

# Compare with normalized model from 3.1 (test_acc3 is defined in the 3.1 cell)
print(f"Normalized Model (from 3.1) - Test Accuracy: {test_acc3:.4f}")

Processing epoch 1...
Unnormalized Model - Test Accuracy: 0.7476
Normalized Model (from 3.1) - Test Accuracy: 0.7764


3.4 Analysis:
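The unnormalized model performed worse than the normalized one. This is expected: normalization centers the inputs at zero mean and rescales them to roughly unit variance, which generally makes gradient-based training better conditioned and therefore improves the quality of training.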


In [13]:
# 3.5 - Data Augmentation

import time

def get_augmented_fashion_mnist_loaders(batch_size=128, val_ratio=0.2, seed=551, num_workers=0,
                                        root="./data"):
    """
    Build FashionMNIST loaders whose training split is randomly augmented.

    The training transform applies a random horizontal flip, a random
    rotation (10 degrees) and a random translation (up to 10% per axis)
    before ToTensor + Normalize. Validation and test images are only
    converted and normalized. Train and validation subsets index two
    separate dataset objects so that each can carry its own transform.

    Args:
        batch_size: Batch size shared by all three loaders.
        val_ratio: Fraction of the official training split used for validation.
        seed: Seed for the generator that drives the random split.
        num_workers: Worker processes per DataLoader.
        root: Directory the dataset is stored in / downloaded to.

    Returns:
        (train_loader, val_loader, test_loader)
    """
    mean, std = compute_fashion_mnist_mean_std(root)

    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(10),
        transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        transforms.ToTensor(),
        transforms.Normalize((mean,), (std,))
    ])

    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((mean,), (std,))
    ])

    # Two views of the same training images, differing only in their transform
    train_full = datasets.FashionMNIST(root, train=True, download=True, transform=train_transform)
    val_full   = datasets.FashionMNIST(root, train=True, download=True, transform=test_transform)
    test_full  = datasets.FashionMNIST(root, train=False, download=True, transform=test_transform)

    # The split is drawn over plain indices, so no extra dataset copy is needed
    total_train = len(train_full)
    val_size = int(val_ratio * total_train)
    train_size = total_train - val_size

    generator = torch.Generator().manual_seed(seed)
    train_indices, val_indices = random_split(range(total_train), [train_size, val_size], generator=generator)

    train_dataset = Subset(train_full, train_indices.indices)
    val_dataset   = Subset(val_full,   val_indices.indices)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  num_workers=num_workers)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False, num_workers=num_workers)
    test_loader  = DataLoader(test_full,     batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_loader, val_loader, test_loader


# Load augmented data
train_loader_aug, val_loader_aug, test_loader_aug = get_augmented_fashion_mnist_loaders()

# Convert to numpy arrays
# NOTE(review): convert_loader_to_np iterates the loader once, so the arrays
# hold a single pass over the augmented training loader. The MLP therefore
# trains on one fixed set of transformed images - presumably without the
# untransformed originals and without fresh augmentation per epoch; confirm
# whether this explains the lower accuracy.
X_train_aug, y_train_aug = convert_loader_to_np(train_loader_aug)
y_train_onehot_aug = labels_to_onehot(y_train_aug)

# The test loader uses the non-augmented test_transform
X_test_aug, y_test_aug = convert_loader_to_np(test_loader_aug)

# Train model on augmented data (using same architecture as 3.1)
model_aug = MLP([784, 256, 256, 10], [relu, relu, softmax])

# Wall-clock time of fit() only (data loading/conversion is excluded)
start_time_35 = time.time()
model_aug.fit(X_train_aug, y_train_onehot_aug, 64, 0.0001, 10, l2_lambda=0.001)
training_time_35 = time.time() - start_time_35

# Evaluate
test_acc_aug = model_aug.evaluate_acc(X_test_aug, y_test_aug)

print(f"Augmented Model - Test Accuracy: {test_acc_aug:.4f}")
# test_acc comes from the 3.3 cell, where it was last assigned the L2 result
print(f"Question 3 Model (from 3.3) - Test Accuracy: {test_acc:.4f}")
print(f"3.5 MLP Training Time: {training_time_35:.2f}s")


Processing epoch 1...
Augmented Model - Test Accuracy: 0.6873
Question 3 Model (from 3.3) - Test Accuracy: 0.7693
3.5 MLP Training Time: 403.28s


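3.5 Analysis (UNRESOLVED - looks broken): the model trained on augmented data does worse on the test set (0.6873) than the 3.3 model (0.7693). A likely cause is that the augmented loader is converted to NumPy arrays only once, so the MLP trains on a single fixed set of distorted images (no clean originals, no fresh augmentation per epoch) while being tested on undistorted images. TODO: confirm.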

In [14]:
# 3.6 - CNN Implementation with PyTorch

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

class FashionMNISTCNN(nn.Module):
    """
    Small CNN for 1x28x28 inputs and 10 classes.

    Architecture: two (3x3 conv -> ReLU -> 2x2 max-pool) blocks, then a
    256-unit fully connected layer with dropout and a 10-way output layer
    that returns raw logits.
    """

    def __init__(self):
        super(FashionMNISTCNN, self).__init__()
        # convolutional layers
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)   # 1 in, 32 out
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)  # 32 in, 64 out

        # fully connected layers
        self.fc1 = nn.Linear(64 * 7 * 7, 256)  # after 2 max-pools: 28x28 -> 14x14 -> 7x7
        self.fc2 = nn.Linear(256, 10)          # output layer for 10 classes

        # dropout between the two fully connected layers
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Map a batch of images to a (batch, 10) tensor of logits."""
        # first conv block: Conv -> ReLU -> MaxPool
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)  # 28x28 -> 14x14

        # second conv block: Conv -> ReLU -> MaxPool
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)  # 14x14 -> 7x7

        # flatten for the fully connected layers
        x = x.view(-1, 64 * 7 * 7)

        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)

        return x

    @staticmethod
    def _select_device():
        """Return the CUDA device when available, otherwise the CPU."""
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _accuracy(self, loader, device):
        """
        Percentage (0-100) of samples in `loader` classified correctly.

        Switches the model to eval mode (disables dropout) and runs without
        gradient tracking. Returns 0.0 when the loader yields no samples.
        """
        self.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for data, target in loader:
                data, target = data.to(device), target.to(device)
                predicted = self(data).argmax(dim=1)
                total += target.size(0)
                correct += (predicted == target).sum().item()

        if total == 0:
            return 0.0
        return 100 * correct / total

    def train_model(self, train_loader, val_loader, epochs=10, lr=0.001):
        """
        Train with Adam and cross-entropy, validating after every epoch.

        Args:
            train_loader: Iterable of (data, target) training batches.
            val_loader: Iterable of (data, target) validation batches.
            epochs: Number of passes over train_loader.
            lr: Adam learning rate.

        Returns:
            (train_losses, val_accuracies): per-epoch mean training loss and
            per-epoch validation accuracy in percent.
        """
        device = self._select_device()
        self.to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)

        train_losses = []
        val_accuracies = []

        for epoch in range(epochs):
            self.train()  # training mode (enables dropout)
            running_loss = 0.0

            for data, target in train_loader:
                data, target = data.to(device), target.to(device)

                optimizer.zero_grad()
                output = self(data)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

            # validation phase
            val_accuracy = self._accuracy(val_loader, device)

            # mean loss per batch; an empty training loader reports 0.0
            num_batches = len(train_loader)
            avg_loss = running_loss / num_batches if num_batches else 0.0

            train_losses.append(avg_loss)
            val_accuracies.append(val_accuracy)

            print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')

        return train_losses, val_accuracies

    def evaluate(self, test_loader):
        """Return the accuracy in percent (0-100) on test_loader."""
        device = self._select_device()
        self.to(device)
        return self._accuracy(test_loader, device)

# data loaders (rebinds the notebook-level train_loader / val_loader /
# test_loader / mean / std names)
train_loader, val_loader, test_loader, mean, std = get_fashion_mnist_loaders(batch_size=128)

# create/train cnn
cnn_model = FashionMNISTCNN()
print("Training CNN...")
train_losses, val_accuracies = cnn_model.train_model(train_loader, val_loader, epochs=10)

# evaluate (returns a percentage in 0-100)
test_accuracy = cnn_model.evaluate(test_loader)
print(f"CNN Test Accuracy: {test_accuracy:.2f}%")

# compare with mlp: test_acc3 comes from the 3.1 cell and is a fraction in
# 0-1, hence the /100 and *100 conversions below
print(f"Best MLP Test Accuracy (from 3.1): {test_acc3:.4f} ({test_acc3*100:.2f}%)")
print(f"CNN Test Accuracy: {test_accuracy:.2f}%")

# analysis: the printed figure is the difference relative to the MLP accuracy,
# not a difference in percentage points
if test_accuracy/100 > test_acc3:
    improvement = ((test_accuracy/100 - test_acc3) / test_acc3) * 100
    print(f"CNN improved accuracy by {improvement:.2f}% compared to MLP")
else:
    difference = ((test_acc3 - test_accuracy/100) / test_acc3) * 100
    print(f"MLP was {difference:.2f}% better than CNN")


Training CNN...
Epoch 1/10, Loss: 0.5408, Val Accuracy: 87.41%
Epoch 2/10, Loss: 0.3484, Val Accuracy: 89.17%
Epoch 3/10, Loss: 0.2941, Val Accuracy: 90.12%
Epoch 4/10, Loss: 0.2646, Val Accuracy: 91.35%
Epoch 5/10, Loss: 0.2401, Val Accuracy: 91.71%
Epoch 6/10, Loss: 0.2163, Val Accuracy: 92.08%
Epoch 7/10, Loss: 0.2010, Val Accuracy: 92.40%
Epoch 8/10, Loss: 0.1835, Val Accuracy: 92.04%
Epoch 9/10, Loss: 0.1703, Val Accuracy: 92.08%
Epoch 10/10, Loss: 0.1555, Val Accuracy: 92.51%
CNN Test Accuracy: 91.96%
Best MLP Test Accuracy (from 3.1): 0.7764 (77.64%)
CNN Test Accuracy: 91.96%
CNN improved accuracy by 18.44% compared to MLP


In [15]:
#3.7: Train CNN with Data Augmentation

import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
import torch.nn.functional as F
from torchvision import transforms, datasets
from torch.utils.data import random_split

class FashionMNISTCNN(nn.Module):
    """
    Two-block CNN (conv -> ReLU -> max-pool, twice) followed by a 256-unit
    dense layer with dropout and a 10-way logit layer. train_model reports
    the wall-clock time of every epoch.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 256)
        self.fc2 = nn.Linear(256, 10)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return (batch, 10) logits for a batch of images."""
        features = F.max_pool2d(F.relu(self.conv1(x)), 2)
        features = F.max_pool2d(F.relu(self.conv2(features)), 2)
        flat = features.view(-1, 64 * 7 * 7)
        hidden = self.dropout(F.relu(self.fc1(flat)))
        return self.fc2(hidden)

    def _count_correct(self, loader, device):
        """Return (correct, total) over `loader` without tracking gradients."""
        n_correct, n_seen = 0, 0
        with torch.no_grad():
            for batch, labels in loader:
                batch, labels = batch.to(device), labels.to(device)
                guesses = self(batch).max(dim=1).indices
                n_seen += labels.size(0)
                n_correct += (guesses == labels).sum().item()
        return n_correct, n_seen

    def train_model(self, train_loader, val_loader, epochs=10, lr=0.001):
        """
        Train with Adam + cross-entropy and validate after each epoch.

        Returns (train_losses, val_accuracies), one entry per epoch; the
        accuracies are percentages.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)

        train_losses, val_accuracies = [], []

        for epoch in range(epochs):
            start_time = time.time()

            # training phase
            self.train()
            running_loss = 0.0
            for batch, labels in train_loader:
                batch, labels = batch.to(device), labels.to(device)

                optimizer.zero_grad()
                batch_loss = criterion(self(batch), labels)
                batch_loss.backward()
                optimizer.step()

                running_loss += batch_loss.item()

            # validation phase
            self.eval()
            correct, total = self._count_correct(val_loader, device)

            # the timed span covers both the training and the validation pass
            epoch_time = time.time() - start_time
            val_accuracy = 100 * correct / total
            avg_loss = running_loss / len(train_loader)

            train_losses.append(avg_loss)
            val_accuracies.append(val_accuracy)

            print(f'Epoch {epoch+1}/{epochs}, Time: {epoch_time:.2f}s, Loss: {avg_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')

        return train_losses, val_accuracies

    def evaluate(self, test_loader):
        """Return the accuracy in percent on test_loader."""
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(device)

        self.eval()
        correct, total = self._count_correct(test_loader, device)
        return 100 * correct / total

def get_augmented_fashion_mnist_loaders(batch_size=128, val_ratio=0.2, seed=551, num_workers=0, root="./data"):
    """
    Build FashionMNIST train/validation/test loaders where only the training
    subset is augmented.

    The official training split is divided into train and validation subsets
    with a seeded random split. Both subsets index the same underlying images,
    but the training view applies random flip/rotation/translation before
    normalization while the validation and test views only normalize.

    Args:
        batch_size: batch size used by all three loaders.
        val_ratio: fraction of the official training split held out for validation.
        seed: seed of the generator that drives the train/validation split.
        num_workers: number of DataLoader worker processes.
        root: directory where FashionMNIST is stored / downloaded.

    Returns:
        (train_loader, val_loader, test_loader)
    """
    mean, std = compute_fashion_mnist_mean_std(root)

    # Augmentations operate on the PIL image, so they come before ToTensor
    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(10),
        transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        transforms.ToTensor(),
        transforms.Normalize((mean,), (std,))
    ])

    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((mean,), (std,))
    ])

    # Two views of the same training images (augmented vs. plain) plus the test split
    train_full = datasets.FashionMNIST(root, train=True, download=True, transform=train_transform)
    val_full   = datasets.FashionMNIST(root, train=True, download=True, transform=test_transform)
    test_full  = datasets.FashionMNIST(root, train=False, download=True, transform=test_transform)

    # The split only needs the dataset length, so no extra transform-less copy is loaded
    total_train = len(train_full)
    val_size = int(val_ratio * total_train)
    train_size = total_train - val_size

    # Split index positions (not samples) so each subset can pick its own transform
    generator = torch.Generator().manual_seed(seed)
    train_indices, val_indices = random_split(range(total_train), [train_size, val_size], generator=generator)

    train_dataset = Subset(train_full, train_indices.indices)
    val_dataset   = Subset(val_full,   val_indices.indices)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  num_workers=num_workers)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False, num_workers=num_workers)
    test_loader  = DataLoader(test_full,     batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_loader, val_loader, test_loader

# Baseline: CNN trained on the non-augmented loaders
print("no augmentation")
train_loader_regular, val_loader_regular, test_loader_regular, _, _ = get_fashion_mnist_loaders(batch_size=128)

cnn_regular = FashionMNISTCNN()
start_time_regular = time.time()
train_losses_regular, val_accuracies_regular = cnn_regular.train_model(
    train_loader_regular, val_loader_regular, epochs=10
)
training_time_regular = time.time() - start_time_regular

test_accuracy_regular = cnn_regular.evaluate(test_loader_regular)
print(f"Regular CNN - Test Accuracy: {test_accuracy_regular:.2f}%")
print(f"Regular CNN - Training Time: {training_time_regular:.2f}s")

# Same architecture trained on the augmented loaders
print("with augmentation")
train_loader_aug, val_loader_aug, test_loader_aug = get_augmented_fashion_mnist_loaders(batch_size=128)

cnn_augmented = FashionMNISTCNN()
start_time_aug = time.time()
train_losses_aug, val_accuracies_aug = cnn_augmented.train_model(
    train_loader_aug, val_loader_aug, epochs=10
)
training_time_aug = time.time() - start_time_aug

test_accuracy_aug = cnn_augmented.evaluate(test_loader_aug)
print(f"Augmented CNN: Test Accuracy: {test_accuracy_aug:.2f}%")
print(f"Augmented CNN: Training Time: {training_time_aug:.2f}s")

# Performance Comparison
print("\nPERFORMANCE COMPARISON")
print(f"Regular CNN - Test Accuracy: {test_accuracy_regular:.2f}%")
print(f"Augmented CNN - Test Accuracy: {test_accuracy_aug:.2f}%")
print(f"Regular CNN - Training Time: {training_time_regular:.2f}s")
print(f"Augmented CNN - Training Time: {training_time_aug:.2f}s")

accuracy_difference = test_accuracy_aug - test_accuracy_regular
time_difference = training_time_aug - training_time_regular

print(f"\nAccuracy Difference: {accuracy_difference:+.2f}%")
print(f"Training Time Difference: {time_difference:+.2f}s")

# A tie is reported separately so that equal accuracy is not labelled a reduction
if accuracy_difference > 0:
    print("Data augmentation IMPROVED accuracy")
elif accuracy_difference < 0:
    print("Data augmentation REDUCED accuracy")
else:
    print("Data augmentation did NOT CHANGE accuracy")

if time_difference > 0:
    print("Data augmentation INCREASED training time")
else:
    print("Data augmentation DECREASED training time")

# Additional analysis: validation accuracy over the last (up to) three epochs.
# enumerate starts at the zero-based index of the first shown epoch, so i+1 is
# the one-based epoch number; max(..., 0) covers runs shorter than three epochs.
print("\nADDITIONAL ANALYSIS")
print("Regular CNN Final Validation Accuracies:")
for i, acc in enumerate(val_accuracies_regular[-3:], max(len(val_accuracies_regular) - 3, 0)):
    print(f"  Epoch {i+1}: {acc:.2f}%")

print("Augmented CNN Final Validation Accuracies:")
for i, acc in enumerate(val_accuracies_aug[-3:], max(len(val_accuracies_aug) - 3, 0)):
    print(f"  Epoch {i+1}: {acc:.2f}%")

no augmentation
Epoch 1/10, Time: 13.88s, Loss: 0.5270, Val Accuracy: 88.08%
Epoch 2/10, Time: 12.50s, Loss: 0.3355, Val Accuracy: 89.77%
Epoch 3/10, Time: 12.63s, Loss: 0.2870, Val Accuracy: 90.07%
Epoch 4/10, Time: 12.82s, Loss: 0.2564, Val Accuracy: 91.27%
Epoch 5/10, Time: 12.54s, Loss: 0.2313, Val Accuracy: 91.95%
Epoch 6/10, Time: 13.10s, Loss: 0.2113, Val Accuracy: 91.85%
Epoch 7/10, Time: 12.70s, Loss: 0.1952, Val Accuracy: 91.78%
Epoch 8/10, Time: 12.49s, Loss: 0.1770, Val Accuracy: 92.17%
Epoch 9/10, Time: 12.43s, Loss: 0.1655, Val Accuracy: 92.19%
Epoch 10/10, Time: 12.38s, Loss: 0.1499, Val Accuracy: 92.62%
Regular CNN - Test Accuracy: 92.19%
Regular CNN - Training Time: 127.47s
Epoch 1/10, Time: 12.61s, Loss: 0.7838, Val Accuracy: 82.41%
Epoch 2/10, Time: 13.51s, Loss: 0.5492, Val Accuracy: 84.81%
Epoch 3/10, Time: 11.99s, Loss: 0.4810, Val Accuracy: 86.96%
Epoch 4/10, Time: 11.98s, Loss: 0.4397, Val Accuracy: 88.20%
Epoch 5/10, Time: 11.80s, Loss: 0.4169, Val Accuracy: 87

In [16]:
# 3.8 - Pretrained CNN with frozen convolutional layers and trainable fully connected head

import time
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, datasets, transforms
from torch.utils.data import DataLoader, random_split, Subset

class PretrainedResNetFashion(nn.Module):
    """ImageNet-pretrained ResNet18 with frozen weights and a new trainable FC head."""

    def __init__(self, fc_hidden_layers=None, num_classes=10, dropout=0.5):
        """
        Wrap a ResNet18 pretrained on ImageNet.
        Freeze all pretrained parameters and replace the final fully connected
        layer with a new head defined by fc_hidden_layers.

        Args:
            fc_hidden_layers: iterable of hidden widths for the new head, or
                None for a single linear layer.
            num_classes: number of output logits.
            dropout: dropout probability applied after each hidden layer's ReLU.

        Examples for fc_hidden_layers:
          []          -> 512 -> num_classes
          [256]       -> 512 -> 256 -> num_classes
          [256, 256]  -> 512 -> 256 -> 256 -> num_classes
        """
        super().__init__()
        # Copy into a list so tuples/generators work and the caller's object is never shared
        hidden_dims = [] if fc_hidden_layers is None else list(fc_hidden_layers)

        # Load pretrained ResNet18 (handle both newer and older torchvision APIs)
        try:
            resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        except AttributeError:
            resnet = models.resnet18(pretrained=True)

        # Freeze every pretrained parameter; the head created below is built
        # afterwards, so its parameters keep the default requires_grad=True.
        # NOTE(review): requires_grad=False only stops gradient updates. BatchNorm
        # running statistics are buffers and still update while the module is in
        # train() mode -- confirm this is intended for a "frozen" backbone.
        for param in resnet.parameters():
            param.requires_grad = False

        # Build the head: (Linear -> ReLU -> Dropout) per hidden width, then the classifier
        head_layers = []
        prev_dim = resnet.fc.in_features
        for hidden_dim in hidden_dims:
            head_layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_dim = hidden_dim
        head_layers.append(nn.Linear(prev_dim, num_classes))

        resnet.fc = nn.Sequential(*head_layers)
        self.model = resnet

    def forward(self, x):
        """Run the wrapped ResNet (backbone + new head) and return its output."""
        return self.model(x)


def get_pretrained_aug_fashion_mnist_loaders(batch_size=128, val_ratio=0.2, seed=551, num_workers=0, root="./data"):
    """
    Data loaders for pretrained ResNet on FashionMNIST with data augmentation
    consistent with question 3.5.

    - Augmentation (train only): random horizontal flip, rotation, translation
    - Convert 1-channel FashionMNIST images to 3-channel by duplication
    - Normalize with dataset mean/std expanded to 3 channels

    Args:
        batch_size: batch size used by all three loaders.
        val_ratio: fraction of the official training split held out for validation.
        seed: seed of the generator that drives the train/validation split.
        num_workers: number of DataLoader worker processes.
        root: directory where FashionMNIST is stored / downloaded.

    Returns:
        (train_loader, val_loader, test_loader)
    """
    mean, std = compute_fashion_mnist_mean_std(root)
    mean_3ch = (mean, mean, mean)
    std_3ch = (std, std, std)

    # Grayscale(num_output_channels=3) replicates the single channel into r == g == b.
    # It replaces a lambda-based transform: lambdas cannot be pickled, which breaks
    # DataLoader workers (num_workers > 0) on platforms that spawn worker processes.
    to_three_channels = transforms.Grayscale(num_output_channels=3)

    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(10),
        transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        to_three_channels,
        transforms.ToTensor(),
        transforms.Normalize(mean_3ch, std_3ch)
    ])

    test_transform = transforms.Compose([
        to_three_channels,
        transforms.ToTensor(),
        transforms.Normalize(mean_3ch, std_3ch)
    ])

    # Two views of the same training images (augmented vs. plain) plus the test split
    train_full = datasets.FashionMNIST(root, train=True, download=True, transform=train_transform)
    val_full   = datasets.FashionMNIST(root, train=True, download=True, transform=test_transform)
    test_full  = datasets.FashionMNIST(root, train=False, download=True, transform=test_transform)

    # The split only needs the dataset length, so no extra transform-less copy is loaded
    total_train = len(train_full)
    val_size = int(val_ratio * total_train)
    train_size = total_train - val_size

    # Split index positions (not samples) so each subset can pick its own transform
    generator = torch.Generator().manual_seed(seed)
    train_indices, val_indices = random_split(range(total_train), [train_size, val_size], generator=generator)

    train_dataset = Subset(train_full, train_indices.indices)
    val_dataset   = Subset(val_full,   val_indices.indices)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  num_workers=num_workers)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False, num_workers=num_workers)
    test_loader  = DataLoader(test_full,     batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_loader, val_loader, test_loader


def train_pretrained_model(model, train_loader, val_loader, epochs=5, lr=1e-3):
    """
    Train only the parameters of `model` that have requires_grad=True
    (the fully connected head of a pretrained model) with Adam and
    cross-entropy loss, validating after every epoch.

    Args:
        model: nn.Module to train; moved to CUDA when available, else CPU.
        train_loader: iterable of (images, labels) training batches.
        val_loader: iterable of (images, labels) validation batches.
        epochs: number of passes over train_loader.
        lr: Adam learning rate.

    Returns:
        (train_losses, val_accuracies): per-epoch lists. The loss is the mean
        of the per-batch losses; accuracies are percentages. An epoch whose
        loader yields no batches/samples records 0.0 instead of raising
        ZeroDivisionError.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    # Frozen parameters are excluded so the optimizer only tracks the trainable ones
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(trainable_params, lr=lr)

    train_losses = []
    val_accuracies = []

    for epoch in range(epochs):
        # Training phase
        # NOTE(review): train() also puts any BatchNorm layers of a frozen backbone
        # into training mode -- confirm that is intended.
        model.train()
        running_loss = 0.0
        num_batches = 0  # counted manually so loaders without __len__ also work
        correct = 0
        total = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1
            # detach: the training accuracy bookkeeping must not extend the autograd graph
            predicted = outputs.detach().argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        avg_loss = running_loss / num_batches if num_batches else 0.0
        train_acc = 100.0 * correct / total if total else 0.0

        # Validation phase
        model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                predicted = model(images).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # An empty validation loader (e.g. val_ratio=0) must not abort training
        val_acc = 100.0 * correct / total if total else 0.0

        train_losses.append(avg_loss)
        val_accuracies.append(val_acc)

        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, "
              f"Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%")

    return train_losses, val_accuracies


def evaluate_pretrained(model, test_loader):
    """
    Evaluate pretrained model on the test set and return accuracy in percent.

    The model is moved to CUDA when available (CPU otherwise), switched to
    eval mode, and run without gradient tracking.

    Args:
        model: nn.Module producing per-class scores of shape (batch, classes).
        test_loader: iterable of (images, labels) batches.

    Returns:
        float: percentage of correctly classified samples in [0, 100].
        Returns 0.0 when the loader yields no samples (instead of raising
        ZeroDivisionError).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            # Predicted class = index of the largest score along the class axis
            predicted = model(images).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # An empty loader has no defined accuracy; report 0.0 rather than crash
    if total == 0:
        return 0.0

    return 100.0 * correct / total


# Compare several fully connected head depths on the frozen ResNet18 backbone.
# Global names are kept as-is because other cells may read them.

train_loader_pre, val_loader_pre, test_loader_pre = get_pretrained_aug_fashion_mnist_loaders(
    batch_size=128
)

# Head name -> hidden layer widths placed between the backbone features and the logits
fc_head_configs = dict(
    fc_512_10=[],
    fc_512_256_10=[256],
    fc_512_256_256_10=[256, 256],
)

pretrained_results = {}

for name, fc_layers in fc_head_configs.items():
    print(f"\nTraining pretrained ResNet18 with head configuration: {name}, layers: {fc_layers}")
    model_pre = PretrainedResNetFashion(fc_hidden_layers=fc_layers, num_classes=10)

    # Wall-clock time of the whole training call (validation passes included)
    start_time = time.time()
    train_losses_pre, val_accs_pre = train_pretrained_model(
        model_pre, train_loader_pre, val_loader_pre, epochs=10, lr=1e-3
    )
    train_time = time.time() - start_time

    test_acc_pre = evaluate_pretrained(model_pre, test_loader_pre)
    # Best epoch on validation; falls back to 0.0 if no epoch was recorded
    best_val_acc = max(val_accs_pre, default=0.0)

    pretrained_results[name] = dict(
        fc_layers=fc_layers,
        best_val_acc=best_val_acc,
        test_acc=test_acc_pre,
        train_time=train_time,
    )

    print(f"Head {name} - Best Val Acc: {best_val_acc:.2f}%, "
          f"Test Acc: {test_acc_pre:.2f}%, Training Time: {train_time:.2f}s")

# Pick the head whose best validation accuracy is highest (first one wins a tie)
best_head_name, best_head_info = max(
    pretrained_results.items(), key=lambda item: item[1]["best_val_acc"]
)

print("\nBest pretrained head configuration (by validation accuracy):")
print(best_head_name, best_head_info)

print(f"\nBest pretrained model (3.8) - Test Accuracy: {best_head_info['test_acc']:.2f}%")
print(f"Best pretrained model (3.8) - Training Time: {best_head_info['train_time']:.2f}s")

# 3.5 MLP with augmentation: only reported when an earlier cell left its results behind.
# NOTE(review): test_acc_aug is scaled by 100 here, so it is presumably a fraction -- confirm.
if all(key in globals() for key in ('test_acc_aug', 'training_time_35')):
    print(f"\nBest MLP with augmentation (3.5) - Test Accuracy: {test_acc_aug * 100:.2f}%")
    print(f"Best MLP with augmentation (3.5) - Training Time: {training_time_35:.2f}s")

# 3.7 CNN with augmentation: likewise optional, taken from the earlier CNN experiment
if all(key in globals() for key in ('test_accuracy_aug', 'training_time_aug')):
    print(f"\nCNN with augmentation (3.7) - Test Accuracy: {test_accuracy_aug:.2f}%")
    print(f"CNN with augmentation (3.7) - Training Time: {training_time_aug:.2f}s")



Training pretrained ResNet18 with head configuration: fc_512_10, layers: []
Epoch 1/10, Loss: 1.3405, Train Acc: 53.73%, Val Acc: 64.08%
Epoch 2/10, Loss: 1.1236, Train Acc: 60.74%, Val Acc: 66.34%
Epoch 3/10, Loss: 1.0827, Train Acc: 62.04%, Val Acc: 65.58%
Epoch 4/10, Loss: 1.0599, Train Acc: 62.92%, Val Acc: 67.11%
Epoch 5/10, Loss: 1.0574, Train Acc: 62.74%, Val Acc: 67.76%
Epoch 6/10, Loss: 1.0434, Train Acc: 63.09%, Val Acc: 67.59%
Epoch 7/10, Loss: 1.0416, Train Acc: 63.35%, Val Acc: 66.78%
Epoch 8/10, Loss: 1.0316, Train Acc: 63.83%, Val Acc: 68.26%
Epoch 9/10, Loss: 1.0346, Train Acc: 63.23%, Val Acc: 68.16%
Epoch 10/10, Loss: 1.0418, Train Acc: 63.19%, Val Acc: 68.05%
Head fc_512_10 - Best Val Acc: 68.26%, Test Acc: 67.65%, Training Time: 138.74s

Training pretrained ResNet18 with head configuration: fc_512_256_10, layers: [256]
Epoch 1/10, Loss: 1.3268, Train Acc: 52.70%, Val Acc: 65.40%
Epoch 2/10, Loss: 1.1304, Train Acc: 59.02%, Val Acc: 67.28%
Epoch 3/10, Loss: 1.0769, 