In [3]:
import os
import torch
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
import numpy as np

def compute_fashion_mnist_mean_std(root="./data"):
    """
    Compute the global pixel mean and standard deviation of the FashionMNIST
    training split.

    The raw uint8 images are cast to float and divided by 255 so that the
    statistics are on the same [0, 1] scale that ``transforms.ToTensor``
    produces.

    Args:
        root: Directory holding the dataset (it is downloaded if missing).

    Returns:
        (mean, std) as Python floats, taken over every pixel of every
        training image.
    """
    # No transform: only the underlying uint8 tensor in `.data` is needed.
    dataset = datasets.FashionMNIST(root=root, train=True, download=True, transform=None)

    pixels = dataset.data.float() / 255.0
    return pixels.mean().item(), pixels.std().item()


def get_fashion_mnist_datasets(root="./data", val_ratio=0.2, seed=551):
    """
    Build normalized FashionMNIST train, validation, and test datasets.

    Normalization statistics come from ``compute_fashion_mnist_mean_std`` and
    are applied to all three splits. The official training split is divided
    into train and validation subsets with a seeded ``random_split``; the
    official test split is used as the test set.

    Args:
        root: Dataset directory (downloaded if missing).
        val_ratio: Fraction of the official training split held out for validation.
        seed: Seed of the generator that drives the train/validation split.

    Returns:
        (train_dataset, val_dataset, test_dataset, mean, std)
    """
    mean, std = compute_fashion_mnist_mean_std(root)

    def make_transform():
        # ToTensor maps [0, 255] -> [0, 1]; Normalize then standardizes
        # with the statistics computed above.
        return transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((mean,), (std,)),
        ])

    official_train = datasets.FashionMNIST(
        root=root, train=True, download=True, transform=make_transform()
    )
    test_dataset = datasets.FashionMNIST(
        root=root, train=False, download=True, transform=make_transform()
    )

    # Carve the validation subset out of the official training split.
    n_total = len(official_train)
    n_val = int(val_ratio * n_total)
    n_train = n_total - n_val

    split_rng = torch.Generator().manual_seed(seed)
    train_dataset, val_dataset = random_split(
        official_train, [n_train, n_val], generator=split_rng
    )

    return train_dataset, val_dataset, test_dataset, mean, std


def get_fashion_mnist_loaders(
    root="./data",
    val_ratio=0.2,
    batch_size=128,
    num_workers=2,
    seed=551
):
    """
    Build DataLoaders for the normalized train, validation, and test datasets.

    Thin wrapper around ``get_fashion_mnist_datasets``; the computed
    normalization statistics are passed through to the caller.

    Args:
        root: Dataset directory.
        val_ratio: Fraction of the official training split used for validation.
        batch_size: Batch size shared by all three loaders.
        num_workers: Worker process count shared by all three loaders.
        seed: Seed for the train/validation split.

    Returns:
        (train_loader, val_loader, test_loader, mean, std)
    """
    train_dataset, val_dataset, test_dataset, mean, std = get_fashion_mnist_datasets(
        root=root, val_ratio=val_ratio, seed=seed
    )

    shared_kwargs = dict(batch_size=batch_size, num_workers=num_workers, pin_memory=True)

    # Only the training loader shuffles; the evaluation loaders keep dataset order.
    train_loader = DataLoader(train_dataset, shuffle=True, **shared_kwargs)
    val_loader = DataLoader(val_dataset, shuffle=False, **shared_kwargs)
    test_loader = DataLoader(test_dataset, shuffle=False, **shared_kwargs)

    return train_loader, val_loader, test_loader, mean, std


if __name__ == "__main__":
    # Smoke test for the data pipeline: build the three loaders with default
    # settings and report their sizes plus the normalization statistics.
    train_loader, val_loader, test_loader, mean, std = get_fashion_mnist_loaders()

    # len(loader) is the number of batches, not the number of samples.
    print(f"Train batches: {len(train_loader)}")
    print(f"Validation batches: {len(val_loader)}")
    print(f"Test batches: {len(test_loader)}")
    print(f"Computed mean: {mean:.4f}, std: {std:.4f}")

    # Inspect one batch shape
    images, labels = next(iter(train_loader))
    # images shape: [batch_size, 1, 28, 28]
    print(f"Batch image tensor shape: {images.shape}")
    print(f"Batch labels tensor shape: {labels.shape}")


Train batches: 375
Validation batches: 94
Test batches: 79
Computed mean: 0.2860, std: 0.3530


Python(35540) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35541) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35542) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35543) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Batch image tensor shape: torch.Size([128, 1, 28, 28])
Batch labels tensor shape: torch.Size([128])


In [4]:
import numpy as np

def l2_loss(y, yh):
    """Element-wise half squared error between target ``y`` and prediction ``yh``."""
    residual = yh - y
    return 0.5 * residual**2

def l2_loss_grad(y, yh):
    """Derivative of the half squared error 0.5 * (yh - y)**2 with respect to ``yh``."""
    residual = yh - y
    return residual

def cross_entropy(y, yh):
    """
    Cross-entropy between target ``y`` and predicted probabilities ``yh``.

    A small constant (1e-12) is added inside the log so that a predicted
    probability of exactly 0 does not produce -inf.
    """
    log_probs = np.log(yh + 1e-12)
    return -np.sum(y * log_probs)

def cross_entropy_grad(y, yh):
    """
    Gradient of the cross-entropy loss with respect to the pre-softmax input z.

    Note: this is only valid for dL/dz where L = loss(softmax(z)); it is not
    the derivative with respect to the probabilities ``yh`` themselves.
    """
    delta = yh - y
    return delta

def relu(x):
    """Rectified linear unit: element-wise max(0, x)."""
    floor = 0
    return np.maximum(floor, x)
   
def relu_grad(x):
    """Derivative of ReLU: 1.0 where x > 0, otherwise 0.0 (including at x == 0)."""
    is_positive = x > 0
    return is_positive.astype(float)

def leaky_relu(x, alpha=0.1):
    """Leaky ReLU computed as the element-wise max of ``alpha * x`` and ``x``."""
    scaled = alpha * x
    return np.maximum(scaled, x)

def leaky_relu_grad(x, alpha=0.1):
    """
    Derivative of leaky ReLU: ``alpha`` where x < 0, otherwise 1.

    The result is always floating point. Building it with ``np.ones_like(x)``
    would inherit an integer dtype from integer input and silently truncate
    ``alpha`` to 0.

    Args:
        x: Scalar or array of pre-activation values.
        alpha: Slope used for negative inputs.

    Returns:
        Float array with the same shape as ``x``.
    """
    return np.where(np.asarray(x) < 0, alpha, 1.0)

def sigmoid(x):
    """
    Numerically stable logistic sigmoid 1 / (1 + exp(-x)).

    Only exp(-|x|) is evaluated, which lies in (0, 1], so large-magnitude
    inputs cannot overflow. For x >= 0 the usual form 1 / (1 + e) is used;
    for x < 0 the algebraically equivalent e / (1 + e) is used.

    Args:
        x: Scalar or array.

    Returns:
        Sigmoid of ``x`` with the same shape (a NumPy scalar for scalar input).
    """
    x = np.asarray(x, dtype=float)
    e = np.exp(-np.abs(x))
    out = np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # `[()]` turns a 0-d result back into a NumPy scalar and is a no-op view
    # for arrays with one or more dimensions.
    return out[()]

def sigmoid_grad(x):
    """Derivative of the sigmoid, expressed through its output: s * (1 - s)."""
    activation = sigmoid(x)
    return activation * (1 - activation)
  
def tanh(x):
    """Hyperbolic tangent activation (delegates to ``np.tanh``)."""
    activation = np.tanh(x)
    return activation

def tanh_grad(x):
    """Derivative of tanh: 1 - tanh(x)**2."""
    activation = np.tanh(x)
    return 1 - activation * activation

def linear(x):
    """Identity activation: returns the input unchanged."""
    passthrough = x
    return passthrough

def linear_grad(x):
    """Derivative of the identity activation: an array of ones shaped like ``x``."""
    ones = np.ones_like(x)
    return ones

def softmax(x, axis=-1):
    """
    Numerically stable softmax along ``axis``.

    For a 1-D input this is the usual exp(x - max(x)) / sum(exp(x - max(x))).
    For batched input (e.g. shape (N, classes)) each slice along ``axis`` is
    normalized on its own, rather than normalizing over the whole array.

    Args:
        x: Scalar or array of logits.
        axis: Axis holding the class scores (default: last axis).

    Returns:
        Array of the same shape as ``x`` whose entries sum to 1 along ``axis``.
    """
    x = np.asarray(x)
    if x.ndim == 0:
        # A scalar has no axis to reduce over; reduce over everything instead.
        axis = None
    # Subtracting the max leaves the result unchanged but keeps exp() from overflowing.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

class MLP:
    """
    Fully connected feed-forward network trained with per-sample backpropagation.

    Attributes:
        dims: list[int] - layer sizes [n_input, hidden..., n_output].
        activation_fns: list of callables, one per weight layer
            (len(dims) - 1 entries: every hidden layer plus the output layer).
        W: list of arrays; W[i] has shape (dims[i], dims[i + 1]) and is drawn
            from a normal distribution with mean 0 and standard deviation 1.
        b: list of arrays; b[i] has shape (dims[i + 1],) and starts at zero.
        seed: value given to np.random.seed for reproducibility, or None to
            leave the global NumPy RNG untouched.
    """

    def __init__(self, dims, activation_fns, seed=None):
        """
        Args:
            dims: Layer sizes, input first and output last.
            activation_fns: One activation callable per weight layer.
            seed: Optional seed for the global NumPy RNG.

        Raises:
            RuntimeError: if len(activation_fns) != len(dims) - 1.
        """
        self.dims = dims
        self.seed = seed
        # Compare against None so that seed=0 is honoured too.
        if seed is not None:
            np.random.seed(seed)

        dims_len = len(dims)
        activation_fns_len = len(activation_fns)
        if dims_len - 1 != activation_fns_len:
            raise RuntimeError(f"Length {dims_len} of dims does not match length {activation_fns_len} of activation_fns")

        self.activation_fns = activation_fns

        # One weight matrix per consecutive pair of layer sizes.
        self.W = [
            np.random.normal(loc=0.0, scale=1.0, size=(n_in, n_out))
            for n_in, n_out in zip(dims[:-1], dims[1:])
        ]
        self.b = [np.zeros(n_out) for n_out in dims[1:]]

    def feed_forward(self, x):
        """
        Run a single sample through the network.

        Args:
            x: 1-D input vector of length dims[0] (for 1-D input the ``.T``
               below is a no-op).

        Returns:
            (Z, A) where Z[0] = A[0] = the input as an array, Z[l] is the
            pre-activation of layer l and A[l] its activation; A[-1] is the
            network output.
        """
        a = np.array(x)  # "input activation"
        Z = [a]
        A = [a]
        for W, b, fn in zip(self.W, self.b, self.activation_fns):
            z = a.T @ W + b
            a = fn(z)
            Z.append(z)
            A.append(a)
        return Z, A

    def l1_penalty(self):
        """Sum of absolute values across all weight matrices."""
        total = 0.0
        for W in self.W:
            total += np.sum(np.abs(W))
        return total

    def l2_penalty(self):
        """Half the squared L2 norm across all weight matrices (0.5 * ||W||^2)."""
        total = 0.0
        for W in self.W:
            total += 0.5 * np.sum(W * W)
        return total

    def l1_grad(self, weights):
        """Gradient of the L1 penalty for each array in ``weights``: sign(W)."""
        return [np.sign(W) for W in weights]

    def l2_grad(self, weights):
        """Gradient of the L2 penalty: d/dW (0.5 * W^2) = W, so ``weights`` is returned as is."""
        return weights

    def fit(self, X, Y, batch_size, lr, epochs, l1_lambda=0.0, l2_lambda=0.0):
        """
        Train with (mini-batch) gradient descent.

        - Supports SGD/minibatch/full-batch via batch_size.
        - Softmax+CE: delta_L = y_hat - y (no extra f').
        - MSE (or any other): delta_L = (y_hat - y) * f'(z_L).
        Assumes feed_forward(x) -> (Z, A) with:
          A[0] = x, Z[1..L], A[L] = y_hat

        Args:
            X: Array of samples, indexed along axis 0.
            Y: Array of targets, indexed along axis 0 in step with X.
            batch_size: Number of samples per parameter update.
            lr: Learning rate.
            epochs: Number of passes over the data.
            l1_lambda: Strength of the L1 weight penalty (0 disables it).
            l2_lambda: Strength of the L2 weight penalty (0 disables it).

        Progress is printed every 10th epoch. Weights and biases are updated
        in place; nothing is returned.
        """
        activation_to_grad_map = {
            relu: relu_grad,
            sigmoid: sigmoid_grad,
            tanh: tanh_grad,
            linear: linear_grad,
            leaky_relu: leaky_relu_grad,
        }

        N = X.shape[0]
        L = len(self.dims) - 1  # number of weight layers
        last_act = self.activation_fns[-1]

        for epoch in range(epochs):
            if epoch % 10 == 0:
                print(f"Processing epoch {epoch + 1}...")

            # shuffle each epoch
            perm = np.random.permutation(N)
            Xs, Ys = X[perm], Y[perm]

            for start in range(0, N, batch_size):
                end = min(start + batch_size, N)
                m = end - start

                # gradient accumulators
                grad_W = [np.zeros_like(W) for W in self.W]
                grad_b = [np.zeros_like(b) for b in self.b]

                # per-sample backprop, sum then average
                for x, y in zip(Xs[start:end], Ys[start:end]):
                    Z, A = self.feed_forward(x)

                    # delta_layers[l] is the delta of layer l for l = 1..L;
                    # index 0 stays None because the input layer has no delta.
                    delta_layers = [None] * (L + 1)

                    # Output layer delta
                    if last_act is softmax:
                        delta_layers[L] = A[-1] - y
                    else:
                        dz_L = activation_to_grad_map[last_act](Z[-1])  # f'(z_L)
                        delta_layers[L] = (A[-1] - y) * dz_L

                    # Hidden layers: l = L-1 .. 1
                    for l in range(L - 1, 0, -1):
                        act = self.activation_fns[l - 1]        # activation after W[l-1]
                        dz = activation_to_grad_map[act](Z[l])  # f'(z_l)
                        delta_layers[l] = (self.W[l] @ delta_layers[l + 1]) * dz

                    # Gradients for each weight layer i = 0..L-1 (maps layer i -> i+1)
                    for i in range(L):
                        grad_W[i] += np.outer(A[i], delta_layers[i + 1])
                        grad_b[i] += delta_layers[i + 1]

                # Apply averaged batch gradients
                inv_m = 1.0 / m
                for i in range(L):
                    data_grad_W = grad_W[i] * inv_m
                    data_grad_b = grad_b[i] * inv_m
                    # Add regularization gradients (weights only, not biases)
                    if l1_lambda > 0:
                        data_grad_W += l1_lambda * np.sign(self.W[i])
                    if l2_lambda > 0:
                        data_grad_W += l2_lambda * self.W[i]

                    self.W[i] -= lr * data_grad_W
                    self.b[i] -= lr * data_grad_b

    def _compute_batch_loss(self, X_batch, Y_batch, l1_lambda=0.0, l2_lambda=0.0):
        """
        Average per-sample loss over a batch plus the regularization terms.

        The L2 term is l2_lambda * 0.5 * ||W||^2, whose gradient is
        l2_lambda * W - the same gradient that ``fit`` applies.
        """
        total_loss = 0.0
        for x, y in zip(X_batch, Y_batch):
            total_loss += self.loss(x, y)

        avg_data_loss = total_loss / len(X_batch)

        reg_loss = 0.0
        if l1_lambda > 0:
            reg_loss += l1_lambda * self.l1_penalty()
        if l2_lambda > 0:
            reg_loss += l2_lambda * self.l2_penalty()

        return avg_data_loss + reg_loss

    def predict(self, x):
        """Return the index of the largest output for sample ``x`` (class id for softmax)."""
        Z, A = self.feed_forward(x)
        return np.argmax(A[-1])

    def loss(self, x, y, reg=None):
        """
        Loss of a single sample: cross-entropy when the output activation is
        softmax, summed L2 loss otherwise. ``reg`` is accepted but not used.
        """
        last_act = self.activation_fns[-1]
        Z, A = self.feed_forward(x)

        if last_act is softmax:
            return cross_entropy(y, A[-1])
        return np.sum(l2_loss(y, A[-1]))

    def evaluate_acc(self, X, y):
        """Fraction of samples in X whose predicted class equals the integer label in y."""
        correct = 0
        for i in range(len(X)):
            if self.predict(X[i]) == y[i]:
                correct += 1
        return correct / len(X)
     

# Smoke test: one 5-feature sample with a 3-class one-hot target.
# Both X and Y are 2-D (n_samples, n_features / n_classes) so that fit()
# pairs the sample with its full target vector rather than with a scalar.
mlp = MLP([5, 10, 10, 3], [sigmoid, linear, softmax])
mlp.fit(np.array([[1, 2, 3, 4, 5]]), np.array([[0, 1, 0]]), 1, 0.001, 100)




Processing epoch 1...
Processing epoch 11...
Processing epoch 21...
Processing epoch 31...
Processing epoch 41...
Processing epoch 51...
Processing epoch 61...
Processing epoch 71...
Processing epoch 81...
Processing epoch 91...


In [5]:
# 3.1

def convert_loader_to_np(loader):
    """
    Drain a loader of (images, labels) tensor batches into two NumPy arrays.

    Each image batch is flattened to (batch, -1) before conversion, and the
    batches are concatenated along axis 0 in the order the loader yields them.

    Returns:
        (X, y): flattened images and their labels.
    """
    flat_batches, label_batches = [], []

    for batch_images, batch_labels in loader:
        n_in_batch = batch_images.size(0)
        flat_batches.append(batch_images.view(n_in_batch, -1).numpy())
        label_batches.append(batch_labels.numpy())

    return np.concatenate(flat_batches, axis=0), np.concatenate(label_batches, axis=0)

def labels_to_onehot(y, num_classes=10):
    """
    One-hot encode integer class labels.

    Returns a float array of shape (len(y), num_classes) that is 1 at
    [i, y[i]] and 0 everywhere else.
    """
    n_samples = len(y)
    encoded = np.zeros((n_samples, num_classes))
    row_index = np.arange(n_samples)
    encoded[row_index, y] = 1
    return encoded

# Build the normalized loaders and flatten every split into NumPy arrays.
# X_* hold flattened images, y_* integer labels, y_*_onehot the one-hot targets.
train_loader, val_loader, test_loader, mean, std = get_fashion_mnist_loaders(
        batch_size=128, 
        num_workers=2
    )

X_train, y_train = convert_loader_to_np(train_loader)
y_train_onehot = labels_to_onehot(y_train)

X_val, y_val = convert_loader_to_np(val_loader)
y_val_onehot = labels_to_onehot(y_val)

X_test, y_test = convert_loader_to_np(test_loader)
y_test_onehot = labels_to_onehot(y_test)

# Model 1: no hidden layer (784 -> 10 with softmax); batch size 64, lr 0.001, 10 epochs.
mlp_no_layers = MLP([784, 10], [softmax])
mlp_no_layers.fit(X_train, y_train_onehot, 64, 0.001, 10)

# evaluate_acc compares predictions with the integer labels, not the one-hot targets.
train_acc1 = mlp_no_layers.evaluate_acc(X_train, y_train)
test_acc1 = mlp_no_layers.evaluate_acc(X_test, y_test)
print(f"Model 1 - Train Accuracy: {train_acc1:.4f}, Test Accuracy: {test_acc1:.4f}")

# Models 2 and 3: one and two ReLU hidden layers of 256 units.
# Both are constructed (weights drawn) before either one is trained.
mlp_1_layer = MLP([784, 256, 10], [relu, softmax])
mlp_2_layer = MLP([784, 256, 256, 10], [relu, relu, softmax])

# Note: these two use lr 0.0001, ten times smaller than Model 1.
mlp_1_layer.fit(X_train, y_train_onehot, 64, 0.0001, 10)
train_acc2 = mlp_1_layer.evaluate_acc(X_train, y_train)
test_acc2 = mlp_1_layer.evaluate_acc(X_test, y_test)

print(f"Model 2 - Train Accuracy: {train_acc2:.4f}, Test Accuracy: {test_acc2:.4f}")

mlp_2_layer.fit(X_train, y_train_onehot, 64, 0.0001, 10)
# train_acc3 / test_acc3 are reused as the baseline in later cells.
train_acc3 = mlp_2_layer.evaluate_acc(X_train, y_train)
test_acc3 = mlp_2_layer.evaluate_acc(X_test, y_test)
print(f"Model 3 - Train Accuracy: {train_acc3:.4f}, Test Accuracy: {test_acc3:.4f}")

Python(35544) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35545) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35546) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35547) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35549) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35550) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Processing epoch 1...
Model 1 - Train Accuracy: 0.5963, Test Accuracy: 0.5950
Processing epoch 1...
Model 2 - Train Accuracy: 0.6715, Test Accuracy: 0.6631
Processing epoch 1...
Model 3 - Train Accuracy: 0.7964, Test Accuracy: 0.7779


In [14]:
#3.2

# Same 784-256-256-10 layout as Model 3, but with tanh and leaky ReLU hidden activations.
last_model_redo = MLP(
    dims=[784, 256, 256, 10],
    activation_fns=[tanh, leaky_relu, softmax],
)
last_model_redo.fit(X_train, y_train_onehot, batch_size=64, lr=0.001, epochs=10)

test_acc = last_model_redo.evaluate_acc(X_test, y_test)
print(f"New model - Test accuracy: {test_acc:.4f}")

Processing epoch 1...
New model - Test accuracy: 0.7303


3.2 Analysis:
With the same low learning rate as in 3.1 (0.0001), the new model reached a test accuracy of only 0.589, worse than the original ReLU model. This is likely because the gradients from tanh already diminish the weight updates during learning.
To verify our assumption that the learning rate should be increased for networks using sigmoid or tanh, whose gradients are very small, we raised the learning rate to 0.001.

Our intuition is confirmed: with the higher learning rate of 0.001, test accuracy rises to 0.7303.




In [13]:
# 3.3

# L1-regularized copy of the two-hidden-layer ReLU network
model1 = MLP(dims=[784, 256, 256, 10], activation_fns=[relu, relu, softmax])
model1.fit(X_train, y_train_onehot, batch_size=64, lr=0.0001, epochs=10, l1_lambda=0.001)
test_acc = model1.evaluate_acc(X_test, y_test)
print(f"L1 - Test accuracy: {test_acc:.4f}")

# L2-regularized copy, otherwise identical settings
model2 = MLP(dims=[784, 256, 256, 10], activation_fns=[relu, relu, softmax])
model2.fit(X_train, y_train_onehot, batch_size=64, lr=0.0001, epochs=10, l2_lambda=0.001)
test_acc = model2.evaluate_acc(X_test, y_test)
print(f"L2 - Test accuracy: {test_acc:.4f}")


Processing epoch 1...
New model - Test accuracy: 0.7732
Processing epoch 1...
New model - Test accuracy: 0.7742


3.3 analysis
With L1 or L2 regularization, our model performed about the same as it did without regularization. This is likely because the unregularized model does not overfit.
If, on the other hand, the model had performed better on the test set with regularization, that would indicate that the unregularized model had overfitted to the training data.
Conclusion: Regularization decreases accuracy very slightly if the model is not overfitting. However, if the model is overfitting, regularization can increase test accuracy greatly.

In [6]:
#3.4

# 3.4 - Train on unregularized images
import torch
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, random_split

def get_unregularized_fashion_mnist_loaders(batch_size=128, val_ratio=0.2, seed=551):
    """
    Build FashionMNIST loaders whose images are NOT normalized (question 3.4).

    The only transform is ``ToTensor``, so pixels are scaled to [0, 1] and no
    mean/std normalization is applied.

    Args:
        batch_size: Batch size used by all three loaders.
        val_ratio: Fraction of the official training split held out for validation.
        seed: Seed for the train/validation split.

    Returns:
        (train_loader, val_loader, test_loader)
    """
    to_tensor_only = transforms.Compose([transforms.ToTensor()])

    official_train = datasets.FashionMNIST(
        root="./data", train=True, download=True, transform=to_tensor_only
    )
    test_dataset = datasets.FashionMNIST(
        root="./data", train=False, download=True, transform=to_tensor_only
    )

    # Seeded split of the official training data into train and validation.
    n_total = len(official_train)
    n_val = int(val_ratio * n_total)
    n_train = n_total - n_val
    split_rng = torch.Generator().manual_seed(seed)
    train_dataset, val_dataset = random_split(
        official_train, [n_train, n_val], generator=split_rng
    )

    def build_loader(dataset, shuffle):
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=2)

    # Only the training loader shuffles.
    return (
        build_loader(train_dataset, True),
        build_loader(val_dataset, False),
        build_loader(test_dataset, False),
    )

# "Unregularized" in these names means un-normalized inputs (ToTensor only)
train_loader_unreg, val_loader_unreg, test_loader_unreg = get_unregularized_fashion_mnist_loaders()

# Flatten the train and test splits into NumPy arrays for the NumPy MLP
X_train_unreg, y_train_unreg = convert_loader_to_np(train_loader_unreg)
y_train_onehot_unreg = labels_to_onehot(y_train_unreg)

X_test_unreg, y_test_unreg = convert_loader_to_np(test_loader_unreg)

# Same architecture and hyper-parameters as Model 3 in 3.1
model_unreg = MLP(dims=[784, 256, 256, 10], activation_fns=[relu, relu, softmax])
model_unreg.fit(X_train_unreg, y_train_onehot_unreg, batch_size=64, lr=0.0001, epochs=10)

# Accuracy on the un-normalized train and test data
train_acc_unreg = model_unreg.evaluate_acc(X_train_unreg, y_train_unreg)
test_acc_unreg = model_unreg.evaluate_acc(X_test_unreg, y_test_unreg)

print(f"Unregularized Model - Train Accuracy: {train_acc_unreg:.4f}, Test Accuracy: {test_acc_unreg:.4f}")

# Baseline: Model 3 from 3.1, trained on normalized inputs
print(f"Regularized Model (from 3.1) - Test Accuracy: {test_acc3:.4f}")

Python(35611) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35612) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35613) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35614) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Processing epoch 1...
Unregularized Model - Train Accuracy: 0.7549, Test Accuracy: 0.7432
Regularized Model (from 3.1) - Test Accuracy: 0.7779


3.4 Analysis:
The model trained on unnormalized inputs (called "unregularized" in the code) performed worse than the one trained on normalized inputs. This is expected, as normalizing the data improves the dataset overall, which improves the training quality.


In [10]:
# 3.5 - Data Augmentation

def get_augmented_fashion_mnist_loaders(batch_size=128, val_ratio=0.2, seed=551):
    """
    Load FashionMNIST with data augmentation applied to the training subset only.

    The official training split is opened twice, once with the augmenting
    transform and once with the plain one. A single seeded ``random_split``
    picks the train/validation indices; the training subset reads from the
    augmented view and the validation subset reads the same held-out indices
    from the plain view. Splitting the augmented dataset directly would
    augment the validation images as well.

    Args:
        batch_size: Batch size used by all three loaders.
        val_ratio: Fraction of the official training split held out for validation.
        seed: Seed for the train/validation split.

    Returns:
        (train_loader, val_loader, test_loader)
    """
    # Local import: Subset is only needed here.
    from torch.utils.data import Subset

    # Normalization statistics from the un-augmented training data
    mean, std = compute_fashion_mnist_mean_std()

    # Augmentation transforms for training
    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(p=0.5),      # Random flip
        transforms.RandomRotation(10),                # Random rotation ±10 degrees
        transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),  # Random translation
        transforms.ToTensor(),
        transforms.Normalize((mean,), (std,))
    ])

    # Standard transforms for validation/test (no augmentation)
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((mean,), (std,))
    ])

    augmented_train = datasets.FashionMNIST(
        root="./data",
        train=True,
        download=True,
        transform=train_transform
    )

    # Same images as `augmented_train`, without augmentation (for validation).
    plain_train = datasets.FashionMNIST(
        root="./data",
        train=True,
        download=True,
        transform=test_transform
    )

    test_dataset = datasets.FashionMNIST(
        root="./data",
        train=False,
        download=True,
        transform=test_transform
    )

    # Split train into train/val
    total_train = len(augmented_train)
    val_size = int(val_ratio * total_train)
    train_size = total_train - val_size

    generator = torch.Generator().manual_seed(seed)
    train_dataset, val_split = random_split(
        augmented_train,
        [train_size, val_size],
        generator=generator
    )
    # Re-point the held-out indices at the un-augmented view.
    val_dataset = Subset(plain_train, val_split.indices)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2
    )

    return train_loader, val_loader, test_loader

# Build the loaders for the augmentation experiment
train_loader_aug, val_loader_aug, test_loader_aug = get_augmented_fashion_mnist_loaders()

# A single pass over each loader is materialized as NumPy arrays
X_train_aug, y_train_aug = convert_loader_to_np(train_loader_aug)
y_train_onehot_aug = labels_to_onehot(y_train_aug)

X_test_aug, y_test_aug = convert_loader_to_np(test_loader_aug)

# Same architecture as Model 3 in 3.1, trained for 40 epochs
model_aug = MLP(dims=[784, 256, 256, 10], activation_fns=[relu, relu, softmax])
model_aug.fit(X_train_aug, y_train_onehot_aug, batch_size=64, lr=0.0001, epochs=40)

# Accuracy on the materialized train and test arrays
train_acc_aug = model_aug.evaluate_acc(X_train_aug, y_train_aug)
test_acc_aug = model_aug.evaluate_acc(X_test_aug, y_test_aug)

print(f"Augmented Model - Train Accuracy: {train_acc_aug:.4f}, Test Accuracy: {test_acc_aug:.4f}")
print(f"Regular Model (from 3.1) - Train Accuracy: {train_acc3:.4f}, Test Accuracy: {test_acc3:.4f}")

Python(36117) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(36118) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(36119) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(36120) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Processing epoch 1...
Processing epoch 11...
Processing epoch 21...


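3.5 (UNRESOLVED): the model trained on augmented data does worse than the 3.1 baseline. Note that `convert_loader_to_np` iterates the training loader only once, so the training set is a single randomly augmented copy of each image with no un-augmented originals, and train accuracy is measured on those augmented images. This may explain the gap. TODO: investigate.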