In [1]:
import numpy as np
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch
import wandb

random.seed(27)
np.random.seed(27)
plt.style.use('ggplot')

# Section 1: Pilot study of subset, used to find feasible hyperparameters

### Dataset - A (1/10) subset of cifar

We will choose a subset of the cifar10, and make sure there is class balance

In [2]:
ds = "cifar10"
transform = transforms.Compose([transforms.ToTensor()])

if ds == "cifar10":
    train_ds = datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
    test_ds = datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)

    X_train = train_ds.data.astype(np.float32) / 255.0   
    y_train = np.array(train_ds.targets, dtype=np.int64)

    X_test = test_ds.data.astype(np.float32) / 255.0
    y_test = np.array(test_ds.targets, dtype=np.int64)

    # flatten to (N, D)
    X_train = X_train.reshape(X_train.shape[0], -1)
    X_test = X_test.reshape(X_test.shape[0], -1)

elif ds == "mnist":
    mnist_train = datasets.MNIST(root="./data", train=True, download=True)
    mnist_test = datasets.MNIST(root="./data", train=False, download=True)
    X_train = mnist_train.data.numpy().astype(np.float32) / 255.0
    y_train = mnist_train.targets.numpy().astype(np.int64)
    X_test  = mnist_test.data.numpy().astype(np.float32) / 255.0
    y_test  = mnist_test.targets.numpy().astype(np.int64)
    X_train = X_train.reshape(X_train.shape[0], -1)
    X_test  = X_test.reshape(X_test.shape[0], -1)



subset_ratio = 0.2      #20% of each class
num_classes_total = 10  
num_classes = 3         

def balanced_subset(X, y, ratio, classes_to_take):
    indices = []
    for c in range(classes_to_take):
        class_idx = np.where(y == c)[0]
        n_samples = max(1, int(len(class_idx) * ratio))
        chosen = np.random.choice(class_idx, n_samples, replace=False)
        indices.append(chosen)
    indices = np.concatenate(indices)
    np.random.shuffle(indices)
    return X[indices], y[indices]

X_train, y_train = balanced_subset(X_train, y_train, subset_ratio, num_classes)
X_test,  y_test  = balanced_subset(X_test,  y_test,  subset_ratio, num_classes)

def one_hot_encode(y, num_classes):
    y = np.asarray(y, dtype=int)
    m = y.size
    Y = np.zeros((num_classes, m), dtype=float)
    Y[y, np.arange(m)] = 1.0
    return Y

Y_train = one_hot_encode(y_train, num_classes)
Y_test = one_hot_encode(y_test, num_classes)

print("Shapes:", X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

Shapes: (3000, 3072) (3, 3000) (600, 3072) (3, 600)


### Pilot study - we search for a feasible range of hyperparameters using sweeps

In [3]:
from NN.denseLayer import *
from NN.loss import *
from NN.optimizer import *
from NN.activations import *

> Setup WANDB tracker, where we will perform a simple grid search of a VERY broad search

In [4]:
import matplotlib.pyplot as plt
import os
import wandb
from getAPI import retrieveApi
from sklearn.model_selection import KFold

os.environ['WANDB_API_KEY'] = retrieveApi()

> Do hyperparameter sweep, on a very small subset of cifar-10

The code is based on the tutorials given in the sweeps documentation:
- https://docs.wandb.ai/models/sweeps/define-sweep-configuration#python-script-or-notebook
- https://docs.wandb.ai/models/tutorials/sweeps

We are doing a random search, where we try and get an idea of what values work and what does not, maybe we can rule out some ranges, which will make the coarse hyperparameter search better

In [5]:
def accuracy_from_probs(A, Y_true_onehot):
    preds = A.argmax(axis=0)
    truths = Y_true_onehot.argmax(axis=0)
    return float((preds == truths).mean())

def get_activation_instance(name):
    if name == "relu":
        return ReLU()
    elif name == "tanh":
        return Tanh()
    elif name == "sigmoid":
        return Sigmoid()

def get_optimizer(name, lr):
    if name == "sgd":
        return SGD(learning_rate=lr)
    elif name == "adam":
        return Adam(learning_rate=lr)


def build_layers(input_dim, cfg):

    layers = []
    prev = input_dim

    for _ in range(int(cfg['num_hidden_layers'])):
        act = get_activation_instance(cfg['activation'])
        layers.append(DenseLayer(prev, int(cfg['n_hidden_units']), activation=act,
                                 initializer=HeInitializer(), l2_coeff=float(cfg.get('l2_coeff', 0.0))))
        prev = int(cfg['n_hidden_units'])

    # final layer with Softmax and small-normal initializer
    layers.append(DenseLayer(prev, int(cfg['num_classes']), activation=Softmax(),
                             initializer=NormalInitializer(mean=0.0, std=0.01), l2_coeff=0.0))
    return layers


sweep_configuration = {
    "method": "random",
    "metric": {"name": "mean_cv_accuracy", "goal": "maximize"},
    "parameters": {
        "learning_rate": {"distribution": "uniform", "min": 1e-6, "max": 1.0},
        "batch_size": {"values": [32, 64, 128]},
        "epochs": {"value": 30},
        "num_hidden_layers": {"distribution": "int_uniform", "min": 1, "max": 5},
        "n_hidden_units": {"distribution": "int_uniform", "min": 32, "max": 128},
        "l2_coeff": {"distribution": "uniform", "min": 0.0, "max": 0.001},
        "optimizer": {"values": ["sgd", "adam"]},           
        "activation": {"values": ["relu", "tanh", "sigmoid"]},
        "num_classes": {"value": 3},                 
    },
}

# create the sweep (do this once; it returns a sweep_id)
sweep_id = wandb.sweep(sweep=sweep_configuration, project="numpy_nn")



def train_on_split(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, cfg):
    """
    Train a model on one train/val split.
    Returns final validation accuracy.
    """

    layers = build_layers(X_train_fold.shape[1], cfg)
    optimizer = get_optimizer(cfg['optimizer'], float(cfg['learning_rate']))
    loss_fn = CrossEntropyLoss()

    m = X_train_fold.shape[0]
    epochs = int(cfg["epochs"])
    batch_size = int(cfg["batch_size"])

    for epoch in range(epochs):
        perm = np.random.permutation(m)

        Xs = X_train_fold[perm].T     # (in_dim, m)
        Ys = Y_train_fold[:, perm]    # (num_classes, m)

        for i in range(0, m, batch_size):
            Xb = Xs[:, i:i+batch_size]
            Yb = Ys[:, i:i+batch_size]

            # forward
            A = Xb
            for layer in layers:
                A = layer.forward(A)

            # loss + backward
            loss = loss_fn.forward(A, Yb)
            dA = loss_fn.backward(A, Yb)

            # backprop
            for layer in reversed(layers):
                dA, dW, db = layer.backward(dA)
                optimizer.update(layer, dW, db)

    # ---- compute validation accuracy ----
    A = X_val_fold.T
    for layer in layers:
        A = layer.forward(A)

    val_acc = accuracy_from_probs(A, Y_val_fold)
    return val_acc


from sklearn.model_selection import KFold

def train_on_split(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, cfg):
    """
    Train a model on one train/val split.
    Returns final validation accuracy.
    """

    layers = build_layers(X_train_fold.shape[1], cfg)
    optimizer = get_optimizer(cfg['optimizer'], float(cfg['learning_rate']))
    loss_fn = CrossEntropyLoss()

    m = X_train_fold.shape[0]
    epochs = int(cfg["epochs"])
    batch_size = int(cfg["batch_size"])

    for epoch in range(epochs):
        perm = np.random.permutation(m)

        Xs = X_train_fold[perm].T     # (in_dim, m)
        Ys = Y_train_fold[:, perm]    # (num_classes, m)

        for i in range(0, m, batch_size):
            Xb = Xs[:, i:i+batch_size]
            Yb = Ys[:, i:i+batch_size]

            # forward
            A = Xb
            for layer in layers:
                A = layer.forward(A)

            # loss + backward
            loss = loss_fn.forward(A, Yb)
            dA = loss_fn.backward(A, Yb)

            # backprop
            for layer in reversed(layers):
                dA, dW, db = layer.backward(dA)
                optimizer.update(layer, dW, db)

    # ---- compute validation accuracy ----
    A = X_val_fold.T
    for layer in layers:
        A = layer.forward(A)

    val_acc = accuracy_from_probs(A, Y_val_fold)
    return val_acc

def train_run():

    with wandb.init() as run:
        cfg = dict(run.config)
        run.log(dict(cfg))

        kf = KFold(n_splits=3, shuffle=True, random_state=42)
        fold_accuracies = []

        # K-fold loop
        for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X_train)):

            X_train_fold = X_train[train_idx]
            Y_train_fold = Y_train[:, train_idx]

            X_val_fold = X_train[val_idx]
            Y_val_fold = Y_train[:, val_idx]

            val_acc = train_on_split(
                X_train_fold, Y_train_fold,
                X_val_fold, Y_val_fold,
                cfg
            )

            fold_accuracies.append(val_acc)
            run.log({f"fold_{fold_idx+1}_accuracy": float(val_acc)})

        # final metric for sweep
        mean_cv_acc = float(np.mean(fold_accuracies))
        run.log({"mean_cv_accuracy": mean_cv_acc})

        print(f"Run {run.id}  CV Accuracies={fold_accuracies}  Mean={mean_cv_acc:.4f}")

        wandb.finish()

wandb.agent(sweep_id, function=train_run, count=10)

Create sweep with ID: rhd00nb0
Sweep URL: https://wandb.ai/xanderbaatz-danmarks-tekniske-universitet-dtu/numpy_nn/sweeps/rhd00nb0


[34m[1mwandb[0m: Agent Starting Run: 14sqivwa with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	l2_coeff: 0.0008442830132521463
[34m[1mwandb[0m: 	learning_rate: 0.5917482247650961
[34m[1mwandb[0m: 	n_hidden_units: 110
[34m[1mwandb[0m: 	num_classes: 3
[34m[1mwandb[0m: 	num_hidden_layers: 2
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: Currently logged in as: [33mglymse[0m ([33mxanderbaatz-danmarks-tekniske-universitet-dtu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  self.A = 1 / (1 + np.exp(-Z))
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run 14sqivwa  CV Accuracies=[0.34, 0.35, 0.343]  Mean=0.3443


0,1
batch_size,▁
epochs,▁
fold_1_accuracy,▁
fold_2_accuracy,▁
fold_3_accuracy,▁
l2_coeff,▁
learning_rate,▁
mean_cv_accuracy,▁
n_hidden_units,▁
num_classes,▁

0,1
activation,sigmoid
batch_size,32
epochs,30
fold_1_accuracy,0.34
fold_2_accuracy,0.35
fold_3_accuracy,0.343
l2_coeff,0.00084
learning_rate,0.59175
mean_cv_accuracy,0.34433
n_hidden_units,110


[34m[1mwandb[0m: Agent Starting Run: gs64r0rz with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	l2_coeff: 0.00013821769179680633
[34m[1mwandb[0m: 	learning_rate: 0.3673393167973977
[34m[1mwandb[0m: 	n_hidden_units: 88
[34m[1mwandb[0m: 	num_classes: 3
[34m[1mwandb[0m: 	num_hidden_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run gs64r0rz  CV Accuracies=[0.638, 0.601, 0.58]  Mean=0.6063


0,1
batch_size,▁
epochs,▁
fold_1_accuracy,▁
fold_2_accuracy,▁
fold_3_accuracy,▁
l2_coeff,▁
learning_rate,▁
mean_cv_accuracy,▁
n_hidden_units,▁
num_classes,▁

0,1
activation,relu
batch_size,128
epochs,30
fold_1_accuracy,0.638
fold_2_accuracy,0.601
fold_3_accuracy,0.58
l2_coeff,0.00014
learning_rate,0.36734
mean_cv_accuracy,0.60633
n_hidden_units,88


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: hh74wgc8 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	l2_coeff: 0.000836407654860607
[34m[1mwandb[0m: 	learning_rate: 0.7970768969206561
[34m[1mwandb[0m: 	n_hidden_units: 73
[34m[1mwandb[0m: 	num_classes: 3
[34m[1mwandb[0m: 	num_hidden_layers: 1
[34m[1mwandb[0m: 	optimizer: sgd


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run hh74wgc8  CV Accuracies=[0.609, 0.556, 0.624]  Mean=0.5963


0,1
batch_size,▁
epochs,▁
fold_1_accuracy,▁
fold_2_accuracy,▁
fold_3_accuracy,▁
l2_coeff,▁
learning_rate,▁
mean_cv_accuracy,▁
n_hidden_units,▁
num_classes,▁

0,1
activation,sigmoid
batch_size,64
epochs,30
fold_1_accuracy,0.609
fold_2_accuracy,0.556
fold_3_accuracy,0.624
l2_coeff,0.00084
learning_rate,0.79708
mean_cv_accuracy,0.59633
n_hidden_units,73


[34m[1mwandb[0m: Agent Starting Run: 86ns4cp0 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	l2_coeff: 0.00039597101013171855
[34m[1mwandb[0m: 	learning_rate: 0.5268014390594281
[34m[1mwandb[0m: 	n_hidden_units: 69
[34m[1mwandb[0m: 	num_classes: 3
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run 86ns4cp0  CV Accuracies=[0.425, 0.579, 0.669]  Mean=0.5577


0,1
batch_size,▁
epochs,▁
fold_1_accuracy,▁
fold_2_accuracy,▁
fold_3_accuracy,▁
l2_coeff,▁
learning_rate,▁
mean_cv_accuracy,▁
n_hidden_units,▁
num_classes,▁

0,1
activation,relu
batch_size,32
epochs,30
fold_1_accuracy,0.425
fold_2_accuracy,0.579
fold_3_accuracy,0.669
l2_coeff,0.0004
learning_rate,0.5268
mean_cv_accuracy,0.55767
n_hidden_units,69


[34m[1mwandb[0m: Agent Starting Run: fzakga3d with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	l2_coeff: 0.0006699626405953712
[34m[1mwandb[0m: 	learning_rate: 0.528381200702156
[34m[1mwandb[0m: 	n_hidden_units: 99
[34m[1mwandb[0m: 	num_classes: 3
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: adam


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run fzakga3d  CV Accuracies=[0.34, 0.317, 0.33]  Mean=0.3290


0,1
batch_size,▁
epochs,▁
fold_1_accuracy,▁
fold_2_accuracy,▁
fold_3_accuracy,▁
l2_coeff,▁
learning_rate,▁
mean_cv_accuracy,▁
n_hidden_units,▁
num_classes,▁

0,1
activation,relu
batch_size,64
epochs,30
fold_1_accuracy,0.34
fold_2_accuracy,0.317
fold_3_accuracy,0.33
l2_coeff,0.00067
learning_rate,0.52838
mean_cv_accuracy,0.329
n_hidden_units,99


[34m[1mwandb[0m: Agent Starting Run: 2bai9irb with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	l2_coeff: 0.00017238405670867197
[34m[1mwandb[0m: 	learning_rate: 0.6261509735028913
[34m[1mwandb[0m: 	n_hidden_units: 82
[34m[1mwandb[0m: 	num_classes: 3
[34m[1mwandb[0m: 	num_hidden_layers: 4
[34m[1mwandb[0m: 	optimizer: adam


  self.A = 1 / (1 + np.exp(-Z))
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run 2bai9irb  CV Accuracies=[0.34, 0.333, 0.327]  Mean=0.3333


0,1
batch_size,▁
epochs,▁
fold_1_accuracy,▁
fold_2_accuracy,▁
fold_3_accuracy,▁
l2_coeff,▁
learning_rate,▁
mean_cv_accuracy,▁
n_hidden_units,▁
num_classes,▁

0,1
activation,sigmoid
batch_size,32
epochs,30
fold_1_accuracy,0.34
fold_2_accuracy,0.333
fold_3_accuracy,0.327
l2_coeff,0.00017
learning_rate,0.62615
mean_cv_accuracy,0.33333
n_hidden_units,82


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 4ax40jlk with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	l2_coeff: 0.0003928648513476744
[34m[1mwandb[0m: 	learning_rate: 0.572117577466804
[34m[1mwandb[0m: 	n_hidden_units: 55
[34m[1mwandb[0m: 	num_classes: 3
[34m[1mwandb[0m: 	num_hidden_layers: 2
[34m[1mwandb[0m: 	optimizer: sgd


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Traceback (most recent call last):
  File "/tmp/ipykernel_79893/424036661.py", line 179, in train_run
    run.log({f"fold_{fold_idx+1}_accuracy": float(val_acc)})
  File "/home/vscode/.local/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 390, in wrapper
    return func(self, *args, **kwargs)
  File "/home/vscode/.local/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 448, in wrapper_fn
    return func(self, *args, **kwargs)
  File "/home/vscode/.local/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 435, in wrapper
    return func(self, *args, **kwargs)
  File "/home/vscode/.local/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 2023, in log
    self._log(data=data, step=step, commit=commit)
  File "/home/vscode/.local/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 1734, in _log
    self._partial_history_callback(data, step, commit)
  File "/home/vscode/.local/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 390, in wrappe