In [1]:
import torch
from torch import nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.datasets import mnist
from copy import deepcopy

In [2]:
"""!pip install wandb
import wandb
wandb.login()"""

Collecting wandb
  Downloading wandb-0.17.5-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.11.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading wandb-0.17.5-py3-none-man

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

#Loading MNIST

In [3]:
(train_X, train_y), (test_X, test_y) = mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [4]:
val_start = int(train_X.shape[0] * 0.9)
val_X = train_X[val_start:]
val_y = train_y[val_start:]
train_X = train_X[:val_start]
train_y = train_y[:val_start]
print(train_X.shape)
print(val_X.shape)

(54000, 28, 28)
(6000, 28, 28)


In [5]:
train_X = train_X.reshape((train_X.shape[0], -1))
test_X = test_X.reshape((test_X.shape[0], -1))
val_X = val_X.reshape((val_X.shape[0], -1))
train_X = train_X.astype('float32') / 255
test_X = test_X.astype('float32') / 255
val_X = val_X.astype('float32') / 255

train_y = to_categorical(train_y, num_classes=10)
val_y = to_categorical(val_y, num_classes=10)

In [6]:
#Hyperparameters
batch_size = 32
num_epochs = 300
hidden_layers = [512, 256, 128]
act_fn = 'r' #'s' for sigmoid, 'r' for ReLU
lr = 0.01
mr = 0.9

In [7]:
def batch_generator(X, y, batch_size=batch_size):
    """Generate batches of data."""
    n_samples = X.shape[0]
    # Loop over the data in increments of batch_size
    for start in range(0, n_samples, batch_size):
        end = min(start + batch_size, n_samples)
        yield X[start:end], y[start:end]

#MLP Model

In [8]:
#Math Functions

def relu(z):
  return np.maximum(0, z)

def relu_deriv(z):
  return (z>0).astype(float)

def sigmoid(z):
  return 1 / (1 + np.exp(-z))

def sigmoid_deriv(z):
  s = sigmoid(z)
  return s * (1 - s)

def softmax(z):
  z_max = np.max(z, axis=1, keepdims=True)
  e_z = np.exp(z - z_max)
  return e_z / np.sum(e_z, axis=1, keepdims=True)

def celoss(pred, tru):
  loss = -np.sum(tru * np.log(pred))
  return loss

def celoss_deriv(a, y):
  return a - y

In [9]:
class MLP:
  def __init__(self):
    self.w = {}
    self.b = {}
    self.layer_sizes = [28*28] + hidden_layers + [10]
    self.prev_dw = {}
    self.prev_db = {}

    #initialize weights
    for i in range(len(self.layer_sizes) - 1):

      n_in = self.layer_sizes[i]
      n_out = self.layer_sizes[i + 1]

      if act_fn == 's':
        #Xavier initialization
        self.act = sigmoid
        self.act_deriv = sigmoid_deriv
        weight_matrix = np.random.randn(n_in, n_out)  * np.sqrt(2 / (n_in + n_out))
      else:
        #He initialization
        self.act = relu
        self.act_deriv = relu_deriv
        weight_matrix = np.random.randn(n_in, n_out)  * np.sqrt(2 / n_in)

      self.w[i] = (weight_matrix)
      self.b[i] = (np.zeros((1, n_out)))


  def forward(self, x):
    act = self.act
    z = {}
    a = {}
    a[0] = x

    for i in range(len(self.layer_sizes) - 1):
      idx = i+1
      if idx == len(self.layer_sizes) - 1:
        act = softmax
      z[idx] = a[i] @ self.w[i] + self.b[i]
      a[idx] = act(z[idx])

    return z, a

  def backprop(self, z, a, y):

    dz = None

    for i in range(len(self.layer_sizes)-1):
      idx = len(self.layer_sizes) - 2 - i

      #softmax + crossentropy derivative
      if i == 0:
        dz = celoss_deriv(a[idx + 1], y) #(B, 10)

      db = np.mean(dz, axis = 0)
      dw = a[idx].T @ dz
      dw /= batch_size

      if idx in self.prev_db:
        self.b[idx] -= lr * db + mr * self.prev_db[idx]
        self.w[idx] -= lr * dw + mr * self.prev_dw[idx]

        self.prev_db[idx] = lr * db + mr * self.prev_db[idx]
        self.prev_dw[idx] = lr * dw + mr * self.prev_dw[idx]

      else:
        self.b[idx] -= lr * db
        self.w[idx] -= lr * dw

        self.prev_db[idx] = lr * db
        self.prev_dw[idx] = lr * dw



      if idx != 0:
        da = dz @ self.w[idx].T
        dz = np.multiply(self.act_deriv(z[idx]), da)





In [10]:

"""wandb.init(
      # Set the project where this run will be logged
      project="MLP MNIST report",
      name=f"{mr}",
      # Track hyperparameters and run metadata
      config={
      "activation_fn": act_fn,
      "hidden_layers": hidden_layers,
      "learning_rate": lr,
      "epochs": num_epochs,
      "Momentum rate:": mr
      })"""


#training
model = MLP()

best_val_loss = np.inf
patience = 5

# Variables to hold the best model state
best_w = None
best_b = None

for epoch in range(num_epochs):

  indices = np.arange(train_X.shape[0])
  np.random.shuffle(indices)

  train_X = train_X[indices]
  train_y = train_y[indices]

  running_loss = 0

  for X_batch, y_batch in batch_generator(train_X, train_y, batch_size):

    z,a = model.forward(X_batch)
    running_loss += celoss(a[len(a)-1], y_batch)
    model.backprop(z, a, y_batch)

  val_loss = 0
  for X_batch, y_batch in batch_generator(val_X, val_y, batch_size):
    z,a = model.forward(X_batch)
    val_loss += celoss(a[len(a)-1], y_batch)

  #wandb.log({"Train loss": running_loss/train_X.shape[0], "Val loss": val_loss/val_X.shape[0]})

  print(f"Epoch {epoch+1}/{num_epochs} done. Train loss: {running_loss/train_X.shape[0]}, Val loss: {val_loss/val_X.shape[0]}")


  if val_loss/val_X.shape[0] < best_val_loss:
    best_val_loss = val_loss/val_X.shape[0]
    best_w = deepcopy(model.w)
    best_b = deepcopy(model.b)
    epochs_no_improve = 0
  else:
    epochs_no_improve += 1
    print(f"No improvement in validation loss for {epochs_no_improve} epochs. At Epoch {epoch+1}/{num_epochs}.")

  if epochs_no_improve == patience:
    print("Early Stopping Triggered!")
    break

model.w = best_w
model.b = best_b
#testing
num_correct = 0
num_samples = 0
for X_batch, y_batch in batch_generator(test_X, test_y, batch_size):
  z,a = model.forward(X_batch)
  predictions = np.argmax(a[len(a)-1], axis = 1)
  num_correct += np.sum(predictions == y_batch)
  num_samples += y_batch.shape[0]

accuracy = num_correct/num_samples
print(f"Model Accuracy: {accuracy*100:.2f}%")
#wandb.log({"Test Accuracy": accuracy*100})
#wandb.finish()

Epoch 1/300 done. Train loss: 0.2499174865422391, Val loss: 0.11736220066655265
Epoch 2/300 done. Train loss: 0.09943345525478614, Val loss: 0.07670907451823245
Epoch 3/300 done. Train loss: 0.06426260985111101, Val loss: 0.0739959684863432
Epoch 4/300 done. Train loss: 0.04340313563868715, Val loss: 0.08476602565743493
No improvement in validation loss for 1 epochs. At Epoch 4/300.
Epoch 5/300 done. Train loss: 0.03381901205409014, Val loss: 0.08583889845067426
No improvement in validation loss for 2 epochs. At Epoch 5/300.
Epoch 6/300 done. Train loss: 0.021767626242125936, Val loss: 0.07812265469311967
No improvement in validation loss for 3 epochs. At Epoch 6/300.
Epoch 7/300 done. Train loss: 0.019164980913186195, Val loss: 0.0731733007376594
Epoch 8/300 done. Train loss: 0.0152209462356251, Val loss: 0.0846726926244773
No improvement in validation loss for 1 epochs. At Epoch 8/300.
Epoch 9/300 done. Train loss: 0.009307057416810868, Val loss: 0.07360747764052031
No improvement in