In [1]:
import torch
from torch import nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.datasets import mnist
from copy import deepcopy

In [2]:
#Hyperparameters
batch_size = 64
num_epochs = 500
hidden_layers = [128, 64]
act_fn = 's' #sigmoid or relu
lr = 0.01

#Loading MNIST

In [3]:
(train_X, train_y), (test_X, test_y) = mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [4]:
val_start = int(train_X.shape[0] * 0.9)
val_X = train_X[val_start:]
val_y = train_y[val_start:]
train_X = train_X[:val_start]
train_y = train_y[:val_start]
print(train_X.shape)
print(val_X.shape)

(54000, 28, 28)
(6000, 28, 28)


In [5]:
train_X = train_X.reshape((train_X.shape[0], -1))
test_X = test_X.reshape((test_X.shape[0], -1))
val_X = val_X.reshape((val_X.shape[0], -1))
train_X = train_X.astype('float32') / 255
test_X = test_X.astype('float32') / 255
val_X = val_X.astype('float32') / 255

train_y = to_categorical(train_y, num_classes=10)
val_y = to_categorical(val_y, num_classes=10)

In [6]:
def batch_generator(X, y, batch_size=batch_size):
    """Generate batches of data."""
    n_samples = X.shape[0]
    # Loop over the data in increments of batch_size
    for start in range(0, n_samples, batch_size):
        end = min(start + batch_size, n_samples)
        yield X[start:end], y[start:end]

#MLP Model

In [7]:
#Math Functions

def relu(z):
  return np.maximum(0, z)

def relu_deriv(z):
  return (z>0).astype(float)

def sigmoid(z):
  return 1 / (1 + np.exp(-z))

def sigmoid_deriv(z):
  s = sigmoid(z)
  return s * (1 - s)

def softmax(z):
  z_max = np.max(z, axis=1, keepdims=True)
  e_z = np.exp(z - z_max)
  return e_z / np.sum(e_z, axis=1, keepdims=True)

def celoss(pred, tru):
  loss = -np.sum(tru * np.log(pred))
  return loss

def celoss_deriv(a, y):
  return a - y

In [8]:
class MLP:
  def __init__(self):
    self.w = {}
    self.b = {}
    self.layer_sizes = [28*28] + hidden_layers + [10]

    #initialize weights
    for i in range(len(self.layer_sizes) - 1):

      n_in = self.layer_sizes[i]
      n_out = self.layer_sizes[i + 1]

      if act_fn == 's':
        #Xavier initialization
        self.act = sigmoid
        self.act_deriv = sigmoid_deriv
        weight_matrix = np.random.randn(n_in, n_out)  * np.sqrt(2 / (n_in + n_out))
      else:
        #He initialization
        self.act = relu
        self.act_deriv = relu_deriv
        weight_matrix = np.random.randn(n_in, n_out)  * np.sqrt(2 / n_in)

      self.w[i] = (weight_matrix)
      self.b[i] = (np.zeros((1, n_out)))


  def forward(self, x):
    act = self.act
    z = {}
    a = {}
    a[0] = x

    for i in range(len(self.layer_sizes) - 1):
      idx = i+1
      if idx == len(self.layer_sizes) - 1:
        act = softmax
      z[idx] = a[i] @ self.w[i] + self.b[i]
      a[idx] = act(z[idx])
    return z, a

  def backprop(self, z, a, y):
    dz3 = celoss_deriv(a[3], y) #(B, 10)
    db2 = np.mean(dz3, axis = 0) #(B, 10)
    dw2 = a[2].T @ dz3 #(64,10)
    dw2/= y.shape[0] #(64,10)

    self.b[2] -= lr * db2
    self.w[2] -= lr * dw2

    da2 = dz3 @ self.w[2].T #(B, 64)
    dz2 = np.multiply(self.act_deriv(z[2]), da2) #(B, 64)

    db1 = np.mean(dz2, axis=0) #(B, 64)
    dw1 = a[1].T @ dz2 #(128, 64)
    dw1/= y.shape[0]

    self.b[1] -= lr * db1
    self.w[1] -= lr * dw1

    da1 = dz2 @ self.w[1].T #(B, 128)
    dz1 = np.multiply(self.act_deriv(z[1]), da1) #(B, 128)

    db0 = np.mean(dz1, axis=0) #(B, 128)
    dw0 = a[0].T @ dz1 #(784, 128)
    dw0 /= y.shape[0]

    self.b[0] -= lr * db0
    self.w[0] -= lr * dw0




In [9]:
#training
model = MLP()

best_val_loss = np.inf
patience = 5

# Variables to hold the best model state
best_w = None
best_b = None

for epoch in range(num_epochs):

  indices = np.arange(train_X.shape[0])
  np.random.shuffle(indices)

  train_X = train_X[indices]
  train_y = train_y[indices]

  running_loss = 0
  for X_batch, y_batch in batch_generator(train_X, train_y, batch_size):

    z,a = model.forward(X_batch)
    running_loss += celoss(a[3], y_batch)
    model.backprop(z, a, y_batch)

  val_loss = 0
  for X_batch, y_batch in batch_generator(val_X, val_y, batch_size):
    z,a = model.forward(X_batch)
    val_loss += celoss(a[3], y_batch)

  if (epoch+1)%10 == 0 or epoch == 0:
    print(f"Epoch {epoch+1}/{num_epochs} done. Train loss: {running_loss/train_X.shape[0]}, Val loss: {val_loss/val_X.shape[0]}")

  if val_loss/val_X.shape[0] < best_val_loss:
    best_val_loss = val_loss/val_X.shape[0]
    best_w = deepcopy(model.w)
    best_b = deepcopy(model.b)
    epochs_no_improve = 0
  else:
    epochs_no_improve += 1
    print(f"No improvement in validation loss for {epochs_no_improve} epochs. At Epoch {epoch+1}/{num_epochs}.")

  if epochs_no_improve == patience:
    print("Early Stopping Triggered!")
    break

model.w = best_w
model.b = best_b

Epoch 10/500 done. Train loss: 0.6759495455081844, Val loss: 0.577732324012872
Epoch 20/500 done. Train loss: 0.4023590900289391, Val loss: 0.3335779931852674
Epoch 30/500 done. Train loss: 0.3368603682415012, Val loss: 0.2779079500287303
Epoch 40/500 done. Train loss: 0.3023537242929158, Val loss: 0.24940889368148328
Epoch 50/500 done. Train loss: 0.2776805312650549, Val loss: 0.22881294471760288
Epoch 60/500 done. Train loss: 0.2575754850915566, Val loss: 0.21197445994272723
Epoch 70/500 done. Train loss: 0.24001553275105023, Val loss: 0.19786381134378087
Epoch 80/500 done. Train loss: 0.22399133010078218, Val loss: 0.18479183971776839
Epoch 90/500 done. Train loss: 0.20964426925082255, Val loss: 0.17389427256491619
Epoch 100/500 done. Train loss: 0.19654072057157781, Val loss: 0.16350662913021316
Epoch 110/500 done. Train loss: 0.18447714711238805, Val loss: 0.15485020124615118
Epoch 120/500 done. Train loss: 0.1735942680689455, Val loss: 0.14647707054447015
Epoch 130/500 done. Trai

In [10]:
#testing
num_correct = 0
num_samples = 0
for X_batch, y_batch in batch_generator(test_X, test_y, batch_size):
  z,a = model.forward(X_batch)
  predictions = np.argmax(a[3], axis = 1)
  num_correct += np.sum(predictions == y_batch)
  num_samples += y_batch.shape[0]

accuracy = num_correct/ num_samples
print(f"Model Accuracy: {accuracy*100:.2f}%")



Model Accuracy: 97.53%
