<a href="https://colab.research.google.com/github/ShreyaC15/Multilayer-Perceptron/blob/main/MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IST597: Multi-Layer Perceptron

## Load the libraries

In [None]:
import os
import numpy as np
import time
import tensorflow as tf
import matplotlib.pyplot as plt
# One seed is shared by NumPy and TensorFlow so that a run can be repeated.
seeds = 2785  # Update seeds
tf.random.set_seed(seeds)
np.random.seed(seeds)

In [None]:
# Show the GPUs TensorFlow can see (an empty list means none are visible).
tf.config.list_physical_devices(device_type='GPU')

## Load data and perform pre-processing

In [None]:
# Load the MNIST images and integer labels, already split into train and test.
# To run the same notebook on Fashion-MNIST, use the alternative line instead:
# (X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

In [None]:
# Convert the images to float32 and flatten each one into a single row vector
# of length (dim 1 * dim 2), so every example is one row of a 2-D tensor.
X_train = tf.reshape(
    tf.cast(X_train, dtype=tf.float32),
    (-1, X_train.shape[1] * X_train.shape[2]),
)
X_test = tf.reshape(
    tf.cast(X_test, dtype=tf.float32),
    (-1, X_test.shape[1] * X_test.shape[2]),
)

In [None]:
# Divide every pixel value by 255.0 (rescales the inputs before training).
X_train = tf.divide(X_train, 255.0)
X_test = tf.divide(X_test, 255.0)

In [None]:
# Hold out the last 10,000 training examples as a validation set and remove
# them from the training arrays, so the model is never validated on examples
# it was trained on.
# NOTE: run this cell only once per data load; re-running it would carve
# another 10,000 examples off the training set.
x_val = X_train[-10000:]
y_val = y_train[-10000:]
X_train = X_train[:-10000]
y_train = y_train[:-10000]

In [None]:
# One-hot encode the integer class labels of every split.
y_train, y_test, y_val = (
    tf.keras.utils.to_categorical(labels)
    for labels in (y_train, y_test, y_val)
)

In [None]:
# Wrap each split in a tf.data pipeline and cut it into fixed-size batches
# (100 examples per batch for train/validation, 16 for test).
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_ds = train_ds.batch(100)

validate_ds = tf.data.Dataset.from_tensor_slices((x_val, y_val))
validate_ds = validate_ds.batch(100)

test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_ds = test_ds.batch(16)

In [None]:
# Width of the one-hot label matrix; used as the size of the output layer.
_, label_size = y_train.shape

## Build MLP using Eager Execution

In [None]:
# Define class to build mlp model
class MLP(object):
  """
  Multi-layer perceptron with two ReLU hidden layers and a softmax output,
  trained with plain SGD on categorical cross-entropy under eager execution.
  """

  def __init__(self, size_input, size_hidden1, size_hidden2, size_output, device=None,
               learning_rate=1e-5):
    """
    size_input: int, size of input layer
    size_hidden1: int, size of the first hidden layer
    size_hidden2: int, size of the second hidden layer
    size_output: int, size of output layer
    device: str or None, either 'cpu' or 'gpu' or None. If None, the device to be used will be decided automatically during Eager Execution
    learning_rate: float, SGD step size (default 1e-5, the value that used to be hard-coded in backward)
    """
    self.size_input, self.size_hidden1, self.size_hidden2, self.size_output, self.device =\
    size_input, size_hidden1, size_hidden2, size_output, device
    self.learning_rate = learning_rate

    # NOTE: the six tf.random.normal calls stay in this order so that a fixed
    # global seed keeps producing the same initial parameters.
    # Initialize weights between input layer and hidden layer 1
    self.W1 = tf.Variable(tf.random.normal([self.size_input, self.size_hidden1]))
    # Initialize biases for hidden layer 1
    self.b1 = tf.Variable(tf.random.normal([1, self.size_hidden1]))

    # Initialize weights between hidden layer 1 and hidden layer 2
    self.W2 = tf.Variable(tf.random.normal([self.size_hidden1, self.size_hidden2]))
    # Initialize biases for hidden layer 2
    self.b2 = tf.Variable(tf.random.normal([1, self.size_hidden2]))

    # Initialize weights between hidden layer 2 and output layer
    self.W3 = tf.Variable(tf.random.normal([self.size_hidden2, self.size_output]))
    # Initialize biases for output layer
    self.b3 = tf.Variable(tf.random.normal([1, self.size_output]))

    # Define variables to be updated during backpropagation
    self.variables = [self.W1, self.W2, self.W3, self.b1, self.b2, self.b3]

    # Build the optimizer and the loss object once; they used to be
    # re-created for every batch.
    self.optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    self._cce = tf.keras.losses.CategoricalCrossentropy()

  def forward(self, X):
    """
    forward pass
    X: Tensor, inputs
    Returns the softmax output for X; the result is also stored in self.y.
    """
    if self.device is not None:
      # Any device string other than 'gpu' falls back to the CPU.
      with tf.device('gpu:0' if self.device == 'gpu' else 'cpu'):
        self.y = self.compute_output(X)
    else:
      self.y = self.compute_output(X)
    return self.y

  def loss(self, y_pred, y_true):
    """
    Categorical cross-entropy between predictions and one-hot targets.
    y_pred - Tensor of shape (batch_size, size_output)
    y_true - Tensor of shape (batch_size, size_output)
    """
    # y_pred is the softmax value, so the loss is computed on probabilities
    y_true_tf = tf.cast(y_true, dtype=tf.float32)
    y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
    return self._cce(y_true_tf, y_pred_tf)

  def backward(self, X_train, y_train):
    """
    backward pass: one SGD update of self.variables on the given batch
    X_train: Tensor, batch of inputs
    y_train: Tensor, matching one-hot targets
    """
    with tf.GradientTape() as tape:
      predicted = self.forward(X_train)
      current_loss = self.loss(predicted, y_train)
    grads = tape.gradient(current_loss, self.variables)
    self.optimizer.apply_gradients(zip(grads, self.variables))

  def compute_output(self, X):
    """
    Custom method to obtain output tensor during forward pass
    """
    # Cast X to float32
    X_tf = tf.cast(X, dtype=tf.float32)
    # Compute values in the two hidden layers
    what1 = tf.matmul(X_tf, self.W1) + self.b1
    hhat1 = tf.nn.relu(what1)
    what2 = tf.matmul(hhat1, self.W2) + self.b2
    hhat2 = tf.nn.relu(what2)
    # Compute output
    output = tf.matmul(hhat2, self.W3) + self.b3
    return tf.nn.softmax(output)


In [None]:
#L1 regularization
# Define class to build mlp model
class MLP(object):
  """
  Multi-layer perceptron with two ReLU hidden layers and a softmax output,
  trained with plain SGD on categorical cross-entropy plus an L1 penalty on
  the three weight matrices (biases are not penalized).
  """

  def __init__(self, size_input, size_hidden1, size_hidden2, size_output, device=None,
               learning_rate=1e-5, l1_lambda=1.0):
    """
    size_input: int, size of input layer
    size_hidden1: int, size of the first hidden layer
    size_hidden2: int, size of the second hidden layer
    size_output: int, size of output layer
    device: str or None, either 'cpu' or 'gpu' or None. If None, the device to be used will be decided automatically during Eager Execution
    learning_rate: float, SGD step size (default 1e-5, the value that used to be hard-coded in backward)
    l1_lambda: float, multiplier applied to each mean(|W|) penalty term (default 1.0 reproduces the previously hard-coded penalty)
    """
    self.size_input, self.size_hidden1, self.size_hidden2, self.size_output, self.device =\
    size_input, size_hidden1, size_hidden2, size_output, device
    self.learning_rate = learning_rate
    self.l1_lambda = l1_lambda

    # NOTE: the six tf.random.normal calls stay in this order so that a fixed
    # global seed keeps producing the same initial parameters.
    # Initialize weights between input layer and hidden layer 1
    self.W1 = tf.Variable(tf.random.normal([self.size_input, self.size_hidden1]))
    # Initialize biases for hidden layer 1
    self.b1 = tf.Variable(tf.random.normal([1, self.size_hidden1]))

    # Initialize weights between hidden layer 1 and hidden layer 2
    self.W2 = tf.Variable(tf.random.normal([self.size_hidden1, self.size_hidden2]))
    # Initialize biases for hidden layer 2
    self.b2 = tf.Variable(tf.random.normal([1, self.size_hidden2]))

    # Initialize weights between hidden layer 2 and output layer
    self.W3 = tf.Variable(tf.random.normal([self.size_hidden2, self.size_output]))
    # Initialize biases for output layer
    self.b3 = tf.Variable(tf.random.normal([1, self.size_output]))

    # Define variables to be updated during backpropagation
    self.variables = [self.W1, self.W2, self.W3, self.b1, self.b2, self.b3]

    # Build the optimizer and the loss object once; they used to be
    # re-created for every batch.
    self.optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    self._cce = tf.keras.losses.CategoricalCrossentropy()

  def forward(self, X):
    """
    forward pass
    X: Tensor, inputs
    Returns the softmax output for X; the result is also stored in self.y.
    """
    if self.device is not None:
      # Any device string other than 'gpu' falls back to the CPU.
      with tf.device('gpu:0' if self.device == 'gpu' else 'cpu'):
        self.y = self.compute_output(X)
    else:
      self.y = self.compute_output(X)
    return self.y

  def loss(self, y_pred, y_true):
    """
    Categorical cross-entropy plus the L1 penalty
    l1_lambda * (mean|W1| + mean|W2| + mean|W3|).
    y_pred - Tensor of shape (batch_size, size_output)
    y_true - Tensor of shape (batch_size, size_output)
    """
    # y_pred is the softmax value, so the loss is computed on probabilities
    y_true_tf = tf.cast(y_true, dtype=tf.float32)
    y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
    total = self._cce(y_true_tf, y_pred_tf)
    # Add the penalty terms one at a time, in W1, W2, W3 order.
    for weights in (self.W1, self.W2, self.W3):
      total = total + self.l1_lambda * tf.reduce_mean(tf.abs(weights))
    return total

  def backward(self, X_train, y_train):
    """
    backward pass: one SGD update of self.variables on the given batch
    X_train: Tensor, batch of inputs
    y_train: Tensor, matching one-hot targets
    """
    with tf.GradientTape() as tape:
      predicted = self.forward(X_train)
      current_loss = self.loss(predicted, y_train)
    grads = tape.gradient(current_loss, self.variables)
    self.optimizer.apply_gradients(zip(grads, self.variables))

  def compute_output(self, X):
    """
    Custom method to obtain output tensor during forward pass
    """
    # Cast X to float32
    X_tf = tf.cast(X, dtype=tf.float32)
    # Compute values in the two hidden layers
    what1 = tf.matmul(X_tf, self.W1) + self.b1
    hhat1 = tf.nn.relu(what1)
    what2 = tf.matmul(hhat1, self.W2) + self.b2
    hhat2 = tf.nn.relu(what2)
    # Compute output
    output = tf.matmul(hhat2, self.W3) + self.b3
    return tf.nn.softmax(output)

In [None]:
#L2 regularization
# Define class to build mlp model
class MLP(object):
  """
  Multi-layer perceptron with two ReLU hidden layers and a softmax output,
  trained with plain SGD on categorical cross-entropy plus an L2 penalty on
  the three weight matrices (biases are not penalized).
  """

  def __init__(self, size_input, size_hidden1, size_hidden2, size_output, device=None,
               learning_rate=1e-5, l2_lambda=1.0):
    """
    size_input: int, size of input layer
    size_hidden1: int, size of the first hidden layer
    size_hidden2: int, size of the second hidden layer
    size_output: int, size of output layer
    device: str or None, either 'cpu' or 'gpu' or None. If None, the device to be used will be decided automatically during Eager Execution
    learning_rate: float, SGD step size (default 1e-5, the value that used to be hard-coded in backward)
    l2_lambda: float, multiplier applied to each mean(W**2) penalty term (default 1.0 reproduces the previously hard-coded penalty)
    """
    self.size_input, self.size_hidden1, self.size_hidden2, self.size_output, self.device =\
    size_input, size_hidden1, size_hidden2, size_output, device
    self.learning_rate = learning_rate
    self.l2_lambda = l2_lambda

    # NOTE: the six tf.random.normal calls stay in this order so that a fixed
    # global seed keeps producing the same initial parameters.
    # Initialize weights between input layer and hidden layer 1
    self.W1 = tf.Variable(tf.random.normal([self.size_input, self.size_hidden1]))
    # Initialize biases for hidden layer 1
    self.b1 = tf.Variable(tf.random.normal([1, self.size_hidden1]))

    # Initialize weights between hidden layer 1 and hidden layer 2
    self.W2 = tf.Variable(tf.random.normal([self.size_hidden1, self.size_hidden2]))
    # Initialize biases for hidden layer 2
    self.b2 = tf.Variable(tf.random.normal([1, self.size_hidden2]))

    # Initialize weights between hidden layer 2 and output layer
    self.W3 = tf.Variable(tf.random.normal([self.size_hidden2, self.size_output]))
    # Initialize biases for output layer
    self.b3 = tf.Variable(tf.random.normal([1, self.size_output]))

    # Define variables to be updated during backpropagation
    self.variables = [self.W1, self.W2, self.W3, self.b1, self.b2, self.b3]

    # Build the optimizer and the loss object once; they used to be
    # re-created for every batch.
    self.optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    self._cce = tf.keras.losses.CategoricalCrossentropy()

  def forward(self, X):
    """
    forward pass
    X: Tensor, inputs
    Returns the softmax output for X; the result is also stored in self.y.
    """
    if self.device is not None:
      # Any device string other than 'gpu' falls back to the CPU.
      with tf.device('gpu:0' if self.device == 'gpu' else 'cpu'):
        self.y = self.compute_output(X)
    else:
      self.y = self.compute_output(X)
    return self.y

  def loss(self, y_pred, y_true):
    """
    Categorical cross-entropy plus the L2 penalty
    l2_lambda * (mean(W1**2) + mean(W2**2) + mean(W3**2)).
    y_pred - Tensor of shape (batch_size, size_output)
    y_true - Tensor of shape (batch_size, size_output)
    """
    # y_pred is the softmax value, so the loss is computed on probabilities
    y_true_tf = tf.cast(y_true, dtype=tf.float32)
    y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
    total = self._cce(y_true_tf, y_pred_tf)
    # Add the penalty terms one at a time, in W1, W2, W3 order.
    for weights in (self.W1, self.W2, self.W3):
      total = total + self.l2_lambda * tf.reduce_mean(weights ** 2)
    return total

  def backward(self, X_train, y_train):
    """
    backward pass: one SGD update of self.variables on the given batch
    X_train: Tensor, batch of inputs
    y_train: Tensor, matching one-hot targets
    """
    with tf.GradientTape() as tape:
      predicted = self.forward(X_train)
      current_loss = self.loss(predicted, y_train)
    grads = tape.gradient(current_loss, self.variables)
    self.optimizer.apply_gradients(zip(grads, self.variables))

  def compute_output(self, X):
    """
    Custom method to obtain output tensor during forward pass
    """
    # Cast X to float32
    X_tf = tf.cast(X, dtype=tf.float32)
    # Compute values in the two hidden layers
    what1 = tf.matmul(X_tf, self.W1) + self.b1
    hhat1 = tf.nn.relu(what1)
    what2 = tf.matmul(hhat1, self.W2) + self.b2
    hhat2 = tf.nn.relu(what2)
    # Compute output
    output = tf.matmul(hhat2, self.W3) + self.b3
    return tf.nn.softmax(output)

## Train Model

In [None]:
# Set number of epochs: how many times the training loop iterates over the
# training dataset
NUM_EPOCHS = 10

In [None]:
# Initialize model using GPU
mlp_on_gpu = MLP(X_train.shape[1], 256, 128, label_size, device='gpu')

# Per-epoch history: summed batch losses, accuracies (in percent) and a seed tag.
train_loss = []
val_loss = []
seed_ = []
train_accuracy_ = []
val_accuracy_ = []
time_start = time.time()

for epoch in range(NUM_EPOCHS):
  # Rebuild the pipelines every epoch so the shuffle order depends on the epoch.
  train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(100, seed=epoch*(seeds)).batch(100)
  validate_ds = tf.data.Dataset.from_tensor_slices((x_val, y_val)).shuffle(100, seed=epoch*(seeds)).batch(100)

  # ---- training pass ----
  loss_total_gpu = tf.zeros([], dtype=tf.float32)
  train_preds = []    # per-batch softmax outputs, in iteration order
  train_labels = []   # matching one-hot targets (the shuffled order must be kept)
  for inputs, outputs in train_ds:
    # A single forward pass gives both the reported loss and the predictions
    # used for accuracy; both are measured before this batch's weight update.
    preds = mlp_on_gpu.forward(inputs)
    loss_total_gpu = loss_total_gpu + mlp_on_gpu.loss(preds, outputs)
    mlp_on_gpu.backward(inputs, outputs)
    train_preds.append(preds)
    train_labels.append(outputs)

  # Concatenate once per epoch instead of growing a tensor inside the loop.
  result = tf.concat(train_preds, 0)
  y_batch_train = tf.concat(train_labels, 0)
  # An example is correct when the most probable class equals the labelled
  # class. Comparing rounded probabilities element-wise with the one-hot
  # labels (the previous approach) counts every matching zero as a hit.
  correct_prediction = tf.equal(tf.argmax(y_batch_train, axis=1), tf.argmax(result, axis=1))
  train_accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  train_accuracy_.append(train_accuracy*100)

  # ---- validation pass (no weight updates) ----
  val_loss_total = tf.zeros([], dtype=tf.float32)
  val_preds = []
  val_labels = []
  for inputs, outputs in validate_ds:
    preds = mlp_on_gpu.forward(inputs)
    val_loss_total = val_loss_total + mlp_on_gpu.loss(preds, outputs)
    val_preds.append(preds)
    val_labels.append(outputs)

  val_result = tf.concat(val_preds, 0)
  y_batch_val = tf.concat(val_labels, 0)
  correct_val_prediction = tf.equal(tf.argmax(y_batch_val, axis=1), tf.argmax(val_result, axis=1))
  validation_accuracy = tf.reduce_mean(tf.cast(correct_val_prediction, tf.float32))
  val_accuracy_.append(validation_accuracy*100)

  # Each batch loss is already a mean over its examples, so the epoch average
  # is the sum of batch losses divided by the number of batches, not by the
  # number of examples.
  print('Number of Epoch = {} - Average train CCE:= {}'.format(epoch + 1, float(loss_total_gpu) / len(train_preds)))
  print('Number of Epoch = {} - Average val CCE:= {}'.format(epoch + 1, float(val_loss_total) / len(val_preds)))
  print()
  print('Train Accuracy = {}'.format(train_accuracy*100))
  print('val Accuracy = {}'.format(validation_accuracy*100))
  print("================================================================")

  # The history keeps the summed batch losses, as before.
  train_loss.append(float(loss_total_gpu))
  val_loss.append(float(val_loss_total))
  # Tag the epoch with a multiple of the configured seed (was the literal 2785).
  seed_.append((epoch+1)*(seeds))

time_taken = time.time() - time_start
print('\nTotal time taken (in seconds): {:.2f}'.format(time_taken))

## One Step Inference

In [None]:
# Evaluate the trained model on the test batches (no weight updates).
test_loss_total = tf.zeros([], dtype=tf.float32)
test_preds = []    # per-batch softmax outputs
test_labels = []   # matching one-hot targets
for inputs, outputs in test_ds:
  preds = mlp_on_gpu.forward(inputs)
  test_loss_total = test_loss_total + mlp_on_gpu.loss(preds, outputs)
  test_preds.append(preds)
  test_labels.append(outputs)

# Concatenate once instead of growing a tensor inside the loop.
test_result = tf.concat(test_preds, 0)
test_batch = tf.concat(test_labels, 0)

# An example is correct when the most probable class equals the labelled
# class. Comparing rounded probabilities element-wise with the one-hot labels
# (the previous approach) counts every matching zero as a hit.
correct_test_prediction = tf.equal(tf.argmax(test_batch, axis=1), tf.argmax(test_result, axis=1))
test_accuracy = tf.reduce_mean(tf.cast(correct_test_prediction, tf.float32))
print('Test Accuracy = {}'.format(test_accuracy*100))
# Each batch loss is already a mean over its examples, so average over the
# number of batches rather than the number of examples.
print('Test Average CCE: {:.4f}'.format(float(test_loss_total) / len(test_preds)))

In [None]:
# Test accuracy vs seed
# seed_new: the seed values, updated 10 times
# testAcc: test accuracy over the 10 inferences
# NOTE(review): seed_new and testAcc are not defined in the visible cells;
# they must be populated before this cell is run.
kludge = 0.25  # margin added on each side of both axes
plt.figure(1)
plt.plot(seed_new, testAcc)
plt.xlabel("seed")
plt.ylabel("Test Accuracy")
plt.xlim(np.min(seed_new) - kludge, np.max(seed_new) + kludge)
plt.ylim(np.min(testAcc) - kludge, np.max(testAcc) + kludge)


In [None]:
# Test error vs seed
# testErr: test error over the 10 inferences
# NOTE(review): seed_new and testErr are not defined in the visible cells;
# they must be populated before this cell is run.
kludge = 0.25  # margin added on each side of both axes
# Use a separate figure number: the accuracy plot already uses figure 1, and
# reusing it would draw this curve on top of that plot whenever figure 1 is
# still open (e.g. when the code runs as a script).
plt.figure(2)
plt.plot(seed_new, testErr)
plt.xlabel("seed")
plt.ylabel("Test error")
plt.xlim((np.amin(seed_new) - kludge, np.amax(seed_new) + kludge))
plt.ylim((np.amin(testErr) - kludge, np.amax(testErr) + kludge))
