<a href="https://colab.research.google.com/github/ShreyaC15/Optimizers/blob/main/MLP_SGD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IST597:- Multi-Layer Perceptron - SGD

## Load the libraries

In [None]:
import os
import numpy as np
import time
import tensorflow as tf
import matplotlib.pyplot as plt
seeds=2785                         # Update seeds
np.random.seed(seeds)
tf.random.set_seed(seeds)

In [None]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Load data and perform pre-processing

In [None]:
#(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

In [None]:
X_train = tf.cast(tf.reshape(X_train, (-1, X_train.shape[1]*X_train.shape[2])), dtype=tf.float32)
X_test = tf.cast(tf.reshape(X_test, (-1, X_test.shape[1]*X_test.shape[2])), dtype=tf.float32)

In [None]:
X_train = X_train/255.0
X_test = X_test/255.0

In [None]:
x_val = X_train[-10000:]
y_val = y_train[-10000:]

In [None]:
y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)
y_val=tf.keras.utils.to_categorical(y_val)

In [None]:
# Split dataset into batches
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(100)
validate_ds = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(100)
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(16)

In [None]:
label_size = y_train.shape[1]

## Build MLP using Eager Execution

In [None]:
# Define class to build mlp model
class MLP(object):
  def __init__(self, size_input, size_hidden1,size_hidden2, size_output, device=None):
    """
    size_input: int, size of input layer
    size_hidden: int, size of hidden layer
    size_output: int, size of output layer
    device: str or None, either 'cpu' or 'gpu' or None. If None, the device to be used will be decided automatically during Eager Execution
    """
    self.size_input, self.size_hidden1, self.size_hidden2, self.size_output, self.device =\
    size_input, size_hidden1,size_hidden2, size_output, device
    
    # Initialize weights between input layer and hidden layer
    self.W1 = tf.Variable(tf.random.normal([self.size_input, self.size_hidden1],stddev=0.1))
    # Initialize biases for hidden layer
    self.b1 = tf.Variable(tf.random.normal([1, self.size_hidden1]))

    # Initialize weights between hidden 1 and hidden layer 2
    self.W2 = tf.Variable(tf.random.normal([self.size_hidden1, self.size_hidden2],stddev=0.1))
    # Initialize biases for hidden layer 2
    self.b2 = tf.Variable(tf.random.normal([1, self.size_hidden2]))

    self.W3 = tf.Variable(tf.random.normal([self.size_hidden2, self.size_output],stddev=0.1))
    # Initialize biases for output layer
    self.b3 = tf.Variable(tf.random.normal([1, self.size_output]))
    
    # Define variables to be updated during backpropagation
    self.variables = [self.W1, self.W2, self.W3, self.b1, self.b2, self.b3]    

  def forward(self, X):
    """
    forward pass
    X: Tensor, inputs
    """
    if self.device is not None:
      with tf.device('gpu:0' if self.device=='gpu' else 'cpu'):
        self.y = self.compute_output(X)
    else:
      self.y = self.compute_output(X)
    return self.y
  
  def loss(self, y_pred, y_true):
    """
    y_pred - Tensor of shape (batch_size, size_output)
    y_true - Tensor of shape (batch_size, size_output)
    """

    #y_pred is the softmax value
    y_true_tf = tf.cast(y_true, dtype=tf.float32)
    y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
    return tf.keras.losses.CategoricalCrossentropy()(y_true_tf, y_pred_tf)

  def backward(self, X_train, y_train):
    """
    Backward pass
    """
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    with tf.GradientTape() as tape:
      predicted = self.forward(X_train)
      current_loss = self.loss(predicted, y_train)
    grads = tape.gradient(current_loss, self.variables)
    optimizer.apply_gradients(zip(grads, self.variables))

  def compute_output(self, X):
    """
    Custom method to obtain output tensor during forward pass
    """
    # Cast X to float32
    X_tf = tf.cast(X, dtype=tf.float32)
    # Compute values in hidden layer
    what1 = tf.matmul(X_tf, self.W1) + self.b1
    hhat1 = tf.nn.relu(what1)
    what2 = tf.matmul(hhat1, self.W2) + self.b2
    hhat2 = tf.nn.relu(what2)
    # Compute output
    output = tf.matmul(hhat2, self.W3) + self.b3
    return tf.nn.softmax(output)

  def var(self,y_pred):
    """
    Calculate variance 
    """
    y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
    std_dev = np.std(y_pred_tf) #Calculates standard deviation
    variance = (std_dev**2) # calculate variance
    return variance


In [None]:
# L2 regularization
# Define class to build mlp model
class MLP(object):
  def __init__(self, size_input, size_hidden1,size_hidden2, size_output, device=None):
    """
    size_input: int, size of input layer
    size_hidden: int, size of hidden layer
    size_output: int, size of output layer
    device: str or None, either 'cpu' or 'gpu' or None. If None, the device to be used will be decided automatically during Eager Execution
    """
    self.size_input, self.size_hidden1, self.size_hidden2, self.size_output, self.device =\
    size_input, size_hidden1,size_hidden2, size_output, device
    
    # Initialize weights between input layer and hidden layer
    self.W1 = tf.Variable(tf.random.normal([self.size_input, self.size_hidden1],stddev=0.1))
    # Initialize biases for hidden layer
    self.b1 = tf.Variable(tf.random.normal([1, self.size_hidden1]))

    # Initialize weights between hidden 1 and hidden layer 2
    self.W2 = tf.Variable(tf.random.normal([self.size_hidden1, self.size_hidden2],stddev=0.1))
    # Initialize biases for hidden layer 2
    self.b2 = tf.Variable(tf.random.normal([1, self.size_hidden2]))

    self.W3 = tf.Variable(tf.random.normal([self.size_hidden2, self.size_output],stddev=0.1))
    # Initialize biases for output layer
    self.b3 = tf.Variable(tf.random.normal([1, self.size_output]))
    
    # Define variables to be updated during backpropagation
    self.variables = [self.W1, self.W2, self.W3, self.b1, self.b2, self.b3]    

  def forward(self, X):
    """
    forward pass
    X: Tensor, inputs
    """
    if self.device is not None:
      with tf.device('gpu:0' if self.device=='gpu' else 'cpu'):
        self.y = self.compute_output(X)
    else:
      self.y = self.compute_output(X)
    return self.y
  
  def loss(self, y_pred, y_true):
    """
    y_pred - Tensor of shape (batch_size, size_output)
    y_true - Tensor of shape (batch_size, size_output)
    """

    #y_pred is the softmax value
    y_true_tf = tf.cast(y_true, dtype=tf.float32)
    y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
    return tf.keras.losses.CategoricalCrossentropy()(y_true_tf, y_pred_tf) + tf.reduce_mean((self.W1)**2) + tf.reduce_mean((self.W2)**2) + tf.reduce_mean((self.W3)**2)

  def backward(self, X_train, y_train):
    """
    Backward pass
    """
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    with tf.GradientTape() as tape:
      predicted = self.forward(X_train)
      current_loss = self.loss(predicted, y_train)
    grads = tape.gradient(current_loss, self.variables)
    optimizer.apply_gradients(zip(grads, self.variables))

  def compute_output(self, X):
    """
    Custom method to obtain output tensor during forward pass
    """
    # Cast X to float32
    X_tf = tf.cast(X, dtype=tf.float32)
    # Compute values in hidden layer
    what1 = tf.matmul(X_tf, self.W1) + self.b1
    hhat1 = tf.nn.relu(what1)
    what2 = tf.matmul(hhat1, self.W2) + self.b2
    hhat2 = tf.nn.relu(what2)
    # Compute output
    output = tf.matmul(hhat2, self.W3) + self.b3
    return tf.nn.softmax(output)

  def var(self,y_pred):
    """
    Calculate variance 
    """
    y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
    std_dev = np.std(y_pred_tf) #Calculates standard deviation
    variance = (std_dev**2) # calculate variance
    return variance


## Train Model

In [None]:
# Set number of epochs
NUM_EPOCHS = 10

In [None]:
# Initialize model using GPU
mlp_on_gpu = MLP(X_train.shape[1], 256,128, label_size, device='gpu')

train_loss = []
val_loss=[]
seed_ = []
train_accuracy_ = []
train_var_ =[]
val_accuracy_ = []
val_var_ =[]
time_start = time.time()

for epoch in range(NUM_EPOCHS):
  val_loss_total = tf.zeros([1,1], dtype=tf.float32)
  loss_total_gpu = tf.zeros([1,1], dtype=tf.float32)

  train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(100, seed=epoch*(seeds)).batch(100)
  validate_ds = tf.data.Dataset.from_tensor_slices((x_val, y_val)).shuffle(100, seed=epoch*(seeds)).batch(100)

  #creating y_train and y_val after each shuffled data above
  for inputs, outputs in train_ds:
    preds = mlp_on_gpu.forward(inputs)
    loss_total_gpu = loss_total_gpu + mlp_on_gpu.loss(preds, outputs)
    mlp_on_gpu.backward(inputs, outputs)

  y_batch_train = y_train
  result=mlp_on_gpu.forward(X_train)
  correct_prediction = tf.equal(tf.round(y_batch_train,1), tf.round(result,1))
  train_accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
  train_accuracy_.append(train_accuracy*100)
  #train_var = mlp_on_gpu.var(correct_prediction)
  #train_var_.append(train_var)

  for inputs, outputs in validate_ds:
    preds = mlp_on_gpu.forward(inputs)
    val_loss_total = val_loss_total + mlp_on_gpu.loss(preds, outputs)
    
  y_batch_val = y_val
  val_result = mlp_on_gpu.forward(x_val)
  correct_val_prediction = tf.equal(tf.round(y_batch_val,1), tf.round(val_result,1))
  validation_accuracy = tf.reduce_mean(tf.cast(correct_val_prediction, "float"))
  val_accuracy_.append(validation_accuracy*100)
  #val_var = mlp_on_gpu.var(correct_val_prediction)
  #val_var_.append(val_var)

  print('Number of Epoch = {} - Average train CCE:= {}'.format(epoch + 1, np.sum(loss_total_gpu) / X_train.shape[0]))
  print('Number of Epoch = {} - Average val CCE:= {}'.format(epoch + 1, np.sum(val_loss_total) / x_val.shape[0]))
  print()
  print('Train Accuracy = {}'.format(train_accuracy*100))
  print('Val Accuracy = {}'.format(validation_accuracy*100))
  print()
  #print('Train Variance = {}'.format(train_var))
  #print('Val Variance = {}'.format(val_var))
  print("================================================================")

  train_loss.append(float(loss_total_gpu))
  val_loss.append(float(val_loss_total))
  seed_.append((epoch+1)*(2785))
  
time_taken = time.time() - time_start
print('\nTotal time taken (in seconds): {:.2f}'.format(time_taken))

Number of Epoch = 1 - Average train CCE:= 0.006513658142089844
Number of Epoch = 1 - Average val CCE:= 0.0047765903472900394

Train Accuracy = 96.90766906738281
Val Accuracy = 96.88999938964844

Number of Epoch = 2 - Average train CCE:= 0.00458621826171875
Number of Epoch = 2 - Average val CCE:= 0.004121645736694336

Train Accuracy = 97.36149597167969
Val Accuracy = 97.37899780273438

Number of Epoch = 3 - Average train CCE:= 0.004177086385091146
Number of Epoch = 3 - Average val CCE:= 0.004019518280029297

Train Accuracy = 97.39266967773438
Val Accuracy = 97.39099884033203

Number of Epoch = 4 - Average train CCE:= 0.003913692982991536
Number of Epoch = 4 - Average val CCE:= 0.0036408817291259765

Train Accuracy = 97.66316223144531
Val Accuracy = 97.6760025024414

Number of Epoch = 5 - Average train CCE:= 0.003739074452718099
Number of Epoch = 5 - Average val CCE:= 0.0034663658142089844

Train Accuracy = 97.78166198730469
Val Accuracy = 97.822998046875

Number of Epoch = 6 - Average t

## One Step Inference

In [None]:
test_loss_total =tf.zeros([1,1], dtype=tf.float32)
for inputs, outputs in test_ds:
  preds = mlp_on_gpu.forward(inputs)
  test_loss_total = test_loss_total + mlp_on_gpu.loss(preds, outputs)
test_result = mlp_on_gpu.forward(X_test)
test_batch = y_test
  
correct_test_prediction = tf.equal(tf.round(test_batch), tf.round(test_result))
test_accuracy = tf.reduce_mean(tf.cast(correct_test_prediction, "float"))
#test_variance = mlp_on_gpu.var(correct_test_prediction)
print('Test Accuracy = {}'.format(test_accuracy*100))
print('Test Average CCE: {:.4f}'.format(np.sum(test_loss_total.numpy()) / X_test.shape[0]))
#print('Test Variance = {}'.format(test_variance*100))

Test Accuracy = 97.56600189208984
Test Average CCE: 0.0245
