


# CS6910 Assignment-1

by
- Akansh Maurya (CS22Z003)
- Tejoram Vivekanandan (EE22Z001)

In [4]:
from keras.datasets import fashion_mnist
import numpy as np
from  matplotlib import pyplot as plt
import time
import math
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

### Question 1: Loading and ploting the dataset

In [5]:
dataset= fashion_mnist.load_data()
(X_train_and_validation, y_train_and_validation), (X_test, y_test) = dataset
X_train, X_validation, y_train, y_validation = train_test_split(X_train_and_validation, y_train_and_validation, test_size=0.1, random_state=42)
X_train = (X_train/255.0).astype(np.float32)
X_validation = (X_validation/255.0).astype(np.float32)
X_test = (X_test/255.0).astype(np.float32)

print("Train Dataset Shape: ", X_train.shape)
print("Train Target Vector Shape: ", y_train.shape) 
print("Test Dataset Shape:", X_test.shape)
print("Test Target Vector Shape", y_test.shape)
print("Validation Dataset Shape:", X_validation.shape)
print("Validation Target Vector Shape", y_validation.shape)




Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
Train Dataset Shape:  (54000, 28, 28)
Train Target Vector Shape:  (54000,)
Test Dataset Shape: (10000, 28, 28)
Test Target Vector Shape (10000,)
Validation Dataset Shape: (6000, 28, 28)
Validation Target Vector Shape (6000,)


In [6]:
X_train = np.array(X_train.reshape(X_train.shape[0], 784,1))         
X_test = np.array(X_test.reshape(X_test.shape[0], 784,1))
X_validation = np.array(X_validation.reshape(X_validation.shape[0], 784,1))

**Implement a feedforward and backpropagation**

In [7]:
#Activation function
def activation(activation_function):
  if activation_function == 'sigmoid':
    return sigmoid
  if activation_function == 'tanh':
    return tanh
  if activation_function == 'ReLU':
    return relu

def sigmoid(x, derivative = False):
  if derivative:
    return sigmoid(x)*(1-sigmoid(x))
  return 1/(1 + np.exp(-x))  

def tanh(x, derivative = False):
  if derivative:
    return 1 - tanh(x)**2
  return (np.exp(x)-np.exp(-x))/(np.exp(x)+np.exp(-x))

def relu(x, derivative = False):
  if derivative:
    return (x>0)*1 
  return x*(x>0)

def softmax(x,derivative = False):
  if derivative:
    return softmax(x)*(1- softmax(x))
  return np.exp(x)/np.sum(np.exp(x), axis = 0)

def one_hot(y, num_output_nodes):
  v = np.zeros((num_output_nodes, len(y)))
  for i,j in enumerate(y):
    v[j,i] = 1
  return v

### Weight Initialization
def param_inint(num_inputs_nodes, hidden_layers, num_output_nodes, init_type):
  W = []
  B = []
  if init_type == "random":
    W.append(np.random.randn(hidden_layers[0],num_inputs_nodes)*0.1)
    B.append(np.random.randn(hidden_layers[0], 1)*0.1)
    for i in range(len(hidden_layers)-1):
      W.append(np.random.randn(hidden_layers[i+1],hidden_layers[i])*0.1)
      B.append(np.random.randn(hidden_layers[i+1], 1)*0.1)
    W.append(np.random.randn(num_output_nodes, hidden_layers[-1])*0.1)
    B.append(np.random.randn(num_output_nodes, 1)*0.1)
    return W, B

  if init_type == "xavier":
    W.append(np.random.randn(hidden_layers[0],num_inputs_nodes)*np.sqrt(2/(hidden_layers[0] + num_inputs_nodes)))
    B.append(np.random.randn(hidden_layers[0], 1)*0.1)
    for i in range(len(hidden_layers)-1):
      W.append(np.random.randn(hidden_layers[i+1],hidden_layers[i])*np.sqrt(2/(hidden_layers[i+1] + hidden_layers[i])))
      B.append(np.random.randn(hidden_layers[i+1], 1)*0.1)
    W.append(np.random.randn(num_output_nodes, hidden_layers[-1])*np.sqrt(2/(num_output_nodes + hidden_layers[-1])))
    B.append(np.random.randn(num_output_nodes, 1)*0.1)
    return W, B


def feed_forward(x, W, B, activation_type):
  h = []
  a = []
  sigma = activation(activation_type)  #activation
  h.append(x)   #h0 = x
  a.append(np.dot(W[0], h[0]) + B[0])
  for i in range(len(W)-1):
    h.append(sigma(a[-1]))
    a.append(np.dot(W[i+1], h[-1]) + B[i+1])
  y_hat = softmax(a[-1])

  return y_hat, h, a



def loss_compute(y,y_hat, loss_type, W, reg_lamda):
  if loss_type == "squared_error":
    error = np.sum((one_hot(y, 10)-y_hat)**2)/(2*one_hot(y, 10).shape[1])
  if loss_type == "cross_entropy":
    error = -1*np.sum(np.multiply(one_hot(y, 10),np.log(y_hat)))/one_hot(y, 10).shape[1]         # hardcoded classes = 10

  if W:
    r = 0
    for i in range(len(W)):
      r += np.sum((np.array(W, dtype = object) **2)[i])
    error = error + reg_lamda * r

  return error


def accuracy(y_hat, y_true):
  return np.mean(np.argmax(y_hat, axis = 0) ==y_true )*100


##### Back Propogation

In [8]:
def back_prop(x, y, y_hat, a, h , W, B, batch_size, loss_type, activation_type):
  gh = [0]*len(h)
  ga = [0]*len(a)
  gw = [0]*len(W)
  gb = [0]*len(B)

  sigma = activation(activation_type) 

  if loss_type == "cross_entropy":
    gh[-1] = -1*(y/y_hat)
    ga[-1] = -1*(y-y_hat)
  if loss_type == "squared_error":   ##### edit this
    gh[-1] = y_hat - y
    ga[-1] = (y_hat - y)*softmax(a[-1])*(1-softmax(a[-1]))

  for i in range(len(W)-1, -1, -1):
    gw[i] = np.dot(ga[i], h[i].T)
    gb[i] = np.dot(ga[i], np.ones((batch_size,1)))
    if i > 0:
      gh[i-1] = np.dot(W[i].T, ga[i])
      ga[i-1]  = np.multiply(gh[i-1],sigma(a[i-1], derivative = True))

  return gw, gb, gh, ga

**Optimizing functions**

In [9]:
class SGD:
  ''' Stochastic Gradient Descent '''
  def __init__(self, lr = 0.001, reg = 0):
    self.lr = lr
    self.reg = reg
  
  def update(self, w,b, gW, gB):
    W = np.array(w, dtype = object)
    B = np.array(b, dtype = object)

    W = (1-self.lr*self.reg)*W - self.lr * np.array(gW, dtype = object)
    B = (1-self.lr*self.reg)*B - self.lr * np.array(gB, dtype = object)

    return W.tolist(),B.tolist()


class Momentum:

  def __init__(self, lr = 0.001, gamma = 0.9, reg = 0):
    self.lr = lr
    self.gamma = gamma
    self.Wmoments = None
    self.Bmoments = None
    self.reg = reg


  def update(self, w,b, gW, gB):
    params = {'w':w, 'b':b}

    if self.Wmoments == None:
      self.Wmoments = [0] * len(params['w'])
      for i in range(len(params['w'])):
        self.Wmoments[i] = np.zeros_like(params['w'][i])

    if self.Bmoments == None:
      self.Bmoments = [0] * len(params['b'])
      for i in range(len(params['b'])):
        self.Bmoments[i] = np.zeros_like(params['b'][i])
    
    self.Wmoments = self.gamma * np.array(self.Wmoments, dtype = object) + self.lr * np.array(gW, dtype = object)
    W = (1-self.lr*self.reg)*np.array(params['w'], dtype = object) - self.Wmoments
    self.Wmoments = self.Wmoments.tolist()

    self.Bmoments = self.gamma * np.array(self.Bmoments, dtype = object) + self.lr * np.array(gB, dtype = object)
    B = (1-self.lr*self.reg)*np.array(params['b'], dtype = object) - self.Bmoments
    self.Bmoments = self.Bmoments.tolist()
    
    return W.tolist(), B.tolist()


class RMSprop:
  def __init__(self, lr=0.01, beta = 0.99):
    
    self.lr = lr
    self.vW = None
    self.vB = None
    self.beta = beta

  def update(self, w,b, gW, gB):
    params = {'w':w, 'b':b}
    if self.vW == None:
      self.vW = [0] * len(params['w'])
      for i in range(len(params['w'])):
        self.vW[i] = np.zeros_like(params['w'][i])

    if self.vB == None:
      self.vB = [0] * len(params['b'])
      for i in range(len(params['b'])):
        self.vB[i] = np.zeros_like(params['b'][i])

    self.vW = self.beta*np.array(self.vW, dtype = object) + (1-self.beta)*(np.array(gW, dtype = object) **2) 
    W = (1-self.lr*self.reg)*np.array(params['w'], dtype = object) - (self.lr/((self.vW + 1e-7)**0.5)) * np.array(gW, dtype = object)
    self.vW = self.vW.tolist()

    self.vB = self.beta*np.array(self.vB, dtype = object) + (1-self.beta)*(np.array(gB, dtype = object) **2 )
    B = (1-self.lr*self.reg)*np.array(params['b'], dtype = object) - (self.lr/((self.vB + 1e-7)**0.5)) * np.array(gB, dtype = object)
    self.vB = self.vB.tolist()

    return W.tolist(), B.tolist()

class Nesterov:   
  def __init__(self, lr=0.01, gamma=0.9):
    self.lr = lr
    self.reg = None
    self.gamma = gamma                                                             
    self.Wmoments = None
    self.Bmoments = None
    self.activation_type = None
    self.loss_type = None
        
  def update(self, w,b, gW, gB):
    params = {'w':w, 'b':b}
    if self.Wmoments == None:
      self.Wmoments = [0] * len(params['w'])
      for i in range(len(params['w'])):
        self.Wmoments[i] = np.zeros_like(params['w'][i])

    if self.Bmoments == None:
      self.Bmoments = [0] * len(params['b'])
      for i in range(len(params['b'])):
        self.Bmoments[i] = np.zeros_like(params['b'][i])


    W_look_ahead = np.array(params['w'], dtype = object) - self.gamma*np.array(self.Wmoments, dtype = object)
    B_look_ahead = np.array(params['b'], dtype = object) - self.gamma*np.array(self.Bmoments, dtype = object)
    ##
    out, temp_h, temp_a = feed_forward(x,W_look_ahead.tolist(),B_look_ahead.tolist(), self.activation_type)
    gW_look_ahead, gB_look_ahead, _, _ = back_prop(x, y,out,temp_a,temp_h, W_look_ahead.tolist(),B_look_ahead.tolist(), x.shape[1], self.loss_type, self.activation_type)

    ###
    self.Wmoments = self.gamma*np.array(self.Wmoments, dtype = object) + self.lr * np.array(gW_look_ahead, dtype = object)
    self.Bmoments = self.gamma*np.array(self.Bmoments, dtype = object) + self.lr * np.array(gB_look_ahead, dtype = object)

    W = (1-self.lr*self.reg)*np.array(params['w'], dtype = object) - self.Wmoments
    self.Wmoments = self.Wmoments.tolist()

    B = (1-self.lr*self.reg)*np.array(params['b'], dtype = object) - self.Bmoments
    self.Bmoments = self.Bmoments.tolist()

    return W.tolist(), B.tolist()

class Adam:
  def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, reg = 0):
    self.lr = lr
    self.beta1 = beta1
    self.beta2 = beta2
    self.t = 0
    self.mW = None
    self.vW = None
    self.mB = None
    self.vB = None
    self.reg = None
        
  def update(self, w,b, gW, gB):
    params = {'w':w, 'b':b}

    if self.mW is None:
      self.mW, self.vW = [0] * len(params['w']), [0] * len(params['w'])
      for i in range(len(params['w'])):
        self.mW[i] = np.zeros_like(params['w'][i])
        self.vW[i] = np.zeros_like(params['w'][i])

    if self.mB is None:
      self.mB, self.vB = [0] * len(params['b']), [0] * len(params['b'])
      for i in range(len(params['b'])):
        self.mB[i] = np.zeros_like(params['b'][i])
        self.vB[i] = np.zeros_like(params['b'][i])
    

    self.t += 1
    self.mW = (self.beta1 * np.array(self.mW, dtype = object)) + (1-self.beta1)*(np.array(gW, dtype = object))
    self.vW = (self.beta2 * np.array(self.vW, dtype = object)) + (1-self.beta2)*((np.array(gW, dtype = object)**2))

    self.mB = (self.beta1 * np.array(self.mB, dtype = object)) + (1-self.beta1)*(np.array(gB, dtype = object))
    self.vB = (self.beta2 * np.array(self.vB, dtype = object)) + (1-self.beta2)*((np.array(gB, dtype = object)**2))

    # Bias Correction
    self.mW = (self.mW)*(1.0/(1-(self.beta1**self.t)))
    self.vW = (self.vW)*(1.0/(1-(self.beta2**self.t)))
    self.mB = (self.mB)*(1.0/(1-(self.beta1**self.t)))
    self.vB = (self.vB)*(1.0/(1-(self.beta2**self.t)))

    W = (1-self.lr*self.reg)*np.array(params['w'], dtype = object) - (self.lr/((self.vW + 1e-7)**0.5)) * self.mW
    self.vW = self.vW.tolist()
    self.mW = self.mW.tolist()

    B = (1-self.lr*self.reg)*np.array(params['b'], dtype = object) - (self.lr/((self.vB + 1e-7)**0.5)) * self.mB
    self.vB = self.vB.tolist()
    self.mB = self.mB.tolist()

    return W.tolist(), B.tolist()    

##### Training Function

In [10]:
def train(X_train, y_train,x_val, y_val, num_inputs_nodes, hidden_layers, num_output_nodes, init_type, epochs, batch_size, loss_type,activation_type, optimizer_name, learning_rate, reg_lamda):
  if optimizer_name=='sgd':
    optimizer = SGD()
  elif optimizer_name=='momentum':
    optimizer = Momentum()
  elif optimizer_name=='rmsprop':
    optimizer = RMSprop()
  elif optimizer_name=='nesterov':
    optimizer = Nesterov()
  elif optimizer_name=='adam':
    optimizer = Adam()    
  

  try:   
    optimizer.activation_type = activation_type
    optimizer.loss_type = loss_type
  except:
    pass

  W, B = param_inint(num_inputs_nodes,hidden_layers, num_output_nodes, init_type)
  N = X_train.shape[0]
  n_batches = int(np.floor(N/batch_size))
  optimizer.lr = learning_rate
  optimizer.reg = reg_lamda

  for epoch in range(epochs):

    train_loss = []
    train_accuracy = []
    val_loss = []
    val_accuracy = []
    l = 0
    acc = 0
    temp = 0
    for batch in range(n_batches):
      x = np.squeeze(X_train[batch*batch_size:batch_size+batch*batch_size]).T
      y = one_hot(y_train[batch*batch_size:batch_size+batch*batch_size], 10)
      y_hat, h, a = feed_forward(x, W,B, activation_type)
      gw, gb, gh, ga = back_prop(x, y,y_hat,a,h, W,B, batch_size, loss_type, activation_type)
      W,B = optimizer.update(W,B, gw,gb)
      l += loss_compute(y_train[batch*batch_size:batch_size+batch*batch_size],y_hat, loss_type, W,reg_lamda)
      acc += accuracy(y_hat, y_train[batch*batch_size:batch_size+batch*batch_size])

    if N%batch_size != 0:
        x = np.squeeze(X_train[-1*(N%batch_size):]).T
        y = one_hot(y_train[-1*(N%batch_size):], 10)
        y_hat, h, a = feed_forward(x, W,B, activation_type)
        gw, gb, gh, ga = back_prop(x, y,y_hat,a,h, W,B, N%batch_size, loss_type, activation_type)
        W,B = optimizer.update(W,B, gw,gb)
        l += loss_compute(y_train[-1*(N%batch_size):],y_hat, loss_type, W,reg_lamda)
        acc += accuracy(y_hat, y_train[-1*(N%batch_size):])
        temp = 1

    l = l/(n_batches + (N%batch_size))
    acc = acc/(n_batches + temp)

    train_loss.append(l)
    train_accuracy.append(acc)
    #print(f"Epoch:{epoch+1}")
    #print(f"Train Loss: {l}")
    #print(f"Train Accuracy: {acc}")

    #### Validation
    if x_val.any():
      y_val_hat, _,_ = feed_forward(np.squeeze(x_val).T, W,B, activation_type)
      val_acc = accuracy(y_val_hat,y_val)
      val_l = loss_compute(y_val, y_val_hat, loss_type,W = None, reg_lamda = reg_lamda)
      val_accuracy.append(val_acc)
      val_loss.append(val_l)
      #print(f"Val Loss: {val_l}")
      #print(f"Val Accuracy: {val_acc}")

    wandb.log({"epoch":epoch,"Train_loss":l,"Train_acc":acc,"val_loss":val_l,"val_Accuracy":val_acc})
  return W,B, train_loss, train_accuracy, val_loss, val_accuracy




In [15]:
!pip install wandb
!wandb login
import wandb

[34m[1mwandb[0m: Currently logged in as: [33mtejoram[0m (use `wandb login --relogin` to force relogin)


In [12]:
sweep_configuration = {'method'    : "random",  #Other available--> grid, bayes
                       'metric'    : {'name': 'val_Accuracy','goal':'maximize'},
                       'parameters': {'epochs':{'values':[5]},
                                      'hidden_layers':{'values':[[64,32]] }, #change needed
                                      'learning_rate':{'values':[1e-3]},
                                      'weight_decay':{'values':[ 0.0005]},
                                      'optimizer_name': {'values':['sgd']}, #, 'nesterov','adam', 'nadam'
                                      'batch_size':{'values':[16]},
                                      'init_type': {'values':['xavier']},
                                      'activation_type':{'values':['sigmoid']},
                                      'loss_type': {'values':['cross_entropy','squared_error']} }}

In [16]:
def sweep_train():

  hyperparameters=dict(epochs = 5,
                      hidden_layers= [64,32],
                      learning_rate=1e-4,
                      weight_decay=0,
                      optimizer_name='sgd',
                      batch_size=16,
                      init_type='random',
                      activation_type='sigmoid',
                      loss_type='cross_entropy',
                      reg_lamda=0)
                           
  wandb.init(project="CS6910-Assignment-1", entity="tejoram",config=hyperparameters)
  config=wandb.config
  epochs=config.epochs
  hidden_layers=config.hidden_layers
  learning_rate=config.learning_rate
  weight_decay=config.weight_decay
  optimizer_name=config.optimizer_name
  batch_size=config.batch_size
  init_type=config.init_type
  activation_type=config.activation_type
  loss_type=config.loss_type
  reg_lamda=config.reg_lamda  
  wandb.run.name = "e_{}_hl_{}_lr_{}_wd_{}_o_{}_bs_{}_winit_{}_ac_{}_los_{}_r_{}".format(epochs,\
                                                                                    hidden_layers,\
                                                                                    learning_rate,\
                                                                                    weight_decay,\
                                                                                    optimizer_name,\
                                                                                    batch_size,\
                                                                                    init_type,\
                                                                                    activation_type,\
                                                                                    loss_type,\
                                                                                    reg_lamda)
  
  _,_,train_loss, train_accuracy, val_loss, val_accuracy = train(X_train, y_train, X_validation, y_validation, 784, hidden_layers, 10, init_type, epochs, batch_size, loss_type, activation_type, optimizer_name, learning_rate, reg_lamda)

In [17]:
sweep_id = wandb.sweep(sweep_configuration,project='CE_new')
wandb.agent(sweep_id,function=sweep_train,project='CE_new',count=2)

Create sweep with ID: 8s9im2lk
Sweep URL: https://wandb.ai/tejoram/CE_new/sweeps/8s9im2lk


[34m[1mwandb[0m: Agent Starting Run: qk046pxn with config:
[34m[1mwandb[0m: 	activation_type: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: [64, 32]
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_type: cross_entropy
[34m[1mwandb[0m: 	optimizer_name: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_acc,▁▅▇▇█
Train_loss,█▃▂▁▁
epoch,▁▃▅▆█
val_Accuracy,▁▄▆▇█
val_loss,█▄▂▂▁

0,1
Train_acc,81.35185
Train_loss,0.53006
epoch,4.0
val_Accuracy,81.66667
val_loss,0.51127


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: afin1h9k with config:
[34m[1mwandb[0m: 	activation_type: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: [64, 32]
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_type: squared_error
[34m[1mwandb[0m: 	optimizer_name: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_acc,▁▁▂▆█
Train_loss,█▇▅▃▁
epoch,▁▃▅▆█
val_Accuracy,▂▁▄▇█
val_loss,█▇▅▂▁

0,1
Train_acc,50.50556
Train_loss,0.35086
epoch,4.0
val_Accuracy,52.1
val_loss,0.34399
