Q1. Plotting one sample image for each class!!

In [None]:
#Installing the wandb library
!pip install wandb -qqq
import wandb
from wandb.keras import WandbCallback

In [None]:
#Authenticate this session with Weights & Biases
wandb.login()

In [None]:
import wandb
#Start a W&B run in project "CS6910_DL_Assignment1" under entity "nomads"; later wandb.log calls report to it
wandb.init(project="CS6910_DL_Assignment1", entity="nomads")

In [None]:
#Necessary Packages

import numpy as np
import math
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from keras.datasets import fashion_mnist
import matplotlib.pyplot as plt

In [None]:
#Load the Fashion-MNIST data via keras: (X, Y) is the first returned pair, (X_test, Y_test) the second

(X, Y), (X_test, Y_test) = fashion_mnist.load_data()


In [None]:
#Human-readable names of the 10 classes, indexed by label
Tags=['T-shirt/top','Trouser','Pullover','Dress','Coat','Sandal','Shirt','Sneaker','Bag','Ankle boot']

#Collect, in label order, the first image in X whose label equals each class
Images=[]
for label in range(10):
  first_match = next((X[k] for k in range(len(X)) if label==Y[k]), None)
  if first_match is not None:
    Images.append(first_match)

#Log one captioned example image per class to wandb
wandb.log({"Examples": [ wandb.Image(picture, caption=tag) for picture, tag in zip(Images,Tags)]})



Q2. Implement a feedforward neural network which takes images from the fashion-mnist data as input and outputs a probability distribution over the 10 classes.

In [None]:
#Flatten every image into one row, scale pixel values by 1/255, and turn the labels into column vectors.
#NOTE: X and X_test are overwritten in place, so running this cell twice divides by 255 twice.

l1,l2=X.shape[0],X_test.shape[0]
flat=X[0].size
X=X.reshape(l1,flat)/255.0
X_test=X_test.reshape(l2,flat)/255.0
Y=Y.reshape(l1,1)
Y_test=Y_test.reshape(l2,1)
#Hold out 10% of (X, Y) as a validation split; the seed is fixed so the split is repeatable
X_train,X_validation,Y_train,Y_validation = train_test_split(X,Y, test_size=0.1, random_state=1234)


In [None]:
#Quick shape check on the first ten training rows
X_train[0:10].shape

In [None]:
#Shapes of the train / validation / test inputs, then of the matching label arrays
print(X_train.shape,X_validation.shape,X_test.shape)
print(Y_train.shape,Y_validation.shape,Y_test.shape)

In [None]:
#Defining all the required Functions for Activation and Preactivation

def preActivation(b,w,h):
    """Return the affine pre-activation ``b + w @ h`` of one layer.

    b is the bias, w the weight matrix and h the input to the layer
    (the previous layer's activation).
    """
    weighted_input = np.matmul(w, h)
    return b + weighted_input

def Activation(x,act_func):
    """Apply the activation named by ``act_func`` to ``x``.

    Recognised names are 'sigmoid', 'tanh' and 'relu'; any other name
    yields None.
    """
    if act_func == 'sigmoid':
        return sigmoid(x)
    if act_func == 'tanh':
        return tanh(x)
    if act_func == 'relu':
        return relu(x)
    return None

#Defining the activation functions required
  
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified

def tanh(x):
    """Hyperbolic tangent, element-wise (delegates to ``np.tanh``)."""
    squashed = np.tanh(x)
    return squashed

#Softmax function for the output as probabilities
def softmax(x):
    """Return softmax probabilities computed over ALL elements of ``x``.

    The maximum of ``x`` is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``exp`` from overflowing
    (which would give inf/inf = nan) when the inputs are large.

    x: array-like of scores (the forward pass supplies a column vector).
    Returns an array of the same shape whose entries sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        #Nothing to normalise; hand back an empty float array
        return np.exp(x)
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps)


#Converting class number to vector
def to_vector(x):
    """One-hot encode class index ``x`` as a (10, 1) column vector."""
    column = np.zeros((10, 1))
    column[x] = 1
    return column


#Function to find the derivatives of the activation functions
def derivative(func, x):
    """Element-wise derivative of the named activation, evaluated at ``x``.

    func: 'sigmoid', 'tanh' or 'relu'.
    x:    pre-activation values (scalar or array of any shape).
    Returns an array shaped like ``x``.
    Raises ValueError for an unrecognised ``func``.
    """
    if func == 'sigmoid':
        s = sigmoid(x)          #evaluate once and reuse
        return np.multiply(s, 1 - s)
    if func == 'tanh':
        return 1 - np.square(np.tanh(x))
    if func == 'relu':
        #1.0 where x > 0, else 0.0; keeps the shape of x instead of
        #broadcasting against a hard-coded (n, 1) column of ones
        return (np.asarray(x) > 0).astype(float)
    raise ValueError("Unknown activation function: %r" % (func,))

#Function to calculate the losses based on the true class, estimated classes and loss function
 
def Loss_Func(y_hat,y_true,lfunc):
    """Loss of one prediction against its target.

    y_hat:  predicted values (any shape; flattened internally).
    y_true: target values with the same number of elements.
    lfunc:  'cross entropy' -> -sum(y_true * log(y_hat))
            'squared error' -> mean((y_true - y_hat) ** 2)
    Raises ValueError for an unrecognised ``lfunc``.
    """
    yp = np.array(y_hat).reshape(-1)
    yt = np.array(y_true).reshape(-1)
    if lfunc == 'cross entropy':
        #Clip away exact zeros: log(0) is -inf and 0 * -inf is nan, which
        #would poison the summed loss even for classes with zero target.
        safe_yp = np.clip(yp, 1e-12, None)
        return np.sum((-1)*(yt*np.log(safe_yp)))
    if lfunc == 'squared error':
        return np.mean((yt-yp)**2)
    raise ValueError("Unknown loss function: %r" % (lfunc,))


In [None]:
#Initializing the weights and biases with normal random numbers

def init_weights_biases(Layers,init_func,act_func):
    """Create initial weights and biases for every layer after the input.

    Layers:    list of layer sizes, input layer first.
    init_func: 'random' or 'xavier'.
    act_func:  activation name; 'relu' selects a different scaling.

    Returns (W, B): dicts keyed 1..len(Layers)-1 where W[i] has shape
    (Layers[i], Layers[i-1]) and B[i] has shape (Layers[i], 1).

    Schemes:
      random, non-relu : standard normal weights and biases
      random, relu     : normal weights scaled by sqrt(2/(fan_in+fan_out)), zero biases
      xavier, non-relu : uniform in [-1/sqrt(fan_in), 1/sqrt(fan_in))
      xavier, relu     : normal weights and biases scaled by sqrt(2/fan_in)

    Raises ValueError for an unrecognised ``init_func``.
    """
    if init_func not in ('random', 'xavier'):
        raise ValueError("Unknown initialization: %r" % (init_func,))

    W={}    #weights per layer
    B={}    #biases per layer
    for i in range(1,len(Layers)):
        fan_in, fan_out = Layers[i-1], Layers[i]
        if init_func == 'random':
            if act_func != 'relu':
                W[i] = np.random.randn(fan_out, fan_in)
                B[i] = np.random.randn(fan_out, 1)
            else:
                W[i] = np.random.randn(fan_out, fan_in)*(np.sqrt(2/(fan_out+fan_in)))
                B[i] = np.zeros((fan_out, 1))
        else:
            if act_func != 'relu':
                #The lower/upper rescale only gives bounded values when the
                #base sample is uniform, so draw uniformly (not from randn).
                upper = 1/np.sqrt(fan_in)
                lower = -upper
                W[i] = np.random.uniform(lower, upper, size=(fan_out, fan_in))
                B[i] = np.random.uniform(lower, upper, size=(fan_out, 1))
            else:
                scale = np.sqrt(2/fan_in)
                W[i] = np.random.randn(fan_out, fan_in)*scale
                B[i] = np.random.randn(fan_out, 1)*scale

    return W,B


In [None]:
#Initializing the weights and biases with zeros

def init_zeros(Layers):
    """Return zero-filled (weights, biases) dicts shaped like the network.

    Keys run from 1 to len(Layers)-1; entry k of the first dict has shape
    (Layers[k], Layers[k-1]) and entry k of the second has shape (Layers[k], 1).
    """
    layer_ids = range(1, len(Layers))
    zero_weights = {k: np.zeros((Layers[k], Layers[k-1])) for k in layer_ids}
    zero_biases = {k: np.zeros((Layers[k], 1)) for k in layer_ids}
    return zero_weights, zero_biases


In [None]:
#Forward Propagation

def forw_prop(X,W,B,no_hidden_layers,act_func):
    """Forward pass for a single sample.

    X:                one input sample; flattened to a column vector here, so
                      any input size matching W[1] works (not only 784).
    W, B:             dicts of weights / biases keyed 1..no_hidden_layers+1.
    no_hidden_layers: number of hidden layers.
    act_func:         activation name used for every hidden layer.

    Returns (y_hat, A, H): the softmax output, the dict of pre-activations
    and the dict of activations. Index 0 of both dicts holds the input column.
    """
    L=no_hidden_layers+1
    A={}    #pre-activations per layer
    H={}    #activations per layer
    A[0]=X.reshape(-1,1)   #flattened input
    H[0]=X.reshape(-1,1)   #flattened input

    for i in range(1,L):
        A[i]=preActivation(B[i],W[i],H[i-1])
        H[i]=Activation(A[i],act_func)

    #The output layer gets softmax instead of the hidden activation
    A[L]=preActivation(B[L],W[L],H[L-1])
    y_hat=softmax(A[L])
    return y_hat,A,H


In [None]:
#Backward Propagation

def back_prop(y_hat,W,B,A,H,y_true,lfunc,act_func,Layers):
    """Backward pass for a single sample.

    y_hat:    softmax output of the forward pass (column vector).
    W, B:     weight / bias dicts keyed 1..len(Layers)-1 (B is not read).
    A, H:     pre-activations / activations returned by the forward pass.
    y_true:   one-hot target with the same shape as y_hat.
    lfunc:    'cross entropy' or 'squared error' (mean of squared differences).
    act_func: hidden-layer activation name, passed on to ``derivative``.
    Layers:   list of layer sizes, input layer first.

    Returns (delta_W, delta_B): gradient dicts keyed like W and B.
    Raises ValueError for an unrecognised ``lfunc``.
    """
    L=len(Layers)-1
    delta_A={}    #gradients w.r.t. pre-activations
    delta_W={}    #gradients w.r.t. weights
    delta_B={}    #gradients w.r.t. biases

    #Gradient at the output pre-activation (softmax output layer)
    if lfunc=='cross entropy':
        delta_A[L] = -(y_true-y_hat)
    elif lfunc=='squared error':
        #d(mean squared error)/d(y_hat), then chained through the softmax
        #Jacobian: dA_j = y_j * (g_j - sum_k g_k * y_k)
        grad_y = 2.0*(y_hat-y_true)/np.size(y_hat)
        delta_A[L] = y_hat*(grad_y - np.sum(grad_y*y_hat))
    else:
        raise ValueError("Unknown loss function: %r" % (lfunc,))

    for i in range(L, 0, -1):
        delta_W[i] = np.matmul(delta_A[i], H[i-1].T) # + regularization*W[i]
        delta_B[i] = delta_A[i]
        if i > 1:
            #Propagate to the previous hidden layer. Layer 0 is the input,
            #which has no parameters, so nothing is computed for it.
            delta_H = np.matmul(W[i].T, delta_A[i])
            delta_A[i-1] = np.multiply(delta_H, derivative(act_func, A[i-1]))

    return delta_W,delta_B


In [None]:
def loss_compute(x,y,W,B, Layers, act_func, l_func):
    """Average per-sample loss of the network (W, B) over the samples in x.

    Each row of x is reshaped to a (784, 1) column and pushed through the
    forward pass; its loss against the one-hot form of the matching entry
    of y is accumulated and the sum is divided by len(x).
    """
    n_samples = len(x)
    n_hidden = len(Layers) - 2
    running = 0
    for idx in range(n_samples):
        column = x[idx].reshape(784,1)
        target = to_vector(y[idx])
        prediction, _, _ = forw_prop(column, W, B, n_hidden, act_func)
        running += Loss_Func(prediction, target, l_func)

    return running/n_samples

In [None]:
#Predict a class for every sample using forward propagation
def Output(x, W,B,Layers, act_func):
    """Return a list with one argmax prediction per row of x.

    Every row is reshaped to (784, 1) and passed through the forward pass;
    the prediction is ``np.argmax`` of the output along axis 0.
    """
    n_hidden = len(Layers) - 2

    def predict_one(row):
        probabilities, _, _ = forw_prop(row.reshape(784,1), W, B, n_hidden, act_func)
        return np.argmax(probabilities, axis = 0)

    return [predict_one(x[idx]) for idx in range(len(x))]

In [None]:
#Sanity check: validation accuracy and loss of a freshly initialised (untrained) network
Layers=[784,16,16,10]
W,B =init_weights_biases(Layers,'xavier','relu')
y_pred = Output(X_validation,W,B,Layers, 'relu')
print(accuracy_score(Y_validation, y_pred))
print(loss_compute(X_validation,Y_validation,W,B, Layers,'relu','cross entropy'))

In [None]:
# Generates mini batches from the training data
'''def mini_batch_generation(train_data, train_labels, batch_size ):
    
    no_of_training_examples = train_data.shape[1]                 
    batches = []

    number_of_complete_minibatches = math.floor(no_of_training_examples/batch_size)

    for i in range(number_of_complete_minibatches):
        batch_input = train_data[:, i*batch_size : (i+1)*batch_size]
        batch_output = train_labels[:, i*batch_size : (i+1)*batch_size]
        created_batch = (batch_input, batch_output)
        batches.append(created_batch)
    
    if no_of_training_examples % batch_size != 0:
        last_batch_input = train_data[:, int(no_of_training_examples/batch_size)*batch_size : ]
        last_batch_output = train_labels[:, int(no_of_training_examples/batch_size)*batch_size : ]
        last_batch = (last_batch_input, last_batch_output)
        batches.append(last_batch)
    
    return batches'''

In [None]:
#Stochastic Gradient Descent

def StochasticGD(x,y,epochs,eta,batch,no_hidden_layers,hidden_layer_size,act_func,init_func,l_func):
    """Train the network with plain mini-batch gradient descent.

    x, y:              training inputs (one sample per row) and labels.
    epochs:            number of passes over the data.
    eta:               learning rate.
    batch:             samples per parameter update.
    no_hidden_layers:  number of hidden layers.
    hidden_layer_size: units in every hidden layer.
    act_func:          hidden activation name.
    init_func:         weight initialisation name.
    l_func:            loss name.

    After every epoch, accuracy and loss on (x, y) and on the module-level
    X_validation / Y_validation are logged to wandb.

    Returns (W, B, total_loss) where total_loss holds the loss of every
    batch measured right after its update.
    """
    #Store the size of the layers
    Layers=[784] + [hidden_layer_size]*no_hidden_layers + [10]
    n_hidden=len(Layers)-2
    layer_ids=range(1,len(Layers))
    total_loss=[]

    W,B=init_weights_biases(Layers,init_func,act_func)

    for i in range(epochs):
        #Consecutive slices of `batch` samples. The last slice may be shorter
        #and is still used, so no training sample is silently dropped.
        for st in range(0,len(x),batch):
            en=min(st+batch,len(x))
            n_in_batch=en-st
            dW,dB=init_zeros(Layers)

            for j in range(st,en):
                xt,yt=x[j].reshape(-1,1),y[j]
                y_hat,A,H=forw_prop(xt,W,B,n_hidden,act_func)
                y_true=to_vector(yt)
                grad_w,grad_b=back_prop(y_hat,W,B,A,H,y_true,l_func,act_func,Layers)
                for l in layer_ids:
                    dW[l]+= grad_w[l]
                    dB[l]+= grad_b[l]

            #Step along the mean gradient of the batch
            for l in layer_ids:
                W[l]-=eta*dW[l]/n_in_batch
                B[l]-=eta*dB[l]/n_in_batch

            total_loss.append(loss_compute(x[st:en],y[st:en],W,B, Layers,act_func,l_func))

        #Per-epoch evaluation on validation and training data
        acc_val = accuracy_score(Y_validation, Output(X_validation, W,B,Layers, act_func))
        acc_train = accuracy_score(y, Output(x, W,B,Layers, act_func))
        val_loss = loss_compute(X_validation,Y_validation,W,B, Layers, act_func, l_func)
        train_loss = loss_compute(x,y,W,B, Layers, act_func, l_func)

        #One log call so that all metrics of an epoch share the same wandb step
        wandb.log({'epochs': i, 'train_acc': acc_train, 'train_loss': train_loss,
                   'val_acc': acc_val, 'val_loss': val_loss})

    return W,B,total_loss
     
  

In [None]:
#Ws,Bs,l=StochasticGD(X_train,Y_train,5,0.1,64,2,16,'tanh','xavier','cross entropy')

In [None]:
'''y_pred = Output(X_validation,Ws,Bs,[784,16,16,10], 'tanh')
print(accuracy_score(Y_validation, y_pred))'''


In [None]:
'''plt.plot(l)
plt.show()'''

In [None]:
#Momentum Based Gradient Descent

def MomentumBasedGD(x,y,epochs,eta,gamma,batch,no_hidden_layers,hidden_layer_size,act_func,init_func,l_func):
    """Train the network with momentum-based mini-batch gradient descent.

    Update rule per batch: v = gamma * v + eta * mean_gradient; W -= v.

    x, y:              training inputs (one sample per row) and labels.
    epochs:            number of passes over the data.
    eta:               learning rate.
    gamma:             momentum coefficient.
    batch:             samples per parameter update.
    no_hidden_layers:  number of hidden layers.
    hidden_layer_size: units in every hidden layer.
    act_func, init_func, l_func: activation, initialisation and loss names.

    After every epoch, accuracy and loss on (x, y) and on the module-level
    X_validation / Y_validation are logged to wandb.

    Returns (W, B, total_loss) where total_loss holds the loss of every
    batch measured right after its update.
    """
    #Store the size of the layers
    Layers=[784] + [hidden_layer_size]*no_hidden_layers + [10]
    n_hidden=len(Layers)-2
    layer_ids=range(1,len(Layers))
    total_loss=[]

    W,B=init_weights_biases(Layers,init_func,act_func)

    #Velocity (previous update), carried across batches and epochs
    vW,vB=init_zeros(Layers)

    for i in range(epochs):
        #Consecutive slices of `batch` samples. The last slice may be shorter
        #and is still used, so no training sample is silently dropped.
        for st in range(0,len(x),batch):
            en=min(st+batch,len(x))
            n_in_batch=en-st
            dW,dB=init_zeros(Layers)

            for j in range(st,en):
                xt,yt=x[j].reshape(-1,1),y[j]
                y_hat,A,H=forw_prop(xt,W,B,n_hidden,act_func)
                y_true=to_vector(yt)
                grad_w,grad_b=back_prop(y_hat,W,B,A,H,y_true,l_func,act_func,Layers)
                for l in layer_ids:
                    dW[l]+= grad_w[l]
                    dB[l]+= grad_b[l]

            for l in layer_ids:
                #Update(t) = gamma * Update(t-1) + eta * mean gradient
                vW[l]=gamma*vW[l] + eta*dW[l]/n_in_batch
                vB[l]=gamma*vB[l] + eta*dB[l]/n_in_batch
                W[l]-=vW[l]
                B[l]-=vB[l]

            total_loss.append(loss_compute(x[st:en],y[st:en],W,B, Layers,act_func,l_func))

        #Per-epoch evaluation on validation and training data
        acc_val = accuracy_score(Y_validation, Output(X_validation, W,B,Layers, act_func))
        acc_train = accuracy_score(y, Output(x, W,B,Layers, act_func))
        val_loss = loss_compute(X_validation,Y_validation,W,B, Layers, act_func, l_func)
        train_loss = loss_compute(x,y,W,B, Layers, act_func, l_func)

        #One log call so that all metrics of an epoch share the same wandb step
        wandb.log({'epochs': i, 'train_acc': acc_train, 'train_loss': train_loss,
                   'val_acc': acc_val, 'val_loss': val_loss})

    return W,B,total_loss


In [None]:
#Ws,Bs,l=StochasticGD(X_train,Y_train,2,0.1,64,2,16,'tanh','xavier','cross entropy')
#Wm,Bm,lm=MomentumBasedGD(X_train,Y_train,2,0.1,0.9,64,2,16,'tanh','xavier','cross entropy')

In [None]:
'''y_pred = Output(X_validation,Wm,Bm,[784,16,16,10], 'sigmoid')
print(accuracy_score(Y_validation, y_pred))
plt.plot(lm)
plt.show()'''

In [None]:
#Nesterov Accelerated Gradient Descent

def NesterovAGD(x,y,epochs,eta,gamma,batch,no_hidden_layers,hidden_layer_size,act_func,init_func,l_func):
    """Train the network with Nesterov accelerated gradient descent.

    Gradients are evaluated at the look-ahead point W - gamma * v rather
    than at W. The update itself is v = gamma * v + eta * mean_gradient
    followed by W -= v.

    x, y:              training inputs (one sample per row) and labels.
    epochs:            number of passes over the data.
    eta:               learning rate.
    gamma:             momentum coefficient.
    batch:             samples per parameter update.
    no_hidden_layers:  number of hidden layers.
    hidden_layer_size: units in every hidden layer.
    act_func, init_func, l_func: activation, initialisation and loss names.

    After every epoch, accuracy and loss on (x, y) and on the module-level
    X_validation / Y_validation are logged to wandb.

    Returns (W, B, total_loss) where total_loss holds the loss of every
    batch measured (at W, B) right after its update.
    """
    #Store the size of the layers
    Layers=[784] + [hidden_layer_size]*no_hidden_layers + [10]
    n_hidden=len(Layers)-2
    layer_ids=range(1,len(Layers))
    total_loss=[]

    W,B=init_weights_biases(Layers,init_func,act_func)

    #Velocity (previous update), carried across batches and epochs
    vW,vB=init_zeros(Layers)

    #Look-ahead parameters; with zero velocity they start equal to W, B
    look_ahead_W,look_ahead_B=init_zeros(Layers)
    for l in layer_ids:
        look_ahead_W[l]=W[l]-gamma*vW[l]
        look_ahead_B[l]=B[l]-gamma*vB[l]

    for i in range(epochs):
        #Consecutive slices of `batch` samples. The last slice may be shorter
        #and is still used, so no training sample is silently dropped.
        for st in range(0,len(x),batch):
            en=min(st+batch,len(x))
            n_in_batch=en-st
            dW,dB=init_zeros(Layers)

            for j in range(st,en):
                xt,yt=x[j].reshape(-1,1),y[j]
                #Forward and backward pass at the look-ahead point
                y_hat,A,H=forw_prop(xt,look_ahead_W,look_ahead_B,n_hidden,act_func)
                y_true=to_vector(yt)
                grad_w,grad_b=back_prop(y_hat,look_ahead_W,look_ahead_B,A,H,y_true,l_func,act_func,Layers)
                for l in layer_ids:
                    dW[l]+= grad_w[l]
                    dB[l]+= grad_b[l]

            for l in layer_ids:
                #Update(t) = gamma * Update(t-1) + eta * mean gradient
                vW[l]=gamma*vW[l] + eta*dW[l]/n_in_batch
                vB[l]=gamma*vB[l] + eta*dB[l]/n_in_batch
                W[l]-=vW[l]
                B[l]-=vB[l]
                #Look-ahead point for the next batch
                look_ahead_W[l]=W[l]-gamma*vW[l]
                look_ahead_B[l]=B[l]-gamma*vB[l]

            total_loss.append(loss_compute(x[st:en],y[st:en],W,B, Layers,act_func,l_func))

        #Per-epoch evaluation on validation and training data
        acc_val = accuracy_score(Y_validation, Output(X_validation, W,B,Layers, act_func))
        acc_train = accuracy_score(y, Output(x, W,B,Layers, act_func))
        val_loss = loss_compute(X_validation,Y_validation,W,B, Layers, act_func, l_func)
        train_loss = loss_compute(x,y,W,B, Layers, act_func, l_func)

        #One log call so that all metrics of an epoch share the same wandb step
        wandb.log({'epochs': i, 'train_acc': acc_train, 'train_loss': train_loss,
                   'val_acc': acc_val, 'val_loss': val_loss})

    return W,B,total_loss

In [None]:
'''Wn,Bn,ln=NesterovAGD(X_train,Y_train,1,0.1,0.9,64,2,16,'sigmoid','random','cross entropy')'''

In [None]:
'''y_pred = Output(X_validation,Wn,Bn,[784,16,16,10], 'sigmoid')
print(accuracy_score(Y_validation, y_pred))
plt.plot(ln)
plt.show()'''

In [None]:
#Gradient Descent with RMSprop:

def RMSprop(x,y,epochs,eta,beta,eps,batch,no_hidden_layers,hidden_layer_size,act_func,init_func,l_func):
    """Train the network with RMSprop.

    With g the mean gradient of a batch:
        v = beta * v + (1 - beta) * g**2
        W -= eta / sqrt(v + eps) * g

    x, y:              training inputs (one sample per row) and labels.
    epochs:            number of passes over the data.
    eta:               learning rate.
    beta:              decay rate of the squared-gradient average.
    eps:               small constant added inside the square root.
    batch:             samples per parameter update.
    no_hidden_layers:  number of hidden layers.
    hidden_layer_size: units in every hidden layer.
    act_func, init_func, l_func: activation, initialisation and loss names.

    After every epoch, accuracy and loss on (x, y) and on the module-level
    X_validation / Y_validation are logged to wandb.

    Returns (W, B, total_loss) where total_loss holds the loss of every
    batch measured right after its update.
    """
    #Store the size of the layers
    Layers=[784] + [hidden_layer_size]*no_hidden_layers + [10]
    n_hidden=len(Layers)-2
    layer_ids=range(1,len(Layers))
    total_loss=[]

    W,B=init_weights_biases(Layers,init_func,act_func)

    #Running average of squared gradients, carried across batches and epochs
    vW,vB=init_zeros(Layers)

    for i in range(epochs):
        #Consecutive slices of `batch` samples. The last slice may be shorter
        #and is still used, so no training sample is silently dropped.
        for st in range(0,len(x),batch):
            en=min(st+batch,len(x))
            n_in_batch=en-st
            dW,dB=init_zeros(Layers)

            for j in range(st,en):
                xt,yt=x[j].reshape(-1,1),y[j]
                y_hat,A,H=forw_prop(xt,W,B,n_hidden,act_func)
                y_true=to_vector(yt)
                grad_w,grad_b=back_prop(y_hat,W,B,A,H,y_true,l_func,act_func,Layers)
                for l in layer_ids:
                    dW[l]+= grad_w[l]
                    dB[l]+= grad_b[l]

            for l in layer_ids:
                #Square the same mean gradient that is applied below, so the
                #second moment and the step use one consistent scale.
                gW=dW[l]/n_in_batch
                gB=dB[l]/n_in_batch
                vW[l]=beta*vW[l] + (1-beta)*gW**2
                vB[l]=beta*vB[l] + (1-beta)*gB**2
                W[l]-=(eta/(np.sqrt(vW[l]+eps)))*gW
                B[l]-=(eta/(np.sqrt(vB[l]+eps)))*gB

            total_loss.append(loss_compute(x[st:en],y[st:en],W,B, Layers,act_func,l_func))

        #Per-epoch evaluation on validation and training data
        acc_val = accuracy_score(Y_validation, Output(X_validation, W,B,Layers, act_func))
        acc_train = accuracy_score(y, Output(x, W,B,Layers, act_func))
        val_loss = loss_compute(X_validation,Y_validation,W,B, Layers, act_func, l_func)
        train_loss = loss_compute(x,y,W,B, Layers, act_func, l_func)

        #One log call so that all metrics of an epoch share the same wandb step
        wandb.log({'epochs': i, 'train_acc': acc_train, 'train_loss': train_loss,
                   'val_acc': acc_val, 'val_loss': val_loss})

    return W,B,total_loss

In [None]:
#RMSprop run: 5 epochs, eta=0.1, beta=0.9, eps=1e-8, batch=16, 4 hidden layers of 64 units,
#sigmoid activation, random init, cross-entropy loss. Returns final weights, biases and batch losses.
Wr,Br,lr=RMSprop(X_train,Y_train,5,0.1,0.9,1e-8,16,4,64,'sigmoid','random','cross entropy')

In [None]:
'''y_pred = Output(X_validation,Wr,Br,[784,16,16,10], 'sigmoid')
print(accuracy_score(Y_validation, y_pred))
plt.plot(lr)
plt.show()'''

In [None]:
#Gradient Descent with Adam:

def Adam(x,y,epochs,eta,beta1,beta2,eps,batch,no_hidden_layers,hidden_layer_size,act_func,init_func,l_func):
    """Train the network with Adam.

    With g the mean gradient of a batch and t the number of updates so far:
        m = beta1 * m + (1 - beta1) * g
        v = beta2 * v + (1 - beta2) * g**2
        m_hat = m / (1 - beta1**t),  v_hat = v / (1 - beta2**t)
        W -= eta / sqrt(v_hat + eps) * m_hat

    x, y:              training inputs (one sample per row) and labels.
    epochs:            number of passes over the data.
    eta:               learning rate.
    beta1, beta2:      decay rates of the first / second moment averages.
    eps:               small constant added inside the square root.
    batch:             samples per parameter update.
    no_hidden_layers:  number of hidden layers.
    hidden_layer_size: units in every hidden layer.
    act_func, init_func, l_func: activation, initialisation and loss names.

    After every epoch, accuracy and loss on (x, y) and on the module-level
    X_validation / Y_validation are logged to wandb.

    Returns (W, B, total_loss) where total_loss holds the loss of every
    batch measured right after its update.
    """
    #Store the size of the layers
    Layers=[784] + [hidden_layer_size]*no_hidden_layers + [10]
    n_hidden=len(Layers)-2
    layer_ids=range(1,len(Layers))
    total_loss=[]

    t=0     #number of parameter updates performed so far
    W,B=init_weights_biases(Layers,init_func,act_func)

    #First and second moment estimates, carried across batches and epochs
    mW,mB=init_zeros(Layers)
    vW,vB=init_zeros(Layers)

    for i in range(epochs):
        #Iterate over the samples that were passed in (x), in consecutive
        #slices of `batch`. The last slice may be shorter and is still used.
        for st in range(0,len(x),batch):
            en=min(st+batch,len(x))
            n_in_batch=en-st
            dW,dB=init_zeros(Layers)

            for j in range(st,en):
                xt,yt=x[j].reshape(-1,1),y[j]
                y_hat,A,H=forw_prop(xt,W,B,n_hidden,act_func)
                y_true=to_vector(yt)
                grad_w,grad_b=back_prop(y_hat,W,B,A,H,y_true,l_func,act_func,Layers)
                for l in layer_ids:
                    dW[l]+= grad_w[l]
                    dB[l]+= grad_b[l]

            t+=1
            for l in layer_ids:
                #Both moments are built from the same mean gradient
                gW=dW[l]/n_in_batch
                gB=dB[l]/n_in_batch

                mW[l]=beta1*mW[l] + (1-beta1)*gW
                mB[l]=beta1*mB[l] + (1-beta1)*gB

                vW[l]=beta2*vW[l] + (1-beta2)*gW**2
                vB[l]=beta2*vB[l] + (1-beta2)*gB**2

                #Bias correction for the zero-initialised moments
                mW_hat=mW[l]/(1-np.power(beta1,t))
                mB_hat=mB[l]/(1-np.power(beta1,t))
                vW_hat=vW[l]/(1-np.power(beta2,t))
                vB_hat=vB[l]/(1-np.power(beta2,t))

                W[l]-=(eta/(np.sqrt(vW_hat+eps)))*mW_hat
                B[l]-=(eta/(np.sqrt(vB_hat+eps)))*mB_hat

            total_loss.append(loss_compute(x[st:en],y[st:en],W,B, Layers,act_func,l_func))

        #Per-epoch evaluation on validation and training data
        acc_val = accuracy_score(Y_validation, Output(X_validation, W,B,Layers, act_func))
        acc_train = accuracy_score(y, Output(x, W,B,Layers, act_func))
        val_loss = loss_compute(X_validation,Y_validation,W,B, Layers, act_func, l_func)
        train_loss = loss_compute(x,y,W,B, Layers, act_func, l_func)

        #One log call so that all metrics of an epoch share the same wandb step
        wandb.log({'epochs': i, 'train_acc': acc_train, 'train_loss': train_loss,
                   'val_acc': acc_val, 'val_loss': val_loss})

    return W,B,total_loss

In [None]:
#Adam run: 5 epochs, eta=0.1, beta1=0.9, beta2=0.999, eps=1e-8, batch=16, 4 hidden layers of 64 units,
#sigmoid activation, random init, cross-entropy loss. Returns final weights, biases and batch losses.
Wa,Ba,la=Adam(X_train,Y_train,5,0.1,0.9,0.999,1e-8,16,4,64,'sigmoid','random','cross entropy')





In [None]:
'''y_pred = Output(X_validation,Wa,Ba,[784,16,16,10], 'sigmoid')
print(accuracy_score(Y_validation, y_pred))
plt.plot(la)
plt.show()'''

In [None]:
#Gradient Descent with Nadam:

def Nadam(x,y,epochs,eta,gamma,beta1,beta2,eps,batch,no_hidden_layers,hidden_layer_size,act_func,init_func,l_func):
    """Train the network with an Adam update evaluated at a Nesterov-style look-ahead point.

    Gradients are computed at look_ahead = W - gamma * (last applied step).
    With g the mean look-ahead gradient of a batch and t the update count:
        m = beta1 * m + (1 - beta1) * g
        v = beta2 * v + (1 - beta2) * g**2
        step = eta / sqrt(v_hat + eps) * m_hat   (bias-corrected moments)
        W -= step
    The look-ahead offset uses the last parameter step. It must not use the
    second-moment average v, which is a squared gradient and not a step.
    NOTE(review): this keeps the look-ahead structure of this notebook; it is
    not the closed-form Nadam update that folds the look-ahead into m_hat.

    x, y:              training inputs (one sample per row) and labels.
    epochs:            number of passes over the data.
    eta:               learning rate.
    gamma:             look-ahead coefficient.
    beta1, beta2:      decay rates of the first / second moment averages.
    eps:               small constant added inside the square root.
    batch:             samples per parameter update.
    no_hidden_layers:  number of hidden layers.
    hidden_layer_size: units in every hidden layer.
    act_func, init_func, l_func: activation, initialisation and loss names.

    After every epoch, accuracy and loss on (x, y) and on the module-level
    X_validation / Y_validation are logged to wandb.

    Returns (W, B, total_loss) where total_loss holds the loss of every
    batch measured (at W, B) right after its update.
    """
    #Store the size of the layers
    Layers=[784] + [hidden_layer_size]*no_hidden_layers + [10]
    n_hidden=len(Layers)-2
    layer_ids=range(1,len(Layers))
    total_loss=[]

    t=0     #number of parameter updates performed so far
    W,B=init_weights_biases(Layers,init_func,act_func)

    #First and second moment estimates, carried across batches and epochs
    mW,mB=init_zeros(Layers)
    vW,vB=init_zeros(Layers)

    #Most recent parameter step (zero before the first update)
    stepW,stepB=init_zeros(Layers)

    #Look-ahead parameters; with a zero step they start equal to W, B
    look_ahead_W,look_ahead_B=init_zeros(Layers)
    for l in layer_ids:
        look_ahead_W[l]=W[l]-gamma*stepW[l]
        look_ahead_B[l]=B[l]-gamma*stepB[l]

    for i in range(epochs):
        #Consecutive slices of `batch` samples. The last slice may be shorter
        #and is still used, so no training sample is silently dropped.
        for st in range(0,len(x),batch):
            en=min(st+batch,len(x))
            n_in_batch=en-st
            dW,dB=init_zeros(Layers)

            for j in range(st,en):
                xt,yt=x[j].reshape(-1,1),y[j]
                #Forward and backward pass at the look-ahead point
                y_hat,A,H=forw_prop(xt,look_ahead_W,look_ahead_B,n_hidden,act_func)
                y_true=to_vector(yt)
                grad_w,grad_b=back_prop(y_hat,look_ahead_W,look_ahead_B,A,H,y_true,l_func,act_func,Layers)
                for l in layer_ids:
                    dW[l]+= grad_w[l]
                    dB[l]+= grad_b[l]

            t+=1
            for l in layer_ids:
                #Both moments are built from the same mean gradient
                gW=dW[l]/n_in_batch
                gB=dB[l]/n_in_batch

                mW[l]=beta1*mW[l] + (1-beta1)*gW
                mB[l]=beta1*mB[l] + (1-beta1)*gB

                vW[l]=beta2*vW[l] + (1-beta2)*gW**2
                vB[l]=beta2*vB[l] + (1-beta2)*gB**2

                #Bias correction for the zero-initialised moments
                mW_hat=mW[l]/(1-np.power(beta1,t))
                mB_hat=mB[l]/(1-np.power(beta1,t))
                vW_hat=vW[l]/(1-np.power(beta2,t))
                vB_hat=vB[l]/(1-np.power(beta2,t))

                stepW[l]=(eta/(np.sqrt(vW_hat+eps)))*mW_hat
                stepB[l]=(eta/(np.sqrt(vB_hat+eps)))*mB_hat
                W[l]-=stepW[l]
                B[l]-=stepB[l]

                #Look-ahead point for the next batch
                look_ahead_W[l]=W[l]-gamma*stepW[l]
                look_ahead_B[l]=B[l]-gamma*stepB[l]

            total_loss.append(loss_compute(x[st:en],y[st:en],W,B, Layers,act_func,l_func))

        #Per-epoch evaluation on validation and training data
        acc_val = accuracy_score(Y_validation, Output(X_validation, W,B,Layers, act_func))
        acc_train = accuracy_score(y, Output(x, W,B,Layers, act_func))
        val_loss = loss_compute(X_validation,Y_validation,W,B, Layers, act_func, l_func)
        train_loss = loss_compute(x,y,W,B, Layers, act_func, l_func)

        #One log call so that all metrics of an epoch share the same wandb step
        wandb.log({'epochs': i, 'train_acc': acc_train, 'train_loss': train_loss,
                   'val_acc': acc_val, 'val_loss': val_loss})

    return W,B,total_loss



In [None]:
#Nadam run: 5 epochs, eta=0.1, gamma=0.9, beta1=0.9, beta2=0.999, eps=1e-8, batch=16,
#4 hidden layers of 64 units, sigmoid activation, random init, cross-entropy loss.
Wna,Bna,lna=Nadam(X_train,Y_train,5,0.1,0.9,0.9,0.999,1e-8,16,4,64,'sigmoid','random','cross entropy')

In [None]:
'''y_pred = Output(X_validation,Wna,Bna,[784,16,16,10], 'sigmoid')
print(accuracy_score(Y_validation, y_pred))
plt.plot(lna)
plt.show()'''

In [None]:
#Hyperparameter sweep definition for wandb.
#Parameter names must match the keys train() reads from wandb.config, and the
#metric name must match a key the optimizers pass to wandb.log.
sweep_config = {
    'method': 'bayes',
    'metric': {
      'name': 'val_acc',    #validation accuracy logged once per epoch
      'goal': 'maximize'
    },
    'early_terminate': {
            'type': 'hyperband',
            'min_iter': 3,
            's': 2
    },
    'parameters': {
        'num_of_epochs': {
            'values': [5,10] #number of epochs
        },
        'number_of_hidden_layers': {
            'values': [3, 4, 5] #number of hidden layers
        },
        'hidden_layer_size': {
            'values': [32, 64, 128] #size of every hidden layer
        },
        'weight_decay': {
            'values': [0, 0.0005,  0.5] #L2 regularisation (not read by train() as written)
        },
        'learning_rate': {
            'values': [1e-3, 1e-4] #values of eta
        },
        'optimizer': {
            'values': [ 'sgd', 'momentum', 'nesterov', 'rmsprop', 'adam', 'nadam'] #algorithms for training
        },
        'batch_size' : {
            'values':[16, 32, 64] #Sizes of batches to be used
        },
        'weight_initialization': {
            'values': ['random','xavier'] #Types of initializations
        },
        'activation': {
            'values': ['sigmoid','tanh','relu'] #Types of activations
        }

        }
}

In [None]:
# Register a new sweep with Weights & Biases and keep the value it returns in sweep_id
# Arguments:
# sweep_config: the sweep config dictionary defined above
# entity: the W&B entity (user or team) for the sweep
# project: the W&B project name for the sweep
sweep_id = wandb.sweep(sweep_config, entity="nomads", project="CS6910_DL_Assignment1")

In [None]:
def train():
    """Run one training job configured through wandb (used as the sweep entry point).

    Starts a wandb run with the defaults below (a sweep overrides any key it
    defines), names the run after its main hyperparameters and trains on the
    module-level X_train / Y_train with the optimizer selected by
    ``config.optimizer``.

    Raises ValueError if ``config.optimizer`` is not one of
    'sgd', 'momentum', 'nesterov', 'rmsprop', 'adam', 'nadam'.
    """
    config_defaults = {
        'num_of_epochs': 5,
        'number_of_hidden_layers': 3,
        'hidden_layer_size': 64,
        'loss_function':'cross entropy',
        'learning_rate': 1e-3,
        'optimizer': 'adam',
        'batch_size': 64,
        'activation': 'relu',
        'weight_initialization': 'xavier',
        'gamma' : 0.9,
        'epsilon' : 1e-5,
        'beta': 0.95,
        'beta1' : 0.9,
        'beta2' : 0.999
    }

    wandb.init(config=config_defaults,resume=True)
    config = wandb.config
    wandb.run.name = "hl_" + str(config.number_of_hidden_layers)+ "_layer-size_" + str(config.hidden_layer_size) +"lf"+config.loss_function+ "_bs_"+str(config.batch_size)+"_ac_"+ (config.activation) + "_init_" + (config.weight_initialization) + "_opt_" + (config.optimizer)

    #Arguments shared by every optimizer: the leading ones ...
    head = (X_train, Y_train, config.num_of_epochs, config.learning_rate)
    #... and the trailing ones; optimizer-specific arguments go in between
    tail = (config.batch_size, config.number_of_hidden_layers, config.hidden_layer_size,
            config.activation, config.weight_initialization, config.loss_function)

    if config.optimizer=="sgd":
        StochasticGD(*head, *tail)

    elif config.optimizer=="momentum":
        MomentumBasedGD(*head, config.gamma, *tail)

    elif config.optimizer=='nesterov':
        NesterovAGD(*head, config.gamma, *tail)

    elif config.optimizer=="rmsprop":
        RMSprop(*head, config.beta, config.epsilon, *tail)

    elif config.optimizer=='adam':
        Adam(*head, config.beta1, config.beta2, config.epsilon, *tail)

    elif config.optimizer=='nadam':
        Nadam(*head, config.gamma, config.beta1, config.beta2, config.epsilon, *tail)

    else:
        raise ValueError("Unknown optimizer: %r" % (config.optimizer,))

In [None]:
#Run one trial of the sweep created above (sweep_id) instead of a hard-coded id from an earlier session
wandb.agent(sweep_id, train, entity="nomads", project="CS6910_DL_Assignment1", count=1)