<a href="https://colab.research.google.com/github/Swathi1309/ED18B034_ME18B133_Assignment1/blob/main/Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Image Classification using MLP**












# Installation and initialization of packages



In [None]:
!pip install wandb
!wandb login

In [None]:
import numpy as np
from numpy import expand_dims

import keras
from keras.datasets import fashion_mnist
from keras.datasets import mnist
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import ImageDataGenerator

import tensorflow as tf
from tensorflow.keras import initializers

import random
import pprint
import wandb
from PIL import Image

# Start the wandb run that the logging calls in this notebook write to.
wandb.init(project="CS6910-assg1", entity="swathi")
# Class names indexed by integer label; used for image captions and to size the output layer.
classes = ['T-shirt/top','Trouser','Pullover','Dress','Coat','Sandal','Shirt','Sneaker','Bag','Ankle boot']

# Loading dataset and example images

In [63]:
def augmentation(x, y, frac_hor, frac_vert, frac_flip):
  """Append shifted and mirrored copies of randomly chosen images to a dataset.

  For each transform, ``int(frac * len(x))`` images are drawn at random (with
  replacement) from ``x``, transformed, and appended after the originals
  together with the labels of the images they were derived from.

  Args:
    x: image array of shape (n, height, width).
    y: 1-D label array of length n.
    frac_hor: fraction of n to add as +/-1 pixel horizontal shifts.
    frac_vert: fraction of n to add as +/-1 pixel vertical shifts.
    frac_flip: fraction of n to add as left-right mirrored copies.

  Returns:
    (x_new, y_new): the originals followed by the horizontal-shift,
    vertical-shift and flipped samples, in that order.
  """
  n_images = x.shape[0]

  def _draw(frac):
    # randint's upper bound is exclusive, so pass n_images to make every image eligible.
    idx = np.random.randint(n_images, size=int(frac*n_images))
    return idx, np.array(x[idx])

  def _transform(images, datagen):
    if images.shape[0] == 0:
      return images                       # nothing requested for this transform
    # Keras expects a trailing channel axis. shuffle=False keeps the output in
    # the same order as the sampled indices so the labels still match.
    it = datagen.flow(images.reshape(images.shape + (1,)),
                      batch_size=images.shape[0], shuffle=False)
    return next(it).reshape(images.shape)

  # Horizontal shift
  idx_hor, picked = _draw(frac_hor)
  shifted_hor = _transform(picked, ImageDataGenerator(width_shift_range=[-1,1]))

  # Vertical shift
  idx_vert, picked = _draw(frac_vert)
  shifted_vert = _transform(picked, ImageDataGenerator(height_shift_range=[-1,1]))

  # Flip (mirror along the width axis)
  idx_flip, picked = _draw(frac_flip)
  flipped = picked[:, :, ::-1]

  x_new = np.concatenate([x, shifted_hor, shifted_vert, flipped], axis=0)
  y_new = np.concatenate([y, y[idx_hor], y[idx_vert], y[idx_flip]])
  return x_new,y_new

In [69]:
def load_data(dataset = 'fashion_mnist'):
  """Load a Keras image dataset and return column-major train/validation/test arrays.

  The first 90% of the Keras training split is augmented (20% extra samples
  for each of the three transforms in ``augmentation``) and used for
  training; the remaining images form the validation set.

  Args:
    dataset: 'fashion_mnist' (default) or 'mnist'.

  Returns:
    X_train, Y_train, X_validation, Y_validation, X_test, Y_test where every
    X has shape (features, samples) scaled by 1/255 and every Y has shape
    (1, samples) with integer labels.

  Raises:
    ValueError: if ``dataset`` is not one of the supported names.
  """
  if dataset == 'fashion_mnist':
    (x_train,y_train),(x_test,y_test) = fashion_mnist.load_data()
  elif dataset == 'mnist':
    (x_train, y_train),(x_test,y_test) = mnist.load_data()
  else:
    raise ValueError("Unknown dataset %r; expected 'fashion_mnist' or 'mnist'" % (dataset,))

  num_train = int(0.9*x_train.shape[0])
  features = x_train.shape[1]*x_train.shape[2]

  def _as_columns(images):
    # (samples, h, w) -> (features, samples), scaled by 1/255
    return np.transpose(np.array(images,dtype=int).reshape([images.shape[0],features])/255)

  def _as_row(labels):
    # (samples,) -> (1, samples)
    return np.array(labels,dtype=int).reshape([1,-1])

  x_train_new , y_train_new = augmentation(x_train[:num_train],y_train[:num_train],0.2,0.2,0.2)

  # The validation set is whatever is left after the training slice, so its
  # size always matches the slice even when the split is not exact.
  X_train = _as_columns(x_train_new)
  Y_train = _as_row(y_train_new)
  X_validation = _as_columns(x_train[num_train:])
  Y_validation = _as_row(y_train[num_train:])
  X_test = _as_columns(x_test)
  Y_test = _as_row(y_test)
  return X_train, Y_train, X_validation, Y_validation, X_test, Y_test

In [47]:
def log_images():
  """Log one randomly chosen training image per class to wandb under "Examples".

  Reads the notebook globals ``X_train`` (features x samples), ``Y_train``
  (1 x samples, integer labels) and ``classes``. Images are assumed to be
  square, as the side length is taken as the square root of the feature count.
  Classes with no training sample are skipped.
  """
  # reshape needs integer dimensions; np.sqrt returns a float.
  side = int(round(np.sqrt(X_train.shape[0])))
  images = []
  labels = []
  for label, name in enumerate(classes):
    candidates = np.flatnonzero(Y_train[0] == label)
    if candidates.size == 0:
      continue
    j = np.random.choice(candidates)
    images.append(X_train[:,j].reshape([side,side]))
    labels.append(name)
  wandb.log({"Examples": [wandb.Image(img, caption=caption) for img,caption in zip(images,labels)]})

# Parameter initialization, activation functions and loss functions

Weight initialization

In [None]:
# Random initialization
def initialize_weights_random(N, weight_factor):
  """Draw Gaussian weights and zero biases for a network with layer sizes ``N``.

  Args:
    N: list of layer sizes, input layer first and output layer last.
    weight_factor: scalar multiplied into every sampled weight.

  Returns:
    (W, b): lists indexed by layer number. Index 0 is a placeholder 0 so that
    W[i] (shape N[i] x N[i-1]) and b[i] (shape N[i] x 1) belong to layer i.

  Note: reseeds numpy's global RNG with 0 on every call, so the returned
  weights are the same for the same ``N``.
  """
  np.random.seed(0)
  # Derive the depth from N itself instead of relying on a notebook-global `l`.
  num_layers = len(N) - 1
  W = [0]
  b = [0]
  for i in range(1,num_layers+1):
    W.append(np.random.randn(N[i],N[i-1])*weight_factor)
    b.append(np.zeros((N[i],1)))
  return W,b

# Xavier initialization 
def initialize_weights_xavier(N, weight_factor):
  """Draw Glorot-normal weights and zero biases for a network with layer sizes ``N``.

  Args:
    N: list of layer sizes, input layer first and output layer last.
    weight_factor: scalar multiplied into every sampled weight.

  Returns:
    (W, b): lists indexed by layer number. Index 0 is a placeholder 0 so that
    W[i] (shape N[i] x N[i-1]) and b[i] (shape N[i] x 1) belong to layer i.
  """
  initializer = tf.keras.initializers.GlorotNormal()
  # Derive the depth from N itself instead of relying on a notebook-global `l`.
  num_layers = len(N) - 1
  W = [0]
  b = [0]
  for i in range(1,num_layers+1):
    values = initializer(shape=(N[i],N[i-1]))
    W.append(np.array(values)*weight_factor)
    b.append(np.zeros((N[i],1)))
  return W,b

# Choosing initialization function
def initialize_weights(N,init, act):
  """Initialise weights and biases with the requested scheme.

  Args:
    N: list of layer sizes, input layer first.
    init: 'random' or 'xavier'.
    act: hidden activation name; 'relu' scales the weights by 1e-2,
      every other value leaves them unscaled.

  Returns:
    (W, b) as produced by the selected initialiser.

  Raises:
    ValueError: if ``init`` is not a known scheme (previously this returned
      None and failed later when the caller unpacked the result).
  """
  if init not in ('random', 'xavier'):
    raise ValueError("Unknown init %r; expected 'random' or 'xavier'" % (init,))
  weight_factor = 10**-2 if act =='relu' else 1
  if init=='random':
    return initialize_weights_random(N, weight_factor)
  return initialize_weights_xavier(N, weight_factor)

One hot encoding

In [None]:
def one_hot(X,y):
  """Return a (10, samples) one-hot matrix for the labels in row 0 of ``y``.

  ``X`` is only used for its column count, which fixes the number of samples.
  """
  num_samples = X.shape[1]
  encoded = np.zeros((10,num_samples))
  # Set entry (label, sample) to 1 for every sample in one indexing step.
  encoded[y[0,:num_samples], np.arange(num_samples)] = 1
  return encoded

Activation functions

In [None]:
# Softmax
def softmax(a):
  """Column-wise softmax of ``a`` (classes along axis 0).

  The per-column maximum is subtracted before exponentiating. This leaves the
  result unchanged mathematically but prevents overflow (and NaN outputs) for
  large logits.
  """
  shifted = a - np.max(a, axis=0, keepdims=True)
  exps = np.exp(shifted)
  return exps/np.sum(exps, axis=0, keepdims=True)

# Sigmoid 
def sigmoid(a):
  """Logistic function 1/(1+exp(-a)), applied element-wise."""
  denominator = 1+np.exp(-a)
  return 1/denominator

# Sigmoid derivative
def dSigmoid(a):
  """Derivative of the logistic function at ``a``: s*(1-s) with s = sigmoid(a)."""
  s = sigmoid(a)
  return s*(1-s)

# Tanh
def tanh(a):
  """Hyperbolic tangent of ``a``, element-wise (thin wrapper over numpy)."""
  squashed = np.tanh(a)
  return squashed

# Tanh derivative
def dTanh(a):
  """Derivative of tanh at ``a``: 1 - tanh(a)^2."""
  t = tanh(a)
  return 1 - np.square(t)

# ReLU
def ReLU(a):
  """Rectified linear unit: element-wise max(a, 0)."""
  rectified = np.maximum(a, 0)
  return rectified

# ReLU derivative
def dReLU(a):
  """Derivative of ReLU: 1 where ``a >= 0`` and 0 elsewhere.

  Returns a new array with the same dtype as ``a``. The input is left
  untouched; the previous version overwrote the caller's pre-activations in
  place.
  """
  arr = np.asarray(a)
  return (arr >= 0).astype(arr.dtype)

# Choosing activation function
def activation(a,act):
  """Apply the hidden-layer activation named ``act`` to ``a``.

  Args:
    a: pre-activation array.
    act: 'sigmoid', 'tanh' or 'relu'.

  Raises:
    ValueError: for any other name (previously None was returned silently).
  """
  if act=='sigmoid':
    return sigmoid(a)
  if act=='tanh':
    return tanh(a)
  if act=='relu':
    return ReLU(a)
  raise ValueError("Unknown activation %r; expected 'sigmoid', 'tanh' or 'relu'" % (act,))

# Choosing derivative of activation function
def dActivation(a,act):
  """Evaluate the derivative of the activation named ``act`` at ``a``.

  Args:
    a: pre-activation array.
    act: 'sigmoid', 'tanh' or 'relu'.

  Raises:
    ValueError: for any other name (previously None was returned silently).
  """
  if act=='sigmoid':
    return dSigmoid(a)
  if act=='tanh':
    return dTanh(a)
  if act=='relu':
    return dReLU(a)
  raise ValueError("Unknown activation %r; expected 'sigmoid', 'tanh' or 'relu'" % (act,))

Loss functions with L2 regularization, accuracy function

In [None]:
# Cross entropy loss
def cross_entropy_loss(y_hat,Y, W, alpha):
  """Mean cross-entropy loss with an L2 weight penalty.

  Computes ``(sum_i -log(y_hat[label_i, i]) + alpha/2 * sum_k ||W[k]||^2) / m``.
  This matches the ``alpha*W[i]/m`` term that ``backprop`` adds to the weight
  gradients. The previous version ignored ``alpha`` and added the un-squared
  norm.

  Args:
    y_hat: (classes, m) predicted probabilities.
    Y: (1, m) integer labels.
    W: list of weight matrices; W[0] is a placeholder and is skipped.
    alpha: L2 regularisation strength.

  Returns:
    The scalar loss.
  """
  num_samples = Y.shape[1]
  picked = y_hat[Y[0], np.arange(num_samples)]
  # Clamp to the smallest positive float so a zero probability gives a large
  # finite loss instead of inf.
  data_loss = -np.sum(np.log(np.maximum(picked, np.finfo(float).tiny)))
  penalty = 0.5*alpha*sum(np.sum(np.square(W[i])) for i in range(1, len(W)))
  return (data_loss + penalty)/num_samples

# Cross entropy loss derivative
def d_cross_entropy_loss(y_hat, X, y):
  """Gradient of softmax cross-entropy w.r.t. the output pre-activations: y_hat - one_hot(y)."""
  targets = one_hot(X,y)
  return y_hat - targets

# Mean squared loss
def l2_loss(y_hat,Y,W,alpha):
  """Mean squared-error loss against one-hot targets, with an L2 weight penalty.

  Computes ``(0.5 * sum (y_hat - onehot(Y))^2 + alpha/2 * sum_k ||W[k]||^2) / m``.
  The previous version subtracted the raw label row from every class row
  instead of one-hot targets, used the un-squared norm, and ignored ``alpha``.

  Args:
    y_hat: (classes, m) predicted probabilities.
    Y: (1, m) integer labels.
    W: list of weight matrices; W[0] is a placeholder and is skipped.
    alpha: L2 regularisation strength.

  Returns:
    The scalar loss.
  """
  num_classes, num_samples = y_hat.shape
  targets = np.zeros((num_classes, num_samples))
  targets[Y[0], np.arange(num_samples)] = 1
  data_loss = 0.5*np.sum(np.square(y_hat - targets))
  penalty = 0.5*alpha*sum(np.sum(np.square(W[i])) for i in range(1, len(W)))
  return (data_loss + penalty)/Y.shape[1]

# Mean squared loss derivative
def d_l2_loss(y_hat,X, Y):
  """Gradient used for the squared-error loss: (y_hat - one_hot(Y)) * y_hat * (1 - y_hat).

  Args:
    y_hat: (classes, m) predicted probabilities.
    X: input batch; only its column count is used, by ``one_hot``.
    Y: (1, m) integer labels.
  """
  # The parameter is `Y`; the previous body referenced an undefined lower-case
  # `y`, which raised NameError or silently picked up an unrelated global.
  error = y_hat - one_hot(X,Y)
  return np.multiply(error, np.multiply(y_hat,(1-y_hat)))

# Choosing loss function
def Loss(y_hat, y, W, alpha, choose_loss='crossentropy'):
  """Evaluate the selected loss.

  Args:
    y_hat: (classes, m) predicted probabilities.
    y: (1, m) integer labels.
    W: list of weight matrices (index 0 is a placeholder).
    alpha: L2 regularisation strength.
    choose_loss: 'crossentropy' (default) or 'l2'.

  Raises:
    ValueError: for any other loss name (previously None was returned silently).
  """
  if choose_loss == 'crossentropy':
    return cross_entropy_loss(y_hat,y,W, alpha)
  if choose_loss =='l2':
    return l2_loss(y_hat,y,W, alpha)
  raise ValueError("Unknown loss %r; expected 'crossentropy' or 'l2'" % (choose_loss,))

# Choosing derivative of loss function
def dLoss(y_hat,X, y, choose_loss='crossentropy'):
  """Evaluate the output-layer gradient for the selected loss.

  Args:
    y_hat: (classes, m) predicted probabilities.
    X: input batch (passed through to the loss derivative).
    y: (1, m) integer labels.
    choose_loss: 'crossentropy' (default) or 'l2'.

  Raises:
    ValueError: for any other loss name (previously None was returned silently).
  """
  if choose_loss == 'crossentropy':
    return d_cross_entropy_loss(y_hat, X, y)
  if choose_loss =='l2':
    return d_l2_loss(y_hat, X, y)
  raise ValueError("Unknown loss %r; expected 'crossentropy' or 'l2'" % (choose_loss,))

Calculating accuracy and loss

In [None]:
def accuracy(y_hat,Y):
  """Fraction of entries where the predicted labels equal ``Y`` (both shaped (1, m))."""
  matches = np.equal(y_hat, Y)
  return matches.sum()/Y.shape[1]
 
def loss_and_accuracy(W, b, X, Y, act, alpha, choose_loss='crossentropy'):
  """Run a forward pass on ``X`` and return ``(loss, accuracy)`` against ``Y``.

  Args:
    W, b: per-layer weights and biases (index 0 is a placeholder).
    X: (features, m) inputs.
    Y: (1, m) integer labels.
    act: hidden activation name.
    alpha: L2 regularisation strength passed to the loss.
    choose_loss: 'crossentropy' (default) or 'l2'.

  The layer count and sizes are derived from ``W`` so the function does not
  depend on the notebook-global ``N`` and ``l``.
  """
  num_layers = len(W) - 1
  layer_sizes = [W[1].shape[1]] + [W[k].shape[0] for k in range(1, num_layers+1)]
  y_hat = feedforward(X, W, b, layer_sizes, num_layers, act)[2]
  y_predict = np.argmax(y_hat,axis=0).reshape([1,X.shape[1]])
  loss = Loss(y_hat, Y, W, alpha, choose_loss)
  acc = accuracy(y_predict,Y)
  return loss,acc

# Feedforward and Backpropagation

Feedforward

In [None]:
def feedforward(X_input, W, b, N, l, act):
  """Forward pass through the network.

  Args:
    X_input: (features, m) input batch; a 1-D vector of length ``features``
      is treated as a single column.
    W, b: per-layer weights and biases (index 0 is a placeholder).
    N: layer sizes (not used by the computation; kept for the call signature).
    l: number of layers with weights (hidden layers plus the output layer).
    act: hidden activation name.

  Returns:
    (a, h, y_hat): pre-activations a[1..l] (a[0] is a placeholder),
    activations h[0..l-1] with h[0] the input, and the softmax output.
  """
  X_input = np.asarray(X_input)
  if X_input.ndim == 1:
    # A flat sample would broadcast against the (n,1) biases into an (n,n)
    # matrix, so make it an explicit column.
    X_input = X_input.reshape(-1, 1)

  a = [np.array([0])]
  h = [X_input]
  for i in range(1,l):
    pre_activation = b[i]+np.matmul(W[i],h[i-1])
    a.append(pre_activation)
    h.append(activation(pre_activation,act))

  output_pre_activation = b[l]+np.matmul(W[l],h[l-1])
  a.append(output_pre_activation)
  y_hat = softmax(output_pre_activation)

  return a,h,y_hat

Backpropagation

In [None]:
def backprop(X, y,y_hat,W,a,h,act,alpha, choose_loss='crossentropy'):
  """Back-propagate the loss gradient through the network.

  Args:
    X: (features, m) input batch.
    y: (1, m) integer labels.
    y_hat: (classes, m) network output from ``feedforward``.
    W: per-layer weights (index 0 is a placeholder).
    a, h: pre-activations and activations returned by ``feedforward``.
    act: hidden activation name.
    alpha: L2 regularisation strength; ``alpha*W[i]`` is added to each
      weight gradient before averaging over the batch.
    choose_loss: 'crossentropy' (default) or 'l2'.

  Returns:
    (grad_W, grad_b): lists indexed like ``W``; index 0 is a placeholder 0.
  """
  # Derive the depth from W instead of relying on a notebook-global `l`.
  num_layers = len(W) - 1
  batch = X.shape[1]
  grad_W = [0]*(num_layers+1)
  grad_b = [0]*(num_layers+1)

  da = dLoss(y_hat, X, y, choose_loss)
  for i in range(num_layers, 0, -1):
    grad_W[i] = (np.matmul(da,np.transpose(h[i-1])) + alpha*W[i])/batch
    grad_b[i] = np.sum(da, axis=1, keepdims=True)/batch
    if i > 1:
      # Propagate to the previous layer. Nothing lies below layer 1, so the
      # input-layer step (which only touched the a[0] placeholder) is skipped.
      dh_prev = np.matmul(np.transpose(W[i]),da)
      da = dh_prev*dActivation(a[i-1],act)

  return grad_W,grad_b


# Optimizers

In [None]:
# SGD
def gradient_descent(epochs, learn_rate, batch_size, act, init, N, alpha):
  """Train with mini-batch gradient descent and log per-epoch metrics to wandb.

  Reads the notebook globals X_train, Y_train, X_validation and Y_validation.

  Args:
    epochs: number of passes over the training data.
    learn_rate: step size.
    batch_size: number of samples per mini-batch.
    act: hidden activation name.
    init: weight initialisation scheme.
    N: list of layer sizes.
    alpha: weight-decay / L2 strength.

  Returns:
    (W, b): the trained weights and biases.
  """
  W, b = initialize_weights(N,init, act)
  num_layers = len(N) - 1                 # derived from N rather than the global `l`
  layers = range(1, num_layers+1)
  n = int(np.ceil(X_train.shape[1]/batch_size))
  for i in range(epochs):
    for j in range(n):
      X=X_train[:,(j*batch_size):((j+1)*batch_size)]
      Y=Y_train[:,(j*batch_size):((j+1)*batch_size)]
      a, h, y_hat = feedforward(X, W, b, N, num_layers, act)
      grad_W, grad_b = backprop(X, Y, y_hat, W, a, h, act, alpha)
      # The weight-decay term must shrink the weights, as in the other
      # optimizers. The previous `W - (lr*g - lr*alpha*W)` grew them instead.
      W = [0] + [W[k] - learn_rate*grad_W[k] - learn_rate*alpha*W[k] for k in layers]
      b = [0] + [b[k] - learn_rate*grad_b[k] for k in layers]
    loss, acc = loss_and_accuracy(W, b, X_train, Y_train, act,alpha)
    val_loss, val_acc = loss_and_accuracy(W, b, X_validation, Y_validation, act,alpha)
    wandb.log({"loss": loss, "val_loss": val_loss, "accuracy": acc, "val_accuracy": val_acc, "epoch": i})
  return W, b


# Momentum based gradient descent
def mom_grad_descent(epochs, learn_rate, gamma, batch_size, act, init, N, alpha):
  """Train with momentum-based gradient descent and log per-epoch metrics to wandb.

  Reads the notebook globals X_train, Y_train, X_validation and Y_validation.

  Args:
    epochs: number of passes over the training data.
    learn_rate: step size.
    gamma: momentum coefficient.
    batch_size: number of samples per mini-batch.
    act: hidden activation name.
    init: weight initialisation scheme.
    N: list of layer sizes.
    alpha: weight-decay / L2 strength.

  Returns:
    (W, b): the trained weights and biases.
  """
  W, b = initialize_weights(N,init, act)
  num_layers = len(N) - 1                 # derived from N rather than the global `l`
  layers = range(1, num_layers+1)
  velocity_w = [0] + [np.zeros(W[k].shape) for k in layers]
  velocity_b = [0] + [np.zeros(b[k].shape) for k in layers]
  n = int(np.ceil(X_train.shape[1]/batch_size))
  for i in range(epochs):
    for j in range(n):
      X=X_train[:,(j*batch_size):((j+1)*batch_size)]
      Y=Y_train[:,(j*batch_size):((j+1)*batch_size)]
      a, h, y_hat = feedforward(X, W, b, N, num_layers, act)
      grad_W, grad_b = backprop(X, Y, y_hat, W, a, h, act, alpha)
      for k in layers:
        velocity_w[k] = gamma*velocity_w[k] + learn_rate*grad_W[k] + learn_rate*alpha*W[k]
        velocity_b[k] = gamma*velocity_b[k] + learn_rate*grad_b[k]
        # NOTE(review): backprop already averages over the batch, so this extra
        # /batch_size shrinks the effective learning rate. It is kept so
        # existing sweep results stay comparable; confirm whether it is intended.
        W[k] = W[k] - velocity_w[k]/batch_size
        b[k] = b[k] - velocity_b[k]/batch_size
    loss, acc = loss_and_accuracy(W, b, X_train, Y_train, act,alpha)
    val_loss, val_acc = loss_and_accuracy(W, b, X_validation, Y_validation, act,alpha)
    wandb.log({"loss": loss, "val_loss": val_loss, "accuracy": acc, "val_accuracy": val_acc, "epoch": i})
  return W, b


# Nesterov accelerated gradient descent
def nesterov_grad_descent(epochs, learn_rate, gamma, batch_size, act, init, N, alpha):
  """Train with Nesterov accelerated gradient descent and log per-epoch metrics to wandb.

  For every mini-batch the gradient is evaluated at the look-ahead point
  ``W - gamma*velocity`` (forward AND backward pass), and the resulting
  velocity is then applied once to the current weights.

  Reads the notebook globals X_train, Y_train, X_validation and Y_validation.

  Args:
    epochs: number of passes over the training data.
    learn_rate: step size.
    gamma: momentum coefficient.
    batch_size: number of samples per mini-batch.
    act: hidden activation name.
    init: weight initialisation scheme.
    N: list of layer sizes.
    alpha: weight-decay / L2 strength.

  Returns:
    (W, b): the trained weights and biases.
  """
  W, b = initialize_weights(N,init, act)
  num_layers = len(N) - 1                 # derived from N rather than the global `l`
  layers = range(1, num_layers+1)
  velocity_w = [0] + [np.zeros(W[k].shape) for k in layers]
  velocity_b = [0] + [np.zeros(b[k].shape) for k in layers]
  n = int(np.ceil(X_train.shape[1]/batch_size))
  for i in range(epochs):
    for j in range(n):
      X=X_train[:,(j*batch_size):((j+1)*batch_size)]
      Y=Y_train[:,(j*batch_size):((j+1)*batch_size)]

      # Look-ahead parameters. The forward pass must use them too; the previous
      # version back-propagated activations computed at the old weights.
      # NOTE(review): the /batch_size scaling mirrors mom_grad_descent and comes
      # on top of the batch averaging done in backprop; confirm it is intended.
      W_look = [0] + [W[k] - gamma*velocity_w[k]/batch_size for k in layers]
      b_look = [0] + [b[k] - gamma*velocity_b[k]/batch_size for k in layers]
      a, h, y_hat = feedforward(X, W_look, b_look, N, num_layers, act)
      grad_W, grad_b = backprop(X, Y, y_hat, W_look, a, h, act, alpha)

      for k in layers:
        velocity_w[k] = gamma*velocity_w[k] + learn_rate*grad_W[k] + learn_rate*alpha*W_look[k]
        velocity_b[k] = gamma*velocity_b[k] + learn_rate*grad_b[k]
        # Step from the current weights, not from the look-ahead point, so the
        # momentum term is applied only once.
        W[k] = W[k] - velocity_w[k]/batch_size
        b[k] = b[k] - velocity_b[k]/batch_size
    loss, acc = loss_and_accuracy(W, b, X_train, Y_train, act,alpha)
    val_loss, val_acc = loss_and_accuracy(W, b, X_validation, Y_validation, act,alpha)
    wandb.log({"loss": loss, "val_loss": val_loss, "accuracy": acc, "val_accuracy": val_acc, "epoch": i})
  return W, b


# RMS prop
def rms_prop(epochs, learn_rate, beta, epsilon, batch_size, act, init, N, alpha):
  """Train with RMSprop and log per-epoch metrics to wandb.

  Reads the notebook globals X_train, Y_train, X_validation and Y_validation.

  Args:
    epochs: number of passes over the training data.
    learn_rate: step size.
    beta: decay rate of the running average of squared gradients.
    epsilon: small constant added under the square root for stability.
    batch_size: number of samples per mini-batch.
    act: hidden activation name.
    init: weight initialisation scheme.
    N: list of layer sizes.
    alpha: weight-decay / L2 strength.

  Returns:
    (W, b): the trained weights and biases.
  """
  W, b = initialize_weights(N,init, act)
  num_layers = len(N) - 1                 # derived from N rather than the global `l`
  layers = range(1, num_layers+1)
  sq_avg_w = [0] + [np.zeros(W[k].shape) for k in layers]
  sq_avg_b = [0] + [np.zeros(b[k].shape) for k in layers]
  n = int(np.ceil(X_train.shape[1]/batch_size))
  for i in range(epochs):
    for j in range(n):
      X=X_train[:,(j*batch_size):((j+1)*batch_size)]
      Y=Y_train[:,(j*batch_size):((j+1)*batch_size)]
      a, h, y_hat = feedforward(X, W, b, N, num_layers, act)
      grad_W, grad_b = backprop(X, Y, y_hat, W, a, h, act, alpha)
      for k in layers:
        sq_avg_w[k] = beta*sq_avg_w[k] + (1-beta)*np.square(grad_W[k])
        sq_avg_b[k] = beta*sq_avg_b[k] + (1-beta)*np.square(grad_b[k])
        W[k] = W[k] - learn_rate*np.divide(grad_W[k], np.sqrt(sq_avg_w[k]+epsilon)) - learn_rate*alpha*W[k]
        b[k] = b[k] - learn_rate*np.divide(grad_b[k], np.sqrt(sq_avg_b[k]+epsilon))
    loss, acc = loss_and_accuracy(W, b, X_train, Y_train, act,alpha)
    val_loss, val_acc = loss_and_accuracy(W, b, X_validation, Y_validation, act,alpha)
    wandb.log({"loss": loss, "val_loss": val_loss, "accuracy": acc, "val_accuracy": val_acc, "epoch": i})
  return W, b


# Adam
def adam(epochs, learn_rate, beta1, beta2, epsilon, batch_size, act, init, N, alpha):
  """Train with Adam and log per-epoch metrics to wandb.

  Reads the notebook globals X_train, Y_train, X_validation and Y_validation.

  Args:
    epochs: number of passes over the training data.
    learn_rate: step size.
    beta1: decay rate of the first-moment (mean) estimate.
    beta2: decay rate of the second-moment (squared gradient) estimate.
    epsilon: small constant added under the square root for stability.
    batch_size: number of samples per mini-batch.
    act: hidden activation name.
    init: weight initialisation scheme.
    N: list of layer sizes.
    alpha: weight-decay / L2 strength.

  Returns:
    (W, b): the trained weights and biases.
  """
  W, b = initialize_weights(N,init, act)
  num_layers = len(N) - 1                 # derived from N rather than the global `l`
  layers = range(1, num_layers+1)
  m_w = [0] + [np.zeros(W[k].shape) for k in layers]
  m_b = [0] + [np.zeros(b[k].shape) for k in layers]
  v_w = [0] + [np.zeros(W[k].shape) for k in layers]
  v_b = [0] + [np.zeros(b[k].shape) for k in layers]
  n = int(np.ceil(X_train.shape[1]/batch_size))
  for i in range(epochs):
    for j in range(n):
      X=X_train[:,(j*batch_size):((j+1)*batch_size)]
      Y=Y_train[:,(j*batch_size):((j+1)*batch_size)]
      a, h, y_hat = feedforward(X, W, b, N, num_layers, act)
      grad_W, grad_b = backprop(X, Y, y_hat, W, a, h, act, alpha)
      t = i*n + j+1                        # 1-based global step for bias correction
      correction1 = 1-np.power(beta1,t)
      correction2 = 1-np.power(beta2,t)
      for k in layers:
        # Keep the RAW moving averages in m_*/v_*. The previous version stored
        # the bias-corrected values and fed them back into the next average.
        m_w[k] = beta1*m_w[k] + (1-beta1)*grad_W[k]
        v_w[k] = beta2*v_w[k] + (1-beta2)*np.square(grad_W[k])
        m_b[k] = beta1*m_b[k] + (1-beta1)*grad_b[k]
        v_b[k] = beta2*v_b[k] + (1-beta2)*np.square(grad_b[k])
        m_w_hat, v_w_hat = m_w[k]/correction1, v_w[k]/correction2
        m_b_hat, v_b_hat = m_b[k]/correction1, v_b[k]/correction2
        W[k] = W[k] - learn_rate*np.divide(m_w_hat, np.sqrt(v_w_hat+epsilon)) - learn_rate*alpha*W[k]
        b[k] = b[k] - learn_rate*np.divide(m_b_hat, np.sqrt(v_b_hat+epsilon))
    loss, acc = loss_and_accuracy(W, b, X_train, Y_train, act,alpha)
    val_loss, val_acc = loss_and_accuracy(W, b, X_validation, Y_validation, act,alpha)
    wandb.log({"loss": loss, "val_loss": val_loss, "accuracy": acc, "val_accuracy": val_acc, "epoch": i})
  return W, b


# Nadam
def nadam(epochs, learn_rate, beta1, beta2, epsilon, batch_size, act, init, N, alpha):
  """Train with Nadam (Adam with Nesterov momentum) and log per-epoch metrics to wandb.

  Uses the look-ahead-free form of the update:
  ``step = beta1*m_hat + (1-beta1)*g/(1-beta1^t)`` divided by
  ``sqrt(v_hat + epsilon)``. Only one forward/backward pass is needed per
  batch. The previous version took a provisional step, back-propagated with
  activations computed at the old weights, and then stepped again from the
  provisional point while feeding bias-corrected moments back into the
  running averages.

  Reads the notebook globals X_train, Y_train, X_validation and Y_validation.

  Args:
    epochs: number of passes over the training data.
    learn_rate: step size.
    beta1: decay rate of the first-moment (mean) estimate.
    beta2: decay rate of the second-moment (squared gradient) estimate.
    epsilon: small constant added under the square root for stability.
    batch_size: number of samples per mini-batch.
    act: hidden activation name.
    init: weight initialisation scheme.
    N: list of layer sizes.
    alpha: weight-decay / L2 strength.

  Returns:
    (W, b): the trained weights and biases.
  """
  W, b = initialize_weights(N,init, act)
  num_layers = len(N) - 1                 # derived from N rather than the global `l`
  layers = range(1, num_layers+1)
  m_w = [0] + [np.zeros(W[k].shape) for k in layers]
  m_b = [0] + [np.zeros(b[k].shape) for k in layers]
  v_w = [0] + [np.zeros(W[k].shape) for k in layers]
  v_b = [0] + [np.zeros(b[k].shape) for k in layers]
  n = int(np.ceil(X_train.shape[1]/batch_size))
  for i in range(epochs):
    for j in range(n):
      X=X_train[:,(j*batch_size):((j+1)*batch_size)]
      Y=Y_train[:,(j*batch_size):((j+1)*batch_size)]
      a, h, y_hat = feedforward(X, W, b, N, num_layers, act)
      grad_W, grad_b = backprop(X, Y, y_hat, W, a, h, act, alpha)
      t = i*n + j+1                        # 1-based global step for bias correction
      correction1 = 1-np.power(beta1,t)
      correction2 = 1-np.power(beta2,t)
      for k in layers:
        # Raw moving averages; bias correction is applied only when stepping.
        m_w[k] = beta1*m_w[k] + (1-beta1)*grad_W[k]
        v_w[k] = beta2*v_w[k] + (1-beta2)*np.square(grad_W[k])
        m_b[k] = beta1*m_b[k] + (1-beta1)*grad_b[k]
        v_b[k] = beta2*v_b[k] + (1-beta2)*np.square(grad_b[k])
        # Nesterov step: corrected momentum plus the corrected current gradient.
        step_w = beta1*m_w[k]/correction1 + (1-beta1)*grad_W[k]/correction1
        step_b = beta1*m_b[k]/correction1 + (1-beta1)*grad_b[k]/correction1
        W[k] = W[k] - learn_rate*np.divide(step_w, np.sqrt(v_w[k]/correction2+epsilon)) - learn_rate*alpha*W[k]
        b[k] = b[k] - learn_rate*np.divide(step_b, np.sqrt(v_b[k]/correction2+epsilon))
    loss, acc = loss_and_accuracy(W, b, X_train, Y_train, act,alpha)
    val_loss, val_acc = loss_and_accuracy(W, b, X_validation, Y_validation, act,alpha)
    wandb.log({"loss": loss, "val_loss": val_loss, "accuracy": acc, "val_accuracy": val_acc, "epoch": i})
  return W, b

# Commands for execution

Loading dataset

In [70]:
# Load the default dataset (fashion_mnist) into the notebook-global arrays
# that the optimizers and log_images read.
X_train, Y_train, X_validation, Y_validation, X_test, Y_test = load_data()

**Question 1**

Logging random images from each class to wandb

In [None]:
# Uses the globals X_train, Y_train and classes and logs the result through wandb.log.
log_images()

**Question 2**

Run a random image from the train dataset through the feedforward neural network

In [None]:
N_example = [X_train.shape[0],64,32,16,len(classes)]                            # Structure of the neural network
l_example = len(N_example) - 1                                                  # Number of layers
l = l_example                                                                   # Helpers in this notebook may read the layer count from the global `l`
# Pick the image before initialising: the random initialiser reseeds numpy's
# RNG, which would otherwise make this "random" index the same on every run.
num = np.random.randint(0, X_train.shape[1])                                    # Random image number (upper bound is exclusive)
W, b = initialize_weights (N_example,'random','sigmoid')
# Slice with a list so the sample stays a (features, 1) column. A flat vector
# would broadcast against the (n, 1) biases and give an (n, n) result.
_,_,y_hat = feedforward(X_train[:, [num]], W, b, N_example, l_example, 'sigmoid') # Probability of classes as output
print (y_hat)

**Question 4**

Setting up hyperparameter sweeps using wandb

In [None]:
# Sweep search strategy; the parameter grid is attached further down in this cell.
sweep_config = {
    'method': 'grid'
    }


### To run each sweep mentioned in the report, uncomment only the corresponding dictionary

# Parameters for first sweep
# 'networks' values are indices into the `network` list defined in training_sweep;
# the other keys are read from wandb.config by training_sweep and passed to the optimizers.
parameters_dict = {
    'networks':{
        'values': [0]
      },
    'alpha': {
        'values': [0.0005]
      },
    'learn_rate': {
        'values': [1e-1,1e-2]
      },
    'epochs': {
        'values': [10]
      },
    'batch_size': {
        'values': [32]
      },
    'init': {
        'values': ['random', 'xavier']
      },
    'optimizer': {
        'values': ['sgd', 'mom', 'nest', 'rms', 'adam', 'nadam']
      },
    'act': {
          'values': ['sigmoid','tanh','relu']
      }
}

# Parameters for second sweep
# parameters_dict = {
#     'networks':{
#         'values': [1,2]
#       },
#     'alpha': {
#         'values': [0.0005]
#       },
#     'learn_rate': {
#         'values': [1e-1]
#       },
#     'epochs': {
#         'values': [10]
#       },
#     'batch_size': {
#         'values': [32]
#       },
#     'init': {
#         'values': ['xavier']
#       },
#     'optimizer': {
#         'values': ['mom', 'nest']
#       },
#     'act': {
#           'values': ['tanh']
#       }
# }

# Parameters for third sweep
# parameters_dict = {
#     'networks':{
#         'values': [0,1,2,3,4,5,6,7,8]
#       },
#     'alpha': {
#         'values': [0.0005]
#       },
#     'learn_rate': {
#         'values': [1e-1]
#       },
#     'epochs': {
#         'values': [10]
#       },
#     'batch_size': {
#         'values': [32]
#       },
#     'init': {
#         'values': ['xavier']
#       },
#     'optimizer': {
#         'values': ['mom', 'nest']
#       },
#     'act': {
#           'values': ['tanh']
#       }
# }

# Parameters for fourth sweep
# parameters_dict = {
#     'networks':{
#         'values': [1,2,3,6]
#       },
#     'alpha': {
#         'values': [0.0005, 0.05]
#       },
#     'learn_rate': {
#         'values': [1e-1]
#       },
#     'epochs': {
#         'values': [7,10]
#       },
#     'batch_size': {
#         'values': [16,32]
#       },
#     'init': {
#         'values': ['xavier']
#       },
#     'optimizer': {
#         'values': ['mom', 'nest']
#       },
#     'act': {
#           'values': ['tanh']
#       }
# }

# Parameters for fifth sweep
# parameters_dict = {
#     'networks':{
#         'values': [3,6]
#       },
#     'alpha': {
#         'values': [0.0005]
#       },
#     'learn_rate': {
#         'values': [1e-1,1e-2]
#       },
#     'epochs': {
#         'values': [15]
#       },
#     'batch_size': {
#         'values': [64,32]
#       },
#     'init': {
#         'values': ['xavier']
#       },
#     'optimizer': {
#         'values': ['mom', 'nest','adam']
#       },
#     'act': {
#           'values': ['tanh']
#       }
# }

# Attach the selected grid to the sweep configuration and print it for inspection.
sweep_config['parameters'] = parameters_dict
pprint.pprint(sweep_config)

def training_sweep(config=None):
    """Run one sweep trial: pick the network from the wandb config and train it.

    Sets the notebook globals ``N`` (layer sizes) and ``l`` (layer count),
    logs the structure, then calls the optimizer named by ``config.optimizer``.
    An unrecognised optimizer name trains nothing.
    """
    global N, l
    with wandb.init(config=config):
        config = wandb.config
        n_inputs, n_outputs = X_train.shape[0], len(classes)
        hidden_layouts = [[64,32,16],
                          [32,32,32],
                          [64,64,64],
                          [128,64,32,16],
                          [32,32,32,32],
                          [64,64,64,64],
                          [256,128,64,32,16],
                          [32,32,32,32,32],
                          [64,64,64,64,64]]
        network = [[n_inputs] + hidden + [n_outputs] for hidden in hidden_layouts]
        N = network[config.networks]
        l = len(N)-1
        wandb.log({"Structure": N})

        # Arguments shared by every optimizer, in call order.
        common = (config.batch_size, config.act, config.init, N, config.alpha)
        runners = {
            'sgd':   lambda: gradient_descent(config.epochs, config.learn_rate, *common),
            'mom':   lambda: mom_grad_descent(config.epochs, config.learn_rate, 0.9, *common),
            'nest':  lambda: nesterov_grad_descent(config.epochs, config.learn_rate, 0.9, *common),
            'rms':   lambda: rms_prop(config.epochs, config.learn_rate, 0.9, 10**-8, *common),
            'adam':  lambda: adam(config.epochs, config.learn_rate, 0.9, 0.99, 10**-8, *common),
            'nadam': lambda: nadam(config.epochs, config.learn_rate, 0.9, 0.99, 10**-8, *common),
        }
        run = runners.get(config.optimizer)
        if run is not None:
            run()

Running the hyperparameter sweep

In [None]:
# Register the sweep in the wandb project, then let an agent call training_sweep for its trials.
sweep_id = wandb.sweep(sweep_config, project="CS6910-assg1")
wandb.agent(sweep_id, training_sweep)

**Question 7**

Plotting the confusion matrix on wandb

In [None]:
# The sweep trials do not keep their trained weights, so train the chosen
# configuration here before evaluating it on the test set.
# NOTE(review): these values are taken from the fifth sweep's grid (network
# index 3, tanh, xavier, momentum); replace them with the best run's configuration.
best_N = [X_train.shape[0],128,64,32,16,len(classes)]
best_act = 'tanh'
N, l = best_N, len(best_N) - 1            # helpers in this notebook may read the globals N and l

with wandb.init(project="CS6910-assg1"):
    W_best, b_best = mom_grad_descent(15, 1e-1, 0.9, 32, best_act, 'xavier', best_N, 0.0005)
    y_hat = feedforward(X_test, W_best, b_best, best_N, l, best_act)[2]
    y_predict = np.argmax(y_hat,axis=0)
    # confusion_matrix takes flat sequences of true and predicted class indices.
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None, y_true=Y_test[0].tolist(), preds=y_predict.tolist(), class_names=classes)})

**Question 8**

Squared error loss for the best model

(Modify arguments passed to the *Loss* and *dLoss* functions in the *backprop* and *loss_and_accuracy* functions)

In [None]:
# Parameters for the best model
# Parameter grid for the squared-error runs; identical to the fifth sweep's grid.
# sweep_config is reused from the sweep-setup cell and only its 'parameters' entry is replaced.
# NOTE(review): nothing in this cell selects the loss; the manual edits described
# in the text above this cell are still needed to train with squared error.
parameters_dict = {
    'networks':{
        'values': [3,6]
      },
    'alpha': {
        'values': [0.0005]
      },
    'learn_rate': {
        'values': [1e-1,1e-2]
      },
    'epochs': {
        'values': [15]
      },
    'batch_size': {
        'values': [64,32]
      },
    'init': {
        'values': ['xavier']
      },
    'optimizer': {
        'values': ['mom', 'nest','adam']
      },
    'act': {
          'values': ['tanh']
      }
}
sweep_config['parameters'] = parameters_dict
sweep_id = wandb.sweep(sweep_config, project="CS6910-assg1")
wandb.agent(sweep_id, training_sweep)

**Question 10**

Loading and running three hyperparameter models for the MNIST dataset

In [None]:
X_train_mnist, Y_train_mnist, X_validation_mnist, Y_validation_mnist, X_test_mnist, Y_test_mnist = load_data('mnist')

# training_sweep and the optimizers read the notebook-global X_train / Y_train /
# X_validation / Y_validation arrays. Rebind them to MNIST, otherwise this
# sweep silently trains on the previously loaded Fashion-MNIST data.
X_train, Y_train = X_train_mnist, Y_train_mnist
X_validation, Y_validation = X_validation_mnist, Y_validation_mnist
X_test, Y_test = X_test_mnist, Y_test_mnist

# Parameters for the mnist dataset
parameters_dict = {
    'networks':{
        'values': [3,6]
      },
    'alpha': {
        'values': [0.0005]
      },
    'learn_rate': {
        'values': [1e-1,1e-2]
      },
    'epochs': {
        'values': [15]
      },
    'batch_size': {
        'values': [64,32]
      },
    'init': {
        'values': ['xavier']
      },
    'optimizer': {
        'values': ['mom', 'nest','adam']
      },
    'act': {
          'values': ['tanh']
      }
}
sweep_config['parameters'] = parameters_dict
sweep_id = wandb.sweep(sweep_config, project="CS6910-assg1")
wandb.agent(sweep_id, training_sweep)