<a href="https://colab.research.google.com/github/Swathi1309/ED18B034_ME18B133_Assignment1/blob/main/Assignment1_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wandb
!wandb login

In [None]:
import keras
from keras.datasets import fashion_mnist
from PIL import Image
import wandb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

wandb.init(project="assignment-1", entity="swathi")
classes = ['T-shirt/top','Trouser','Pullover','Dress','Coat','Sandal','Shirt','Sneaker','Bag','Ankle boot']

In [6]:
def load_data():
  (x_train,y_train),(x_test,y_test) = fashion_mnist.load_data()
  X_train = np.transpose((np.array(x_train[:54000],dtype=int).reshape([54000,784]))/255)
  Y_train = np.transpose((np.array(y_train[:54000],dtype=int).reshape([54000,1])))
  X_validation = np.transpose((np.array(x_train[54000:],dtype=int).reshape([6000,784]))/255)
  Y_validation = np.transpose(np.array(y_train[54000:],dtype=int).reshape([6000,1]))
  X_test = np.transpose((np.array(x_test,dtype=int).reshape([10000,784]))/255)
  Y_test = np.transpose(np.array(y_test,dtype=int).reshape([10000,1]))
  return X_train, Y_train, X_validation, Y_validation, X_test, Y_test

In [7]:
def log_images():
  images = []
  labels = []
  for i in range(len(X_train)):
    if len(labels)==len(classes):
      break
    j = random.randint(0,53999)
    if classes[Y_train[0,j]] not in labels:
      images.append(X_train[:,j].reshape([28,28]))
      labels.append(classes[Y_train[0,j]])
  wandb.log({"Examples": [wandb.Image(img, caption=caption) for img,caption in zip(images,labels)]})

Loading data

In [None]:
X_train, Y_train, X_validation, Y_validation, X_test, Y_test = load_data()

In [9]:
log_images()

Defining basic parameters

In [10]:
N= [0,64,32,16,10]  #Insert the number of neurons in each layer
N[0]=X_train.shape[0]
l = len(N) - 1

Weights initialising

In [64]:
def initialize_weights(N):
  np.random.seed(1)
  W = []
  b = []
  W.append(0)
  b.append(0)
  for i in range(1,l+1):
    W.append((np.random.randn(N[i],N[i-1])))
    b.append(np.zeros((N[i],1)))

  return W,b 

In [12]:
def one_hot(y):
  one_hot_y=np.zeros((10,X_train.shape[1]))
  for i in range(X_train.shape[1]):
    one_hot_y[y[0,i],i] = 1
  
  return one_hot_y

Softmax

In [13]:
def softmax(a):
  return np.exp(a)/sum(np.exp(a))

Sigmoid

In [14]:
def sigmoid(a):

  return 1/(1+np.exp(-a))


Sigmoid derivative

In [15]:
def dSigmoid(a):

  return sigmoid(a)*(1-sigmoid(a))

Cross Entropy

In [16]:
def calculate_loss(y_hat,Y):
  loss = 0
  for i in range(Y.shape[1]):
    loss -= np.log(y_hat[Y[0,i],i])
  print (loss)

Feed forward network

In [17]:
def feedforward(X_input, W, b, N, l):
  a=[]
  a.append(0)
  h=[]
  h.append(X_input)
  for i in range(1,l):
    a_new=b[i]+np.matmul(W[i],h[i-1])
    h_new=sigmoid(a_new)
    a.append(a_new)
    h.append(h_new)

  a_new=b[l]+np.matmul(W[l],h[l-1])
  y_hat=softmax(a_new)
  a.append(a_new)

  return a,h,y_hat

Backpropogation

In [18]:
def backrop(y,y_hat,W,a,h):
  grad_W=[]
  grad_b=[]
  grad_a=[]
  grad_h=[]
  
  da = y_hat - one_hot(y)
  grad_a.append(da)

  for i in reversed(range(1,l+1)):
    dw = np.matmul(da,np.transpose(h[i-1]))
    db = np.sum(da, axis=1, keepdims=True)
    dh_prev = np.matmul(np.transpose(W[i]),da)
    da_prev = dh_prev*dSigmoid(a[i-1])
    
    grad_W.append(dw)
    grad_b.append(db)
    grad_a.append(da_prev)
    grad_h.append(dh_prev)
    da=da_prev

  grad_W.append(0)
  grad_b.append(0)
  grad_W.reverse()
  grad_b.reverse()
  grad_a.reverse()
  grad_h.reverse()
  

  return grad_W,grad_b,grad_a,grad_h


Gradient Descent

In [20]:
def gradient_descent(epochs, learn_rate):
  W, b = initialize_weights(N)
  for i in range(epochs):
    a, h, y_hat = feedforward(X_train, W, b, N, l)
    calculate_loss(y_hat, Y_train)
    grad_W, grad_b = backrop(Y_train, y_hat, W, a, h)[0:2]
    W_new =[]
    W_new.append(0)
    b_new =[]
    b_new.append(0)
    m=X_train.shape[1]
    for j in range(1,l+1):
      W_new.append(W[j] - (learn_rate*grad_W[j])/m)
      b_new.append(b[j] - (learn_rate*grad_b[j])/m)
    W = W_new
    b = b_new
  return W, b

Momentum based gradient descent

In [22]:
def mom_grad_descent(epochs, learn_rate, gamma):
  W, b = initialize_weights(N)
  update_prev_w = []
  update_prev_w.append(0)
  update_prev_b = []
  update_prev_b.append(0)
  for i in range(1,l+1):
    update_prev_w.append(np.zeros(W[i].shape))
    update_prev_b.append(np.zeros(b[i].shape))
  for i in range(epochs):
    update_w = []
    update_w.append(0)
    update_b = []
    update_b.append(0)
    a, h, y_hat = feedforward(X_train, W, b, N, l)
    calculate_loss(y_hat, Y_train)
    grad_W, grad_b = backrop(Y_train, y_hat, W, a, h)[0:2]
    W_new =[]
    W_new.append(0)
    b_new =[]
    b_new.append(0)
    m=X_train.shape[1]
    for j in range(1,l+1):
      update_w.append(np.zeros(W[j].shape))
      update_b.append(np.zeros(b[j].shape))
      update_w[j] = gamma*update_prev_w[j] + learn_rate*grad_W[j]
      update_b[j] = gamma*update_prev_b[j] + learn_rate*grad_b[j]
      W_new.append((W[j] - update_w[j])/m)
      b_new.append((b[j] - update_b[j])/m)
      update_prev_w[j]=update_w[j]
      update_prev_b[j]=update_b[j]
    del update_w
    del update_b
    W = W_new
    b = b_new
  return W, b

Nesterov accelerated gradient descent

In [39]:
def nesterov_grad_descent(epochs, learn_rate, gamma):
  W, b = initialize_weights(N)
  update_prev_w = []
  update_prev_w.append(0)
  update_prev_b = []
  update_prev_b.append(0)
  for i in range(1,l+1):
    update_prev_w.append(np.zeros(W[i].shape))
    update_prev_b.append(np.zeros(b[i].shape))
  for i in range(epochs):
    update_w = []
    update_w.append(0)
    update_b = []
    update_b.append(0)
    a, h, y_hat = feedforward(X_train, W, b, N, l)
    calculate_loss(y_hat, Y_train)
    W_new =[]
    W_new.append(0)
    b_new =[]
    b_new.append(0)
    m=X_train.shape[1]
    for j in range(1,l+1):
      update_w.append(np.zeros(W[j].shape))
      update_b.append(np.zeros(b[j].shape))
      update_w[j] = gamma*update_prev_w[j]
      update_b[j] = gamma*update_prev_b[j]
      W_new.append((W[j] - update_w[j])/m)
      b_new.append((b[j] - update_b[j])/m)
    W = W_new
    b = b_new
    grad_W, grad_b = backrop(Y_train, y_hat, W, a, h)[0:2]
    for j in range(1,l+1):
      update_w[j] = gamma*update_prev_w[j] + learn_rate*grad_W[j]
      update_b[j] = gamma*update_prev_b[j] + learn_rate*grad_b[j]
      W_new.append((W[j] - update_w[j])/m)
      b_new.append((b[j] - update_b[j])/m)
      update_prev_w[j]=update_w[j]
      update_prev_b[j]=update_b[j]
    del update_w
    del update_b
    W = W_new
    b = b_new
  return W, b

RMS Prop

In [43]:
def rms_prop(epochs, learn_rate, beta, epsilon):
  W, b = initialize_weights(N)
  update_prev_w = []
  update_prev_w.append(0)
  update_prev_b = []
  update_prev_b.append(0)
  for i in range(1,l+1):
    update_prev_w.append(np.zeros(W[i].shape))
    update_prev_b.append(np.zeros(b[i].shape))
  for i in range(epochs):
    update_w = []
    update_w.append(0)
    update_b = []
    update_b.append(0)
    a, h, y_hat = feedforward(X_train, W, b, N, l)
    calculate_loss(y_hat, Y_train)
    grad_W, grad_b = backrop(Y_train, y_hat, W, a, h)[0:2]
    W_new =[]
    W_new.append(0)
    b_new =[]
    b_new.append(0)
    m=X_train.shape[1]
    for j in range(1,l+1):
      update_w.append(np.zeros(W[j].shape))
      update_b.append(np.zeros(b[j].shape))
      update_w[j] = beta*update_prev_w[j] + (1-beta)*np.square(grad_W[j])
      update_b[j] = beta*update_prev_b[j] + (1-beta)*np.square(grad_b[j])
      W_new.append((W[j] - (learn_rate*np.divide(grad_W[j],(np.sqrt(update_w[j]+epsilon))))))
      b_new.append((b[j] - (learn_rate*np.divide(grad_b[j],(np.sqrt(update_b[j]+epsilon))))))
      update_prev_w[j]=update_w[j]
      update_prev_b[j]=update_b[j]
    del update_w
    del update_b
    W = W_new
    b = b_new
  return W, b

In [56]:
def adam(epochs, learn_rate, beta1, beta2, epsilon):
  W, b = initialize_weights(N)
  m_prev_w = []
  m_prev_w.append(0)
  m_prev_b = []
  m_prev_b.append(0)
  v_prev_w = []
  v_prev_w.append(0)
  v_prev_b = []
  v_prev_b.append(0)
  for i in range(1,l+1):
    m_prev_w.append(np.zeros(W[i].shape))
    m_prev_b.append(np.zeros(b[i].shape))
    v_prev_w.append(np.zeros(W[i].shape))
    v_prev_b.append(np.zeros(b[i].shape))
  for i in range(epochs):
    m_w = []
    m_w.append(0)
    v_w = []
    v_w.append(0)
    m_b = []
    m_b.append(0)
    v_b = []
    v_b.append(0)
    a, h, y_hat = feedforward(X_train, W, b, N, l)
    calculate_loss(y_hat, Y_train)
    grad_W, grad_b = backrop(Y_train, y_hat, W, a, h)[0:2]
    W_new =[]
    W_new.append(0)
    b_new =[]
    b_new.append(0)
    m=X_train.shape[1]
    for j in range(1,l+1):
      m_w.append(np.zeros(W[j].shape))
      v_w.append(np.zeros(W[j].shape))
      m_b.append(np.zeros(b[j].shape))
      v_b.append(np.zeros(b[j].shape))
      m_w[j] = (beta1*m_prev_w[j] + (1-beta1)*grad_W[j])/(1-(beta1**(i+1)))
      v_w[j] = (beta2*v_prev_w[j] + (1-beta2)*np.square(grad_W[j]))/(1-(beta2**(i+1)))
      m_b[j] = (beta1*m_prev_b[j] + (1-beta1)*grad_b[j])/(1-beta1**(i+1))
      v_b[j] = (beta2*v_prev_b[j] + (1-beta2)*np.square(grad_b[j]))/(1-beta2**(i+1))
      W_new.append((W[j] - (learn_rate*np.divide(m_w[j],(np.sqrt(v_w[j]+epsilon)))))/m)
      b_new.append((b[j] - (learn_rate*np.divide(m_b[j],(np.sqrt(v_b[j]+epsilon)))))/m)
      m_prev_w[j] = m_w[j]
      v_prev_w[j] = v_w[j]
      m_prev_b[j] = m_b[j]
      v_prev_b[j] = v_b[j]
    del m_w
    del v_w
    del m_b
    del v_b
    W = W_new
    b = b_new
  return W, b

In [55]:
def nadam(epochs, learn_rate, beta1, beta2, epsilon):
  W, b = initialize_weights(N)
  m_prev_w = []
  m_prev_w.append(0)
  m_prev_b = []
  m_prev_b.append(0)
  v_prev_w = []
  v_prev_w.append(0)
  v_prev_b = []
  v_prev_b.append(0)
  for i in range(1,l+1):
    m_prev_w.append(np.zeros(W[i].shape))
    m_prev_b.append(np.zeros(b[i].shape))
    v_prev_w.append(np.zeros(W[i].shape))
    v_prev_b.append(np.zeros(b[i].shape))
  for i in range(epochs):
    m_w = []
    m_w.append(0)
    v_w = []
    v_w.append(0)
    m_b = []
    m_b.append(0)
    v_b = []
    v_b.append(0)
    a, h, y_hat = feedforward(X_train, W, b, N, l)
    calculate_loss(y_hat, Y_train)
    #grad_W, grad_b = backrop(Y_train, y_hat, W, a, h)[0:2]
    W_new =[]
    W_new.append(0)
    b_new =[]
    b_new.append(0)
    m=X_train.shape[1]
    for j in range(1,l+1):
      m_w.append(np.zeros(W[j].shape))
      v_w.append(np.zeros(W[j].shape))
      m_b.append(np.zeros(b[j].shape))
      v_b.append(np.zeros(b[j].shape))
      m_w[j] = (beta1*m_prev_w[j])/(1-(beta1**(i+1))) #+ (1-beta1)*grad_W[j])/(1-(beta1**(i+1)))
      v_w[j] = (beta2*v_prev_w[j])/(1-(beta2**(i+1))) #+ (1-beta2)*np.square(grad_W[j]))/(1-(beta2**(i+1)))
      m_b[j] = (beta1*m_prev_b[j])/(1-beta1**(i+1)) #+ (1-beta1)*grad_b[j])/(1-beta1**(i+1))
      v_b[j] = (beta2*v_prev_b[j])/(1-beta2**(i+1)) #+ (1-beta2)*np.square(grad_b[j]))/(1-beta2**(i+1))
      W_new.append((W[j] - (learn_rate*np.divide(m_w[j],(np.sqrt(v_w[j]+epsilon)))))/m)
      b_new.append((b[j] - (learn_rate*np.divide(m_b[j],(np.sqrt(v_b[j]+epsilon)))))/m)
    W = W_new
    b = b_new
    grad_W, grad_b = backrop(Y_train, y_hat, W, a, h)[0:2]
    for i in range(1,l+1):
      m_w[j] = (beta1*m_prev_w[j] + (1-beta1)*grad_W[j])/(1-(beta1**(i+1)))
      v_w[j] = (beta2*v_prev_w[j] + (1-beta2)*np.square(grad_W[j]))/(1-(beta2**(i+1)))
      m_b[j] = (beta1*m_prev_b[j] + (1-beta1)*grad_b[j])/(1-beta1**(i+1))
      v_b[j] = (beta2*v_prev_b[j] + (1-beta2)*np.square(grad_b[j]))/(1-beta2**(i+1))
      W_new.append((W[j] - (learn_rate*np.divide(m_w[j],(np.sqrt(v_w[j]+epsilon)))))/m)
      b_new.append((b[j] - (learn_rate*np.divide(m_b[j],(np.sqrt(v_b[j]+epsilon)))))/m)
      m_prev_w[j] = m_w[j]
      v_prev_w[j] = v_w[j]
      m_prev_b[j] = m_b[j]
      v_prev_b[j] = v_b[j]
    del m_w
    del v_w
    del m_b
    del v_b
    W = W_new
    b = b_new
  return W, b

In [65]:
W,b = gradient_descent(10,0.001)

237097.25797024355
236504.65984101634
235915.35319173572
235329.34641918368
234746.64739842658
234167.26348595432
233591.20152293865
233018.46783869923
232449.06825427685
231883.00808620665


In [66]:
W, b = mom_grad_descent(10,0.001,0.9)

237097.25797024355
124339.75983499788
124339.74639880835
124339.7280160919
124339.71197558675
124339.69794780001
124339.68565384808
124339.67485794003
124339.66535917204
124339.65698696091


In [67]:
W, b = nesterov_grad_descent(10, 0.001,0.9)

237097.25797024355
124339.58899904974
124339.74714389283
124339.72883288187
124339.71276140434
124339.69870684533
124339.68638983679
124339.67557375261
124339.66605760994
124339.65767029299


In [68]:
W, b = rms_prop(10,0.001,0.9,10**-8)

237097.25797024355
214830.29208012726
201936.1924680934
193211.79765857002
186902.9557671088
182119.50637335723
178339.09890728793
175232.86178824556
172593.21492231012
170288.68858802624


In [69]:
W, b = adam(10,0.001,0.9,0.99, 10**-8)

237097.25797024355
124339.58901029307
124339.59502887553
124339.59502579755
124339.59502381136
124339.5950227172
124339.5950221481
124339.59502194778
124339.59502172102
124339.59502166658


In [70]:
W, b = nadam(10, 0.001,0.9,0.99,10**-8)

237097.25797024355
124339.58899904974
124339.59502261173
124339.5950217716
124339.59502175263
124339.59502154215
124339.5950217865
124339.59502177621
124339.59502177626
124339.59502177626


In [71]:
def grad_descent(W,b,grad_W,grad_b,learn_rate=0.001):
  W_new=[]
  W_new.append(0)
  b_new=[]
  b_new.append(0)
  m=X_train.shape[1]
  for i in range(1,l+1):
    #print(W[i].shape,grad_W[i].shape,b[i].shape,grad_b[i].shape)
    W_new.append(W[i] - (learn_rate*grad_W[i])/m)
    b_new.append(b[i] - (learn_rate*grad_b[i])/m)
  return W_new,b_new


Training

In [64]:
def training(N,l,epochs,batch_size,learn_rate,X_input,y):
  W , b = initialize_weights(N)
  n=(np.floor(X_input.shape[1]/batch_size)+1).astype(int)
  for j in range(epochs):
    for j in range(n):
      X=X_input[:,[j*batch_size,(j+1)*batch_size]]
      a , h , y_hat = feedforward(X_input, W, b, N, l) 
      grad_W , grad_b = backrop(y,y_hat,W,a,h)[0:2]
      W , b = grad_descent(W,b,grad_W,grad_b,learn_rate)
    
    calculate_loss(y_hat,y)
  
  return W,b


In [None]:
def predict(W,b,X_test,N,l):
  y_hat = feedforward(X_test, W, b, N, l)[2]
  y_class = np.argmax(y_hat,axis=0).reshape([1,X_test.shape[1]])

  return y_class

In [None]:
Y_predict = predict(W_new,b_new,X_train,N,l)
print (Y_predict[0,0:4])
print (Y_test[0,0:4])
#sklearn.metrics.accuracy_score(Y_test, Y_predict)

[6 6 7 7]
[9 2 1 1]


In [None]:
sweep_config = {
    'method': 'random'
    }

In [None]:
parameters_dict = {
    'epochs': {
        'values': [5, 10]
      },
    'l': {
        'values': [3, 4, 5]
      },
    'size of each layer': {
        'values': [32, 64, 128]
      },
    'weight decay': {
        'values': [0, 0.0005, 0.5]
      },
    'learn_rate': {
        'values': [1e-3,1e-4]
      },
    'epochs': {
        'values': [5,10]
      },
    'batch_size': {
        'values': [16,32,64]
      },
    'weights_initialization': {
        'values': ['random', 'Xavier']
      },
    'optimizer': {
        'values': ['sgd', 'mom', 'nest', 'rms', 'adam', 'nadam']
      },
    'act_func': {
          'values': ['sigmoid', 'tanh', 'ReLU']
      }
    }

sweep_config['parameters'] = parameters_dict

In [None]:
import pprint

pprint.pprint(sweep_config)

In [None]:
sweep_id = wandb.sweep(sweep_config, project="pytorch-sweeps-demo")

In [None]:
wandb.agent(sweep_id, train, count=5)

In [None]:
def training_sweep(N,l,X_input,y,config=None):
  
  # Initialize a new wandb run
    with wandb.init(config=config):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        training(N,l,config.epochs,config.batch_size,config.learn_rate,X_input,y)
            wandb.log({"loss": avg_loss, "epoch": epoch})