In [1]:
#####Note:run in python2######
# same initialization of weights because same random seed used

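# OBSERVATIONS
# RMSProp modifies AdaGrad to perform better in the non-convex setting by
# changing the gradient accumulation into an exponentially weighted moving average.

# RMSProp uses an exponentially decaying average to discard history from the extreme past so that
# it can converge rapidly after finding a convex bowl, as if it were an instance of the
# AdaGrad algorithm initialised within that bowl.

# Adam, which this notebook implements, keeps that RMSProp-style moving average of the squared
# gradient (second moment), adds a moving average of the gradient itself (first moment), and
# applies a bias correction to both before each update.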

import math
import numpy as np
import sys
from PIL import Image
import matplotlib.pyplot as plt 

#for shuffling data
from sklearn.utils import shuffle
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

np.random.seed(42)

In [2]:
#########
#MLP
  
# Glorot/Xavier-style normal initialisation of the weights and biases
def weights(noHiddenLayers,sizeOfLayers):
    """Draw the initial weight matrices and bias vectors of the MLP.

    Parameters
    ----------
    noHiddenLayers : int
        Number of hidden layers; ``noHiddenLayers + 1`` dense layers are created.
    sizeOfLayers : sequence of int
        Layer widths, including the input dimension (first entry) and the
        output dimension (last entry).

    Returns
    -------
    W, b : numpy.ndarray (dtype=object, one entry per layer)
        ``W[i]`` has shape ``(sizeOfLayers[i+1], sizeOfLayers[i])`` and
        ``b[i]`` has shape ``(sizeOfLayers[i+1], 1)``.

    Notes
    -----
    Entries are drawn from a normal distribution with mean ``1e-4`` and
    standard deviation ``sqrt(2 / (fan_out + fan_in))`` for weights and
    ``sqrt(2 / (fan_out + 1))`` for biases, using the global NumPy RNG.
    """
    num_layers = noHiddenLayers + 1

    # The per-layer matrices usually have different shapes.  Calling
    # np.array() on such a ragged list raises ValueError on recent NumPy, so
    # the object containers are allocated explicitly and filled element-wise.
    W = np.empty(num_layers, dtype=object)
    b = np.empty(num_layers, dtype=object)

    for i in range(num_layers):
        fan_in = sizeOfLayers[i]
        fan_out = sizeOfLayers[i+1]

        # Draw order (W then b, layer by layer) is kept unchanged so that a
        # fixed seed reproduces the same initial values as before.
        W[i] = np.random.normal(1e-4, np.sqrt(2*1.0/(fan_out+fan_in)),
                                (fan_out, fan_in))
        b[i] = np.random.normal(1e-4, np.sqrt(2*1.0/(fan_out+1)),
                                (fan_out, 1))

    return W,b

#mlp forward pass
#layer
def layer(w,x,b):
    """Affine part of a dense layer: matrix product ``w . x`` plus bias ``b``.

    ``b`` is added with NumPy broadcasting, so a column-vector bias is applied
    to every column of ``x``.
    """
    pre_activation = np.matmul(w, x)
    pre_activation = pre_activation + b
    return pre_activation

def apply_activationMLP(Activation_function,inp):
    """Apply the named activation function element-wise / column-wise to ``inp``.

    Parameters
    ----------
    Activation_function : str
        One of ``'relu'``, ``'tanh'``, ``'sigmoid'`` or ``'softmax'``.
    inp : numpy.ndarray
        Pre-activation values.  For ``'softmax'`` the normalisation runs over
        axis 0, i.e. each column is turned into a probability vector.

    Returns
    -------
    numpy.ndarray
        Activated values with the same shape as ``inp``.

    Raises
    ------
    ValueError
        If ``Activation_function`` is not one of the supported names
        (previously ``None`` was returned silently).
    """
    #activation functions
    if Activation_function == 'relu':
        return np.where(inp<0,0,inp)
    elif Activation_function == "tanh":
        return np.tanh(inp)
    elif Activation_function == "sigmoid":
        return 1.0/(1+np.exp(-1.0*inp))
    elif Activation_function == "softmax":
        # Subtracting the per-column maximum leaves the softmax value
        # unchanged mathematically but keeps exp() from overflowing to inf
        # (which would produce inf/inf = nan) for large logits.
        shifted = inp - np.max(inp, axis=0, keepdims=True)
        exps = np.exp(shifted)
        return exps/np.sum(exps, axis=0, keepdims=True)

    raise ValueError("Unknown activation function: %r" % (Activation_function,))

#forward path
def forward_path(noHiddenLayers,X,W,b,Actfnvect):
    """Run the MLP forward and return every layer's activation.

    Parameters
    ----------
    noHiddenLayers : int
        Number of hidden layers.  With 0 only the first layer (``W[0]``,
        ``b[0]``, ``Actfnvect[0]``) is applied.
    X : numpy.ndarray
        Input with one sample per column.
    W, b : indexable collections of numpy.ndarray
        Per-layer weight matrices and bias column vectors.
    Actfnvect : sequence of str
        Activation name per layer; the last entry is used for the output layer.

    Returns
    -------
    out : numpy.ndarray (dtype=object)
        ``out[i]`` is the activation of layer ``i``.
    y_pred : numpy.ndarray
        Activation of the final layer (same values as ``out[-1]``).
    """
    out=[]

    # first layer works on the raw input
    z=apply_activationMLP(Actfnvect[0],np.array(layer(W[0],X,b[0])))
    out.append(np.array(z))

    # remaining hidden layers feed on the previous activation
    for i in range(1,noHiddenLayers):
        z=apply_activationMLP(Actfnvect[i],np.array(layer(W[i],out[i-1],b[i])))
        out.append(np.array(z))

    # output layer (only distinct from the first layer when hidden layers exist)
    if noHiddenLayers > 0:
        z=apply_activationMLP(Actfnvect[-1],np.array(layer(W[-1],out[-1],b[-1])))
        out.append(np.array(z))

    y_pred = out[-1]

    # Layer activations generally have different shapes; np.array() on such a
    # ragged list raises ValueError on recent NumPy, so pack them into an
    # explicitly allocated object array instead.
    activations = np.empty(len(out), dtype=object)
    for idx, activation in enumerate(out):
        activations[idx] = activation

    return activations,np.array(y_pred)


In [3]:
# Load the Iris data, one-hot encode the labels and split into train/test sets.
# NOTE(review): the keras imports below are not referenced in any visible cell
# (the data comes from scikit-learn's load_iris) — confirm before removing.
from keras import backend as K
import keras
from keras.datasets import mnist


# Import the data
iris_data = load_iris() # load the iris dataset

x = iris_data.data
y_ = iris_data.target.reshape(-1, 1) # Convert the class ids to a single column

# One-hot encode the class labels.
# NOTE(review): newer scikit-learn releases appear to rename `sparse` to
# `sparse_output` — confirm against the installed version.
encoder = OneHotEncoder(sparse=False)
y = encoder.fit_transform(y_)
# Split the data for training and testing (20% held out for testing).
# NOTE(review): no random_state is passed, so the split presumably depends on
# the global NumPy seed set in the first cell — confirm.
x_train , x_test, y_train , y_test = train_test_split(x, y, test_size=0.20)

# Samples are still along axis 0 here: (n_samples, n_features) / (n_samples, n_classes)
print(x_train.shape)
print(y_train.shape)

Using TensorFlow backend.


(120, 4)
(120, 3)


In [4]:
# run MLP algorithm

# Move the sample axis to the end so that every column is one sample, which is
# the layout the MLP functions (layer / forward_path / backprop) work with.
# NOTE: this cell swaps the axes each time it runs, so it must be executed
# exactly once after the data-loading cell.
x_train = np.moveaxis(x_train,0,-1)
y_train = np.moveaxis(y_train,0,-1)
x_test = np.moveaxis(x_test,0,-1)
y_test = np.moveaxis(y_test,0,-1)

# Single-argument print(...) behaves identically on Python 2 and Python 3,
# unlike the Python 2-only print statement.
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(4, 120)
(3, 120)
(4, 30)
(3, 30)


In [5]:
#derivative of relu
def der_relu(x):
    """Derivative of ReLU: 1 where ``x`` is strictly positive, 0 elsewhere.

    The result is the same whether ``x`` holds pre-activations or ReLU
    outputs.  The earlier ``x == 0`` test was only correct for non-negative
    ReLU outputs and returned 1 for negative values.
    """
    return np.where(x > 0,1,0)

# backpropation
def backprop(y,y_true,z,T,u,M,x):
    """Gradients of a three-layer ReLU/ReLU/output network.

    Parameters
    ----------
    y, y_true : network output and target, one sample per column.
    z, u      : activations of the second and first hidden layer.
    T, M      : weight matrices of the third and second layer.
    x         : network input, one sample per column.

    Returns
    -------
    dW, dM, dT, db1, db2, db3
        Weight gradients of layers 1, 2, 3 followed by the bias gradients of
        layers 1, 2, 3.  Contributions of all columns are summed.
    """
    # error signals, propagated from the output back to the first layer
    err_out = y - y_true
    err_hidden2 = np.matmul(T.T, err_out) * der_relu(z)
    err_hidden1 = np.matmul(M.T, err_hidden2) * der_relu(u)

    # weight gradients: error signal times the layer's input, transposed
    grad_T = np.matmul(err_out, z.T)
    grad_M = np.matmul(err_hidden2, u.T)
    grad_W = np.matmul(err_hidden1, x.T)

    # bias gradients: error signal summed over the samples, kept as a column
    grad_b3 = np.sum(err_out, axis=1).reshape(-1, 1)
    grad_b2 = np.sum(err_hidden2, axis=1).reshape(-1, 1)
    grad_b1 = np.sum(err_hidden1, axis=1).reshape(-1, 1)

    return grad_W, grad_M, grad_T, grad_b1, grad_b2, grad_b3

In [6]:
###
################################
# Training parameters (hyper-parameters of the Adam training loop)
num_classes = 3          # number of output classes; also the width of the output layer
epochs = 160             # number of full passes over the training set
rho1 = 0.9               # decay rate of the first-moment (gradient) moving average
rho2 = 0.99              # decay rate of the second-moment (squared gradient) moving average
batch_size = 10          # number of samples per mini-batch
learning_rate = 1e-3     # step size applied to the bias-corrected update
delta=1e-8               # small constant added to the square root in the update's denominator
################################

# MLP parameters
noHiddenLayers=2

# Layer widths; also includes the input vector dimension and output vector dimension
sizeOfLayers=[x_train.shape[0],10,10,num_classes]

# NOTE(review): sizeofOutput is not referenced in any visible cell — confirm whether it is needed.
sizeofOutput=num_classes

# Activation per layer: two ReLU hidden layers followed by a softmax output layer
Actfnvect = ['relu','relu','softmax']

In [7]:
# Draw the initial weights and biases for all noHiddenLayers + 1 layers
# (random draws come from the global NumPy RNG seeded in the first cell).
W,b = weights(noHiddenLayers,sizeOfLayers)

In [8]:
#initialisation

# Key and shape of each of the six trainable tensors; every Adam buffer below
# holds one zero array per entry.
_moment_shapes = [("W[0]", W[0].shape), ("W[1]", W[1].shape), ("W[2]", W[2].shape),
                  ("b[0]", b[0].shape), ("b[1]", b[1].shape), ("b[2]", b[2].shape)]

# first moment (moving average of the gradient)
s = dict((key, np.zeros(shape)) for key, shape in _moment_shapes)

# second moment (moving average of the squared gradient)
r = dict((key, np.zeros(shape)) for key, shape in _moment_shapes)

# bias-corrected first moment
sHat = dict((key, np.zeros(shape)) for key, shape in _moment_shapes)

# bias-corrected second moment
rHat = dict((key, np.zeros(shape)) for key, shape in _moment_shapes)

# time step (number of parameter updates performed so far)
t=0


In [9]:
# (a)
# ADAM descent

# updating the weights


# Mini-batch Adam training of the three-layer MLP.
# For every mini-batch the per-sample gradients are summed, averaged, fed into
# the first/second moment estimates, bias-corrected and applied to W and b.
_adam_keys = ("W[0]", "W[1]", "W[2]", "b[0]", "b[1]", "b[2]")
num_samples = x_train.shape[1]

for i in range(epochs):
    loss=0
    for j in np.arange(0,num_samples,batch_size):
        # The last mini-batch is shorter when num_samples is not a multiple of
        # batch_size; clip it instead of indexing past the end of the data.
        current_batch = int(min(batch_size, num_samples - j))

        # gradient accumulators for this mini-batch
        dW = np.zeros(W[0].shape)
        dM = np.zeros(W[1].shape)
        dT = np.zeros(W[2].shape)
        db1 = np.zeros(b[0].shape)
        db2 = np.zeros(b[1].shape)
        db3 = np.zeros(b[2].shape)

        for k in range(0,current_batch):
            # forward pass on a single sample (kept as a column vector)
            x = x_train[:,j+k].reshape(-1,1)
            y = y_train[:,j+k].reshape(-1,1)
            out,y_pred=forward_path(noHiddenLayers,x,W,b,Actfnvect)
            # backpropagation
            dWtemp,dMtemp,dTtemp,db1temp,db2temp,db3temp=backprop(y_pred,y,out[1],W[-1],out[0],W[1],x)

            dW=dW+dWtemp
            db1=db1+db1temp

            dM=dM+dMtemp
            db2=db2+db2temp

            dT=dT+dTtemp
            db3=db3+db3temp

            # cross-entropy loss; the floor only matters when a probability has
            # underflowed to 0, where 0*log(0) would otherwise yield nan
            loss = loss + (-1.0*np.sum(y*np.log(np.maximum(y_pred, np.finfo(float).tiny))))

        # Updating the weights using the Adam approach

        # Average the summed gradients over the samples actually in the batch
        dW=dW*(1.0/current_batch)
        dM=dM*(1.0/current_batch)
        dT=dT*(1.0/current_batch)
        db1=db1*(1.0/current_batch)
        db2=db2*(1.0/current_batch)
        db3=db3*(1.0/current_batch)

        #time update
        t=t+1

        for key, grad in zip(_adam_keys, (dW, dM, dT, db1, db2, db3)):
            # Update biased first moment estimate
            s[key] = rho1*s[key] + ((1.0-rho1)*(grad))
            # Update biased second moment estimate
            r[key] = rho2*r[key] + ((1.0-rho2)*(grad*grad))
            # Correct bias in first moment
            sHat[key] = (1.0/(1-(rho1**t)))*s[key]
            # Correct bias in second moment
            rHat[key] = (1.0/(1-(rho2**t)))*r[key]

        #Apply update
        for layer_idx in range(3):
            w_key = "W[%d]" % layer_idx
            b_key = "b[%d]" % layer_idx
            W[layer_idx] = W[layer_idx] + (((-1.0*learning_rate)*sHat[w_key])/(np.sqrt(rHat[w_key])+delta))
            b[layer_idx] = b[layer_idx] + (((-1.0*learning_rate)*sHat[b_key])/(np.sqrt(rHat[b_key])+delta))

    #print the loss in each epoch
    print('Epoch:'+str(i)+'         Loss:'+str(loss))

Epoch:0         Loss:248.85349351697045
Epoch:1         Loss:224.25902089445546
Epoch:2         Loss:199.49114886564348
Epoch:3         Loss:173.167903702095
Epoch:4         Loss:149.84072004549512
Epoch:5         Loss:128.92960991876973
Epoch:6         Loss:110.58980303429249
Epoch:7         Loss:94.0167578756742
Epoch:8         Loss:79.14080038940143
Epoch:9         Loss:68.36339538689423
Epoch:10         Loss:61.38883039107031
Epoch:11         Loss:56.66372176278357
Epoch:12         Loss:53.25247088148282
Epoch:13         Loss:50.69986310188742
Epoch:14         Loss:48.55196963760715
Epoch:15         Loss:46.728373328566576
Epoch:16         Loss:45.11290011442958
Epoch:17         Loss:43.70916023999247
Epoch:18         Loss:42.412867863824616
Epoch:19         Loss:41.222429173683174
Epoch:20         Loss:40.099026270359296
Epoch:21         Loss:39.04807196927103
Epoch:22         Loss:38.01842704136115
Epoch:23         Loss:37.01177251641953
Epoch:24         Loss:35.935887739852184
E

In [10]:
# Forward pass over the whole test set at once (x_test holds one sample per
# column), using the trained W and b; y_pred holds the output-layer activations.
out,y_pred=forward_path(noHiddenLayers,x_test,W,b,Actfnvect)

In [11]:
def predict(y):
    """Return the index of the largest entry of ``y`` (taken over the flattened input)."""
    best_index = np.argmax(y)
    return best_index

num_test = x_test.shape[1]

# class index chosen by the network for every test column
yvect = [predict(y_pred[:, col]) for col in range(0, num_test)]
# class index stored in the one-hot test labels
y_trurevect = [predict(y_test[:, col]) for col in range(0, num_test)]

# find accuracy
from sklearn.metrics import accuracy_score
# fraction of test samples whose predicted class matches the true class
print(accuracy_score(y_trurevect, yvect))

1.0


In [12]:
# to see the output vs true values

# True class ids first, predicted class ids second.  Single-argument
# print(...) behaves identically on Python 2 and Python 3, unlike the
# Python 2-only print statement.
print(y_trurevect)
print(yvect)

[1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]
[1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]
