In [1]:
#import the libraries 

import pandas as pd
import numpy as np

In [2]:
#load the training dataset

df=pd.read_csv('sample_data/mnist_train_small.csv')

In [3]:
#split df into feature set 'train_X' and label 'train_y'

train_y,train_X=np.hsplit(df,[1])
print(train_y.head())
print(train_X.head())

   6
0  5
1  7
2  9
3  5
4  2
   0  0.1  0.2  0.3  0.4  0.5  ...  0.585  0.586  0.587  0.588  0.589  0.590
0  0    0    0    0    0    0  ...      0      0      0      0      0      0
1  0    0    0    0    0    0  ...      0      0      0      0      0      0
2  0    0    0    0    0    0  ...      0      0      0      0      0      0
3  0    0    0    0    0    0  ...      0      0      0      0      0      0
4  0    0    0    0    0    0  ...      0      0      0      0      0      0

[5 rows x 784 columns]


In [4]:
#normalize the feature set 'train_X' to get 'train_x'

train_x=train_X.copy()

#get the maximum datapoint in the entire feature set 'train_x'
maximum=train_x.max().max()  
print(maximum)

#divide the entire feature set 'train_x' with 'maximum'
train_x=train_x.div(maximum) 
train_x=train_x.astype('float64')
print (train_x.sample(5))

255
         0  0.1  0.2  0.3  0.4  0.5  ...  0.585  0.586  0.587  0.588  0.589  0.590
14886  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0
6879   0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0
3016   0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0
3399   0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0
5934   0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0

[5 rows x 784 columns]


In [5]:
#'train_x_row' = number of rows in the feature set 'train_x'
train_x_row=train_x.shape[0]
print(train_x_row)

#'train_x_col' = number of columns in the feature set 'train_x'
train_x_col=train_x.shape[1]
print(train_x_col)

19999
784


In [6]:
#check out how many unique classes are present in 'train_y'

train_y=train_y.iloc[:,0]
print(train_y.unique())

#'u' is an array containing all the unique class values 
u=train_y.unique()
clas=len(u)
print(clas)

[5 7 9 2 0 6 8 3 4 1]
10


In [7]:
#convert feature and label dataframes to numpy array

train_x=train_x.to_numpy()
train_y=train_y.to_numpy()

In [8]:
#enable one hot classification

one_hot_label=np.empty([train_x_row,clas])
for i in range(clas):
    for j in range(train_x_row):
        if(train_y[j]==u[i]):
            one_hot_label[j,i]=1
        else:
            one_hot_label[j,i]=0
print(one_hot_label[:10]) 

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]


In [16]:
#define the class 'ann'

class ann:

  @staticmethod

  #define the hypothesis that calculates the expression (w0.x0+w1.x1+w2.x2+.....)

  def hypothesis(X,w,b):
    hypothesis_output=np.dot(X,w)+b
    return hypothesis_output

  #define the sigmoid function

  def sigmoid(x):
    sigmoid=(1/(1+np.exp(-x))).astype('float64')
    return sigmoid

  #define the derivative of sigmoid function

  def sigmoid_der(x):
    return (ann.sigmoid(x))*(1-(ann.sigmoid(x)))

  #define the softmax function

  def softmax(A):
    expA=np.exp(A)
    return expA/np.sum(expA)

  #define the training function

  def train(train_x,one_hot_label,wh,bh,wo,bo,alpha,epoch):

    for epoch in range(epoch):
    
      #feedforward phase
    
      #phase 1: feedforward from input layer to hidden layer
      zh=ann.hypothesis(train_x,wh,bh)
      ah=ann.sigmoid(zh)
    
      #phase 2: feedforward from hidden layer to output layer
      zo=ann.hypothesis(ah,wo,bo)
      ao=ann.softmax(zo)
    
      #backpropagation phase
    
      #phase 1: backpropagation from output layer to hidden layer
      #formula for gradient descent for attribute terms: dcost_dwo=(dcost_dzo)*(dzo_dwo)
      dcost_dzo=ao-one_hot_label
      dzo_dwo=ah
      dcost_dwo=np.dot(dzo_dwo.T,dcost_dzo)
      #formula for gradient descent for bias term: dcost_dbo=(dcost_dzo)*(dzo_dwo)  [dzo_dwo=1]
      dcost_dbo=dcost_dzo
    
      #phase 2: backpropagation from hidden layer to input layer
      #formula for gradient descent for attribute terms: dcost_dwh=(dcost_dzo)*(dzo_dah)*(dah_dzh)*(dzh_dwh)
      dzo_dah=wo
      dcost_dah=np.dot(dcost_dzo,dzo_dah.T)   #dcost_dah=(dcost_dzo)*(dzo_dah)
      dah_dzh=ann.sigmoid_der(zh)
      dzh_dwh=train_x
      dcost_dwh=np.dot(dzh_dwh.T,dah_dzh*dcost_dah)   #dcost_dwh=(dcost_dah)*(dah_dzh)*(dzh_dwh)
      #formula for gradient descent for bias term: dcost_dbh=(dcost_dah)*(dah_dzh)*(dzh_dbh)  [dzh_dbh=1]
      dcost_dbh=dcost_dah*dah_dzh
    
      #update weights for hidden layer
      wh-=lr*dcost_dwh
      bh-=lr*dcost_dbh.sum(axis=0)
      
      #update weights for output layer
      wo-=lr*dcost_dwo
      bo-=lr*dcost_dbo.sum(axis=0)

    return wh,bh,wo,bo


In [66]:
hidden_nodes=4
output_labels=clas

np.random.seed(100)

#wh and bh are weights and bias respectively, for hidden layer
wh=np.random.rand(train_x_col,hidden_nodes)
print(wh.shape)
bh=np.random.randn(hidden_nodes)
print(bh.shape)

#wo and bo are weights and bias respectively, for output layer
wo=np.random.rand(hidden_nodes,output_labels)
print(wo.shape)
bo=np.random.randn(output_labels)
print(bo.shape)

lr=0.0000001    #lr=learning rate
epoch=50

(784, 4)
(4,)
(4, 10)
(10,)


In [67]:
#call the training function

wh,bh,wo,bo=ann.train(train_x,one_hot_label,wh,bh,wo,bo,lr,epoch)

In [None]:
#print the weights & the biases

print(wh)
print(bh)
print(wo)
print(bo)

[[0.54340494 0.27836939 0.42451759 0.84477613]
 [0.00471886 0.12156912 0.67074908 0.82585276]
 [0.13670659 0.57509333 0.89132195 0.20920212]
 ...
 [0.10588135 0.57201569 0.83075592 0.49385501]
 [0.68247216 0.5340313  0.41530805 0.25293348]
 [0.28482729 0.16487699 0.18607189 0.22663423]]
[ 0.5944669  -1.14899632 -0.81917881  0.18340385]
[[0.51523673 0.70676402 0.06715192 0.41435613 0.0565391  0.8741224
  0.39667021 0.32834421 0.93788682 0.28829348]
 [0.78926249 0.72849646 0.77581433 0.80760698 0.51456583 0.2594579
  0.57921785 0.7476789  0.73870223 0.0667448 ]
 [0.57070807 0.61423745 0.21101899 0.7000436  0.76646654 0.78090019
  0.81683006 0.70616957 0.02143111 0.60420069]
 [0.75640088 0.23068058 0.03104413 0.1290917  0.88122974 0.52328119
  0.55079941 0.21391375 0.59181768 0.01948783]]
[ 0.36524567 -0.13411274 -1.24241509  0.18093653  2.78613215 -1.20988845
 -0.33197091  0.4648701   0.24804543 -0.10539259]


In [20]:
#load the test dataset

data=pd.read_csv('sample_data/mnist_test.csv')

In [21]:
#split 'data' into feature set 'test_X' and label 'test_y'

test_y,test_X=np.hsplit(data,[1])
print(test_y.head())
print(test_X.head())

   7
0  2
1  1
2  0
3  4
4  1
   0  0.1  0.2  0.3  0.4  0.5  ...  0.662  0.663  0.664  0.665  0.666  0.667
0  0    0    0    0    0    0  ...      0      0      0      0      0      0
1  0    0    0    0    0    0  ...      0      0      0      0      0      0
2  0    0    0    0    0    0  ...      0      0      0      0      0      0
3  0    0    0    0    0    0  ...      0      0      0      0      0      0
4  0    0    0    0    0    0  ...      0      0      0      0      0      0

[5 rows x 784 columns]


In [22]:
#normalize the feature set 'test_X' to get 'test_x'

test_x=test_X.copy()
test_maximum=test_x.max().max()
print(test_maximum)

test_x=test_x.div(test_maximum)
test_x=test_x.astype('float64')
print (test_x.sample(5))

255
        0  0.1  0.2  0.3  0.4  0.5  ...  0.662  0.663  0.664  0.665  0.666  0.667
7532  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0
357   0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0
9606  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0
7418  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0
5250  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0

[5 rows x 784 columns]


In [25]:
#'test_x_row' = number of rows in feature set 'test_x'
test_x_row=test_x.shape[0]
print(test_x_row)

#'test_x_col' = number of columns in feature set 'test_x'
test_x_col=test_x.shape[1]
print(test_x_col)

9999
784


In [23]:
#convert the dataframes into numpy arrays

test_y=test_y.to_numpy()
test_x=test_x.to_numpy()

In [69]:
#create one_hot_label for test dataset

test_one_hot_label=np.empty([test_x_row,clas])
for i in range(clas):
    for j in range(test_x_row):
        if(test_y[j]==u[i]):
            test_one_hot_label[j,i]=1
        else:
            test_one_hot_label[j,i]=0
print(test_one_hot_label[:10]) 

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]


In [None]:
#predict outputs on test dataset

#phase 1
test_zh=ann.hypothesis(test_x,wh,bh)
test_ah=ann.sigmoid(test_zh)
    
#phase 2
test_zo=ann.hypothesis(test_ah,wo,bo)
test_ao=ann.softmax(test_zo)

print(test_ao)

[[3.91716675e-03 6.93755065e-38 1.56613945e-01 ... 1.00913585e-24
  2.02601127e-02 8.19205468e-01]
 [3.91716745e-03 6.93756835e-38 1.56613947e-01 ... 1.00913751e-24
  2.02601162e-02 8.19205462e-01]
 [3.91716675e-03 6.93755065e-38 1.56613945e-01 ... 1.00913585e-24
  2.02601127e-02 8.19205468e-01]
 ...
 [3.91716675e-03 6.93755065e-38 1.56613945e-01 ... 1.00913585e-24
  2.02601127e-02 8.19205468e-01]
 [3.91716675e-03 6.93755065e-38 1.56613945e-01 ... 1.00913585e-24
  2.02601127e-02 8.19205468e-01]
 [3.91716675e-03 6.93755065e-38 1.56613945e-01 ... 1.00913585e-24
  2.02601127e-02 8.19205468e-01]]


In [None]:
#calculate accuracy on test dataset

test_accuracy = 0
for col in range(0, 10):
    for row in range(test_x_row):
        if test_one_hot_label[row][col] == 1 and test_ao[row][col] >= 0.5:
            test_accuracy += 1
test_accuracy = test_accuracy/test_x_row
print(test_accuracy)

0.11351135113511351
