In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

In [2]:
def load_data(dataframe, label):
    X = dataframe.drop(label, axis=1)/255
    y = dataframe[label]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
    X_train = np.array(X_train).T
    X_test = np.array(X_test).T
    y_train = np.array(y_train).T
    y_test = np.array(y_test).T
    return X_train, X_test, y_train, y_test

In [3]:
class NeuralNetwork:
    def __init__(self, X, y, epoch = 200, lr = 5*10**(-2)):
        self.input = X
        self.target = y
        self.epoch = epoch
        self.learning_rate = lr
        self.m = X.shape[1]
        
    def init_weights(self):
        np.random.seed(138)
        self.W1 = np.random.randn(64, self.input.shape[0]) * np.sqrt(1./self.input.shape[0])
        self.W2 = np.random.randn(10, self.W1.shape[0]) * np.sqrt(1./self.W1.shape[0])
        #self.W3 = np.random.randn((10, self.W2.shape[0]))
        
        self.b1 = np.zeros((64, 1)) * np.sqrt(1./self.input.shape[0])
        self.b2 = np.zeros((10, 1)) * np.sqrt(1./self.W1.shape[0])
        #self.b3 = np.zeros((10, 1))
    
    def sigmoid(self, Z):
        s = 1/(1 + np.exp(-Z))
        return s
    
    def softmax(self, Z):
        exponents = np.exp(Z - np.max(Z, axis=0,))
        return exponents/np.sum(exponents, axis=0)
    
    def onehot(self, Y):
        one_hot_Y = np.zeros((Y.shape[0], 10))
        one_hot_Y[np.arange(Y.shape[0]), Y] = 1
        one_hot_Y = one_hot_Y.T
        return one_hot_Y
        
    def forward_prop(self):
        self.one_hot_Y = self.onehot(self.target)
        
        assert self.W1.shape[1] == self.input.shape[0]
        self.Z1 = self.W1.dot(self.input) + self.b1
        self.A1 = self.sigmoid(self.Z1)
        
        assert self.W2.shape[1] == self.A1.shape[0]
        self.Z2 = self.W2.dot(self.A1) + self.b2
        self.A2 = self.softmax(self.Z2)
        
    def back_prop(self):
        self.dZ2 = self.A2 - self.one_hot_Y
        self.dW2 = (1./self.m) * np.matmul(self.dZ2, self.A1.T)
        self.db2 = (1./self.m) * np.sum(self.dZ2, axis=1, keepdims=True)
        
        self.dA1 = np.matmul(self.W2.T, self.dZ2)
        self.dZ1 = self.dA1 * self.sigmoid(self.Z1) * (1 - self.sigmoid(self.Z1))
        self.dW1 = (1./self.m) * np.matmul(self.dZ1, self.input.T)
        self.db1 = (1./self.m) * np.sum(self.dZ1, axis=1, keepdims=True)
        
        self.W2 = self.W2 - self.learning_rate * self.dW2
        self.b2 = self.b2 - self.learning_rate * self.db2
        
        self.W1 = self.W1 - self.learning_rate * self.dW1
        self.b1 = self.b1 - self.learning_rate * self.db1
        
    
    def cost(self):
        L_sum = np.sum(np.multiply(self.target, np.log(self.A2)))
        return -(1./self.m) * L_sum
    
    def train_model(self):
        self.init_weights()
        for i in np.arange(self.epoch):
            self.forward_prop()
            self.back_prop()
            if(i%20 == 0):
                print('training loss on ',i,'th iteration : ',self.cost())
                
    def predict(self, X, y):
        one_hot_Y = self.onehot(y)
        
        assert self.W1.shape[1] == X.shape[0]
        Z1 = self.W1.dot(X) + self.b1
        A1 = self.sigmoid(Z1)
        
        assert self.W2.shape[1] == A1.shape[0]
        Z2 = self.W2.dot(A1) + self.b2
        A2 = self.softmax(Z2)
        predictions = np.argmax(A2, axis=0)
        return predictions


In [4]:
df = pd.read_csv('../input/digit-recognizer/train.csv')
df

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41997,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41998,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
X_train, X_test, y_train, y_test = load_data(df, 'label')

In [6]:
nn = NeuralNetwork(X_train, y_train)

In [7]:
nn.train_model()

training loss on  0 th iteration :  111.1230180143394
training loss on  20 th iteration :  102.93018796990356
training loss on  40 th iteration :  102.94640813298652
training loss on  60 th iteration :  103.112697923747
training loss on  80 th iteration :  103.36859709065529
training loss on  100 th iteration :  103.7323881061086
training loss on  120 th iteration :  104.22519177394656
training loss on  140 th iteration :  104.86839646111085
training loss on  160 th iteration :  105.6806689274895
training loss on  180 th iteration :  106.67503878066626


In [8]:
from sklearn.metrics import confusion_matrix, classification_report

In [9]:
predictions = nn.predict(X_test, y_test)
labels = y_test

print(confusion_matrix(predictions, labels))

[[389   0  18   4   4  34  13   0  17  10]
 [  0 470  61  43  24  41  15  42  87  21]
 [  0   0 271  15   1   3  14   3   3   3]
 [  5   0  12 403   0 104   1   0  65  13]
 [  1   0   6   2 228  14   0   6   2   9]
 [  1   0   0   2   0  67   1   0   1   0]
 [ 11   1  27   5  24  30 357   0  11   2]
 [  0   0  17  18  11  16   1 380  14 102]
 [  1   0   6   5   3  10   0   0 195   1]
 [  0   0   2   9 102  20   0   7   8 255]]


In [10]:
print(classification_report(predictions, labels))

              precision    recall  f1-score   support

           0       0.95      0.80      0.87       489
           1       1.00      0.58      0.74       804
           2       0.65      0.87      0.74       313
           3       0.80      0.67      0.73       603
           4       0.57      0.85      0.69       268
           5       0.20      0.93      0.33        72
           6       0.89      0.76      0.82       468
           7       0.87      0.68      0.76       559
           8       0.48      0.88      0.62       221
           9       0.61      0.63      0.62       403

    accuracy                           0.72      4200
   macro avg       0.70      0.77      0.69      4200
weighted avg       0.80      0.72      0.74      4200

