# Table of contents:

1. Data
2. Splitting the data
3. Functions
4. Train
5. Evaluate

### Data 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("mnist_train.csv")
data.head(10)

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
m, n = data.shape
print(m, n)

60000 785


There are 60,000 records and 785 columns... let's split the dataset into 45000 for training and 15000 for testing 

In [7]:
# We have to change pandas dataframe to numpy array
ds = np.array(data)
ds

array([[5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [6, 0, 0, ..., 0, 0, 0],
       [8, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Splitting the dataset

In [8]:
# Training dataset
data_train = ds[:45000].T
X_train = data_train[1:n]
Y_train = data_train[0]
# normalizing
X_train = X_train/255

# Testing dataset
data_test = ds[45000:m].T
X_test = data_test[1:n]
Y_test = data_test[0]
# normalizing 
X_test = X_test/255

In [71]:
# how many nodes do we want for per layer: 
n_nodes = 10

In [72]:
X_train.shape[0]

784

### Functions

In [63]:
def init_weights():
    w1 = np.random.rand(n_nodes, X_train.shape[0]) - 0.5
    b1 = np.random.rand(n_nodes, 1) - 0.5
    w2 = np.random.rand(n_nodes, n_nodes) - 0.5
    b2 = np.random.rand(n_nodes, 1) - 0.5
    
    return w1, b1, w2, b2

In [64]:
def ReLu(Z):
    return np.maximum(Z, 0)

def Softmax(Z):
    out = np.exp(Z)/sum(np.exp(Z))
    return out

In [65]:
def forward_prop(w1, b1, w2, b2, X):
    z1 = w1.dot(X) + b1
    a1 = ReLu(z1)
    z2 = w2.dot(a1) + b2
    a2 = Softmax(z2)
    
    return z1, a1, z2, a2

In [66]:
def ReLu_deriv(Z):
    return Z>0

def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    
    return one_hot_Y

In [67]:
def backprop(z1, a1, z2, a2, w1, w2, X, Y):
    one_hot_Y = one_hot(Y)
    
    dz2 = a2 - one_hot_Y
    dw2 = 1 / m * dz2.dot(a1.T)
    db2 = 1 / m * np.sum(dz2)
    
    dz1 = w2.T.dot(dz2) * ReLu_deriv(z1)
    dw1 = 1 / m * dz1.dot(X.T)
    db1 = 1 / m * np.sum(dz1)
    
    return dw1, db1, dw2, db2

In [68]:
def update_weights(dw1, db1, dw2, db2, w1, b1, w2, b2, alpha):
    w1 = w1 - alpha*dw1
    b1 = b1 - alpha*db1
    
    w2 = w2 - alpha*dw2
    b2 = b2 - alpha*db2
    
    return w1, b1, w2, b2

def get_predictions(a2):
    return np.argmax(a2, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    
    return np.sum(predictions == Y) / Y.size

In [69]:
def gradient_descent(X, Y, alpha, interations):
    w1, b1, w2, b2 = init_weights()
    
    for i in range(interations):
        z1, a1, z2, a2 = forward_prop(w1, b1, w2, b2, X)
        dW1, db1, dW2, db2 = backprop(z1, a1, z2, a2, w1, w2, X, Y)
        w1, b1, w2, b2 = update_weights(dW1, db1, dW2, db2, w1, b1, w2, b2, alpha)
        
        predictions = get_predictions(a2)
        accuracy = get_accuracy(predictions, Y)
        
        print("iterations: ", i)
        print("Accuracy: ", accuracy)
        
    return w1, b1, w2, b2

### Training

In [75]:
w1, b1, w2, b2 = gradient_descent(X_train, Y_train, 0.05, 5000)

[3 3 7 ... 7 3 7] [5 0 4 ... 8 4 5]
iterations:  0
Accuracy:  0.11346666666666666
[3 3 7 ... 7 3 7] [5 0 4 ... 8 4 5]
iterations:  1
Accuracy:  0.11002222222222222
[3 3 1 ... 7 3 0] [5 0 4 ... 8 4 5]
iterations:  2
Accuracy:  0.1087111111111111
[3 3 1 ... 7 3 0] [5 0 4 ... 8 4 5]
iterations:  3
Accuracy:  0.10722222222222222
[3 3 1 ... 7 3 0] [5 0 4 ... 8 4 5]
iterations:  4
Accuracy:  0.10804444444444444
[3 3 1 ... 7 3 0] [5 0 4 ... 8 4 5]
iterations:  5
Accuracy:  0.10908888888888889
[3 3 1 ... 7 3 0] [5 0 4 ... 8 4 5]
iterations:  6
Accuracy:  0.10984444444444444
[3 3 1 ... 7 3 0] [5 0 4 ... 8 4 5]
iterations:  7
Accuracy:  0.11135555555555555
[3 3 1 ... 7 3 0] [5 0 4 ... 8 4 5]
iterations:  8
Accuracy:  0.11291111111111111
[3 3 1 ... 7 3 0] [5 0 4 ... 8 4 5]
iterations:  9
Accuracy:  0.11393333333333333
[3 8 1 ... 7 3 0] [5 0 4 ... 8 4 5]
iterations:  10
Accuracy:  0.11533333333333333
[3 8 1 ... 7 3 0] [5 0 4 ... 8 4 5]
iterations:  11
Accuracy:  0.11702222222222222
[6 8 1 ... 7 3 

### Evaluate on the test dataset

In [76]:
z1_test, a1_test, z2_test, a2_test = forward_prop(w1, b1, w2, b2, X_test)
predictions_test = get_predictions(a2_test)
accuracy_test = get_accuracy(predictions_test, Y_test)
print("Test Accuracy: ", accuracy_test)

[3 1 1 ... 5 6 8] [3 1 1 ... 5 6 8]
Test Accuracy:  0.9042666666666667
