In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [4]:
#read data
data = pd.read_csv('./train.csv')
#data.head()

In [5]:
data = np.array(data)
m, n = data.shape
#print(m, n)
np.random.shuffle(data)

data_dev = data[0: 1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1: n]
X_dev = X_dev / 255 # normalizing

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1: n]
X_train = X_train / 255 # normalizing
#_,m_train = X_train.shape


In [6]:
#first column
#X_train[:,0].shape

In [7]:
#initialize parameters
def init_params():
    W_1 = np.random.rand(10, 784) - 0.5
    B_1 = np.random.rand(10, 1) - 0.5
    W_2 = np.random.rand(10, 10) - 0.5
    B_2 = np.random.rand(10, 1) - 0.5
    return W_1, B_1, W_2, B_2

def relu(Z):
    return np.maximum(Z, 0)

def deriv_relu(Z):
    return Z > 0

def softmax(Z):
    return np.exp(Z)/ sum(np.exp(Z)) #sum 

def forward_prop(W_1, B_1, W_2, B_2, X): 
    Z_1 = W_1.dot(X) + B_1
    A_1 = relu(Z_1)
    Z_2 = W_2.dot(A_1) + B_2
    A_2 = softmax(Z_2)
    return Z_1, A_1, Z_2, A_2

#for entire result matrix Y, initialize with one-hot function
def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    print(Y.size, Y.max())
    one_hot_Y[np.arange(Y.size), Y] = 1
    return one_hot_Y.T

def back_prop(Z_1, A_1, Z_2, A_2, W_2, X, Y):
    #number of samples
    m = Y.size
    one_hot_Y = one_hot(Y)
    dZ_2 = A_2 - one_hot_Y
    dW_2 = 1 / m * dZ_2.dot(A_1.T)
    dB_2 = 1 / m * np.sum(dZ_2)
    dZ_1 = W_2.T.dot(dZ_2) * deriv_relu(Z_1)
    dW_1 = 1 / m * dZ_1.dot(X.T)
    dB_1 = 1 / m * np.sum(dZ_1)
    return dW_1, dB_1, dW_2, dB_2

def update_params(W_1, B_1, W_2, B_2, dW_1, dB_1, dW_2, dB_2, learning_rate):
    W_1 = W_1 - learning_rate * dW_1
    B_1 = B_1 - learning_rate * dB_1
    W_2 = W_2 - learning_rate * dW_2
    B_2 = B_2 - learning_rate * dB_2
    return W_1, B_1, W_2, B_2

In [8]:
def get_predictions(A_2):
    return np.argmax(A_2, 0)

def get_accuracy(predicted, expected):
    print(predicted, expected)
    return(np.sum(predicted == expected) / expected.size)


def gradient_descent(X, Y, iterations, learning_rate):
    W_1, B_1, W_2, B_2 = init_params()
    for i in range(iterations):
        Z_1, A_1, Z_2, A_2 = forward_prop(W_1, B_1, W_2, B_2, X)
        dW_1, dB_1, dW_2, dB_2 = back_prop(Z_1, A_1, Z_2, A_2, W_2, X, Y)
        W_1, B_1, W_2, B_2 = update_params(W_1, B_1, W_2, B_2, dW_1, dB_1, dW_2, dB_2, learning_rate)
        if i % 10 == 0:
            print("Iteration:", i)
            predicted = get_predictions(A_2)
            print("Accuracy:", get_accuracy(predicted, Y))
    return W_1, B_1, W_2, B_2

In [9]:
W_1, B_1, W_2, B_2 = gradient_descent(X_train, Y_train, 500, 0.1)

41000 9
Iteration: 0
[5 2 2 ... 5 2 5] [6 2 6 ... 7 5 8]
Accuracy: 0.06682926829268293
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
Iteration: 10
[1 2 2 ... 3 2 4] [6 2 6 ... 7 5 8]
Accuracy: 0.13707317073170733
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
Iteration: 20
[6 2 2 ... 3 2 4] [6 2 6 ... 7 5 8]
Accuracy: 0.1964878048780488
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
Iteration: 30
[6 2 2 ... 3 2 4] [6 2 6 ... 7 5 8]
Accuracy: 0.2405609756097561
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
Iteration: 40
[6 2 2 ... 3 1 4] [6 2 6 ... 7 5 8]
Accuracy: 0.28524390243902437
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
Iteration: 50
[6 2 6 ... 9 1 3] [6 2 6 ... 7 5 8]
Accuracy: 0.34817073170731705
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
41000 9
Iteration: 60
[6 2 6 ... 7 1 3] [6 