In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [13]:
train = pd.read_csv("/kaggle/input/digit-recognizer/train.csv")
train.info()

train = np.array(train)

test = train[0:1000]
train = train[1000:]

train = train.T
test = test.T

x_train = train[1:785]
y_train = train[0]

x_test = test[1:785]
y_test = test[0]

x_test = x_test / 255
x_train = x_train / 255

print(x_train.shape)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB
(784, 41000)


In [14]:
# W = weights
# B = bias
# A = prev layer output
# Z = layer * weights + bias, w * a + b

def inital_weights_and_bias():
    w1 = np.random.rand(10, 784) - 0.5
    w2 = np.random.rand(10, 10) - 0.5

    b1 = np.random.rand(10, 1) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    
    return w1, b1, w2, b2


def relu(Z):
    return np.maximum(0, Z)

def softmax(Z):
    exp = np.exp(Z - np.max(Z)) 
    return exp / exp.sum(axis=0)    
    
def forward_prop(w1, b1, w2, b2, x):
    z1 = w1.dot(x) + b1
    a1 = relu(z1)
    
    z2 = w2.dot(a1) + b2
    a2 = softmax(z2)
    return z1, a1, z2, a2

def one_hot(Y):
    one_hot_y = np.zeros((Y.size, Y.max() + 1))
    one_hot_y[np.arange(Y.size), Y] = 1
    
    return one_hot_y.T

def relu_deriv(Z):
    return Z > 0

def back_prop(z1, a1, z2, a2, w2, X, Y):
    m = Y.size
    one_hot_y = one_hot(Y)
    dz2 = 2 * (a2 - one_hot_y)
    
    dw2 = 1 / m * dz2.dot(a1.T)
    db2 = 1 / m * np.sum(dz2, 1)
    
    dz1 = w2.T.dot(dz2) * relu_deriv(z1)
    
    dw1 = 1 / m * dz1.dot(X.T)
    db1 = 1 / m * np.sum(dz2, 1)
    
    return dw1, db1, dw2, db2
    
def update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha):
    w1 -= alpha * dw1
    b1 -= alpha * np.reshape(db1, (10, 1))
    w2 -= alpha * dw2
    b2 -= alpha * np.reshape(db2, (10, 1))
    return w1, b1, w2, b2

def get_pred(A2):
    return np.argmax(A2, 0)

def get_acc(pred, y):
    return np.sum(pred == y) / y.size

In [15]:
def gradient_descent(x, y, iterations, alpha):
    w1, b1, w2, b2 = inital_weights_and_bias()
    
    for i in range(iterations):
        z1, a1, z2, a2 = forward_prop(w1, b1, w2, b2, x)
        dw1, db1, dw2, db2 = back_prop(z1, a1, z2, a2, w2, x, y)
        w1, b1, w2, b2 = update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha)
                    
        if (i+1) % 50 == 0:
            print("iteration: ", i + 1)
            print("accuracy: ", get_acc(get_pred(a2), y))
                                       
    return w1, b1, w2, b2

In [16]:
w1, b1, w2, b2 = gradient_descent(x_train, y_train, 500, 0.1)

iteration:  50
accuracy:  0.6073658536585366
iteration:  100
accuracy:  0.7289756097560975
iteration:  150
accuracy:  0.7862682926829269
iteration:  200
accuracy:  0.8149512195121951
iteration:  250
accuracy:  0.8372926829268292
iteration:  300
accuracy:  0.8507317073170731
iteration:  350
accuracy:  0.8585853658536585
iteration:  400
accuracy:  0.8644146341463415
iteration:  450
accuracy:  0.8712439024390244
iteration:  500
accuracy:  0.8736341463414634


In [17]:
_, _, _, a2 = forward_prop(w1, b1, w2, b2, x_test)
pred = get_pred(a2)

acc = get_acc(pred, y_test)

print("Test Accuracy: ", acc)

Test Accuracy:  0.879
