In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def preprocess(dataset, threshold=127, num_classes=10):
    """Split a labelled pixel table into binarised features and one-hot targets.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain an integer ``'label'`` column; every other column is
        treated as a pixel intensity. The frame is not modified.
    threshold : scalar, default 127
        Pixels ``>= threshold`` become 1, all others become 0.
    num_classes : int, default 10
        Width of the one-hot encoding; labels index rows of an identity
        matrix of this size.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, n_pixels)
        0/1 values in the same dtype as the pixel columns.
    Y : numpy.ndarray, shape (n_samples, num_classes)
        One-hot rows (float).

    The two shapes are printed as a quick sanity check.
    """
    labels = dataset['label'].values
    Y = np.eye(num_classes)[labels]  # row i of the identity is the one-hot code of class i

    pixels = dataset.drop('label', axis=1).values
    # Build a new array rather than assigning into `pixels`: the array exposed
    # by a DataFrame can be a read-only view (pandas Copy-on-Write), in which
    # case in-place masking raises instead of binarising.
    X = (pixels >= threshold).astype(pixels.dtype)

    print(X.shape, Y.shape)
    return X, Y

In [3]:
# Read the training split (path is relative to the working directory) and turn
# it into binarised pixel rows plus one-hot labels via preprocess().
train_dataset = pd.read_csv("mnist_train.csv")
X_train, Y_train = preprocess(train_dataset)

(60000, 784) (60000, 10)


In [4]:
# Read the held-out split and run it through the same preprocess() step as the
# training data, so both use identical binarisation and label encoding.
test_dataset = pd.read_csv("mnist_test.csv")
X_test, Y_test = preprocess(test_dataset)

(10000, 784) (10000, 10)


In [5]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    Evaluated as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp`` so that
    large negative inputs do not overflow inside ``exp``; the result still
    tends to 0 for very negative ``x`` and to 1 for very positive ``x``.

    Parameters
    ----------
    x : scalar or array_like

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Same shape as ``x``.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), computed stably.
    return np.exp(-np.logaddexp(0, -x))

def softmax(X, axis=None):
    """Softmax of ``X``, computed without overflow.

    Parameters
    ----------
    X : array_like
        Scores (logits).
    axis : int or None, default None
        Axis along which the outputs sum to 1. ``None`` normalises over the
        whole array, which is what a single ``(1, n)`` row needs; pass
        ``axis=1`` (or ``-1``) to normalise each row of a batch separately.

    Returns
    -------
    numpy.ndarray
        Same shape as ``X``; non-negative and summing to 1 along ``axis``.
    """
    X = np.asarray(X, dtype=float)
    if X.size == 0:
        # Nothing to normalise; max() of an empty array would raise.
        return X.copy()

    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps every exponent <= 0, so exp() cannot overflow to inf (inf/inf = nan).
    shifted = X - np.max(X, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

In [17]:
# Layer widths: input and output sizes are read off the data arrays, the two
# hidden-layer sizes are fixed here.
D_in = X_train.shape[1]
H1 = 65
H2 = 15
D_out = Y_train.shape[1]

# η scales every gradient in the update step of the training loop;
# epochs is the number of full passes over X_train.
η = 0.001
epochs = 40

In [18]:
# Draw the initial parameters from a locally seeded generator so that a fresh
# "Restart & Run All" starts training from the same point every time. The
# distribution (standard normal) and all shapes are unchanged.
_init_rng = np.random.default_rng(0)

# Weights: one (fan_in, fan_out) matrix per layer.
W1 = _init_rng.standard_normal((D_in, H1))
W2 = _init_rng.standard_normal((H1, H2))
W3 = _init_rng.standard_normal((H2, D_out))

# Biases: one value per unit of the layer they feed.
B1 = _init_rng.standard_normal(H1)
B2 = _init_rng.standard_normal(H2)
B3 = _init_rng.standard_normal(D_out)

In [19]:
def test():
    """Return the classification accuracy of the current network on the test split.

    Reads the module-level parameters ``W1, W2, W3, B1, B2, B3`` and the data
    ``X_test, Y_test``; none of them is modified.

    Returns
    -------
    numpy.float64
        Fraction of rows of ``X_test`` whose highest-scoring class equals the
        class marked in the matching one-hot row of ``Y_test``.
    """
    # forward — all test rows in one matrix product per layer; the biases
    # broadcast across rows.
    out1 = sigmoid(X_test @ W1 + B1)
    out2 = sigmoid(out1 @ W2 + B2)
    net3 = out2 @ W3 + B3

    # Softmax preserves the ordering of the scores within a row, so the
    # arg-max of the raw outputs is already the predicted class.
    predicted = np.argmax(net3, axis=1)
    expected = np.argmax(Y_test, axis=1)

    # Score every sample (not just the last one seen) against the test labels.
    return np.mean(predicted == expected)

In [20]:
# Per-sample gradient descent: every epoch walks X_train once in stored order
# and updates all six parameter arrays after each individual sample.
for epoch in range(epochs):
    # Predictions made during this epoch, kept only for the loss printed below.
    Y_hat = []
    for x, y in zip(X_train, Y_train):

        # forward
        # x becomes a column so that x.T is a single row to multiply with W1.
        x = x.reshape(-1, 1)
        net1 = x.T @ W1 + B1
        out1 = sigmoid(net1)

        net2 = out1 @ W2 + B2
        out2 = sigmoid(net2)

        # The output layer has no activation of its own; softmax is applied
        # directly to its pre-activations.
        net3 = out2 @ W3 + B3
        out3 = net3
        y_hat = softmax(out3)

        Y_hat.append(y_hat.T)

        # back propagation
        
        # Error signal at the output layer, i.e. 2 * (y_hat - y).
        # NOTE(review): for softmax + cross-entropy the gradient w.r.t. the
        # scores is (y_hat - y); the factor 2 then just doubles the effective
        # step size — confirm this is intended.
        error = -2 * (y - y_hat)
        grad_W3 = out2.T @ error
        grad_B3 = error

        # Push the error back through W3 (still the pre-update values) and the
        # sigmoid, whose derivative in terms of its output is out * (1 - out).
        error = error @ W3.T * out2 * (1 - out2)
        grad_W2 = out1.T @ error
        grad_B2 = error

        error = error @ W2.T * out1 * (1 - out1)
        grad_W1 = x @ error
        grad_B1 = error

        # update
        # Plain gradient step with step size η; gradients were all computed
        # above from the parameters as they were before this step.

        W1 = W1 - η * grad_W1
        W2 = W2 - η * grad_W2
        W3 = W3 - η * grad_W3

        B1 = B1 - η * grad_B1
        B2 = B2 - η * grad_B2
        B3 = B3 - η * grad_B3
    
    # Epoch loss: -sum(y * log10(y_hat)) over every training sample, using the
    # predictions collected while the weights were still changing.
    # NOTE(review): uses a base-10 log and a hard-coded width of 10 — confirm
    # both are intended rather than np.log / D_out.
    Y_hat = np.array(Y_hat).reshape(-1, 10)
    loss = np.sum(-np.sum(np.multiply(Y_train, np.log10(Y_hat))))
    # 'acc' is whatever test() returns for the parameters at the end of the epoch.
    print('loss:', loss, 'acc:', test())
    
print('train completed!')

loss: 43336.862285956224 acc: 0.635
loss: 24767.78224841616 acc: 0.7423
loss: 19535.30133986915 acc: 0.7892
loss: 16778.391219606332 acc: 0.8144
loss: 15026.073646438233 acc: 0.833
loss: 13785.65922308665 acc: 0.8434
loss: 12841.529705157518 acc: 0.8503
loss: 12088.596924921303 acc: 0.8569
loss: 11469.734321682881 acc: 0.8608
loss: 10949.11734684017 acc: 0.8661
loss: 10501.832914677298 acc: 0.8703
loss: 10111.187649131349 acc: 0.8734
loss: 9765.267583963338 acc: 0.8764
loss: 9454.822196364858 acc: 0.8783
loss: 9173.251661109536 acc: 0.88
loss: 8916.038673849183 acc: 0.8835
loss: 8679.768325656987 acc: 0.8877
loss: 8461.55969722198 acc: 0.89
loss: 8258.980164343642 acc: 0.892
loss: 8070.004056355115 acc: 0.8944
loss: 7892.962433265421 acc: 0.8967


KeyboardInterrupt: 