In [22]:
"""
Author: Shijie Wang
ID: 2016010539
"""
import numpy as np
import time

## Network architecture
NUM_INPUT = 784  # Number of input neurons
NUM_OUTPUT = 10  # Number of output neurons
NUM_CHECK = 5  # Number of examples on which to check the gradient

## Hyperparameters
NUM_HIDDEN = 50
LEARNING_RATE = 0.04
BATCH_SIZE = 128
NUM_EPOCH = 80

print("NUM_HIDDEN: ", NUM_HIDDEN)
print("LEARNING_RATE: ", LEARNING_RATE)
print("BATCH_SIZE: ", BATCH_SIZE)
print("NUM_EPOCH: ", NUM_EPOCH)


NUM_HIDDEN:  50
LEARNING_RATE:  0.04
BATCH_SIZE:  128
NUM_EPOCH:  80


In [23]:
# Given a vector w containing all the weights and biased vectors, extract
# and return the individual weights and biases W1, b1, W2, b2.
def unpack (w):
    W1 = np.reshape(w[:NUM_INPUT * NUM_HIDDEN],(NUM_INPUT,NUM_HIDDEN))
    w = w[NUM_INPUT * NUM_HIDDEN:]
    b1 = np.reshape(w[:NUM_HIDDEN], NUM_HIDDEN)
    w = w[NUM_HIDDEN:]
    W2 = np.reshape(w[:NUM_HIDDEN*NUM_OUTPUT], (NUM_HIDDEN,NUM_OUTPUT))
    w = w[NUM_HIDDEN*NUM_OUTPUT:]
    b2 = np.reshape(w,NUM_OUTPUT)
    return W1, b1, W2, b2

# Given individual weights and biases W1, b1, W2, b2, concatenate them and
# return a vector w containing all of them.
def pack (W1, b1, W2, b2):
    W1_ = np.reshape(W1,NUM_INPUT*NUM_HIDDEN)
    # print(W1_.shape)
    W2_ = np.reshape(W2,NUM_HIDDEN*NUM_OUTPUT)
    # print(W2_.shape)
    w = np.concatenate((W1_,b1, W2_, b2))
    # print(w.shape)
    return w

# Load the images and labels from a specified dataset (train or test).
def loadData (which):
    images = np.load("./data/mnist_{}_images.npy".format(which))
    labels = np.load("./data/mnist_{}_labels.npy".format(which))
    return images, labels

def ReLU(z):
    mask = z>0
    a = z * mask
    return a

def Softmax(z):
    e = np.exp(z)
    s = np.sum(e,1).reshape(-1,1)
    return e/s

def CELoss(y_out,y):
    return -(np.sum(y*np.log(y_out)))/BATCH_SIZE

def sgn(x):
    return (x>0)+0

def shuffleDataset(x,y):
    permutation = np.random.permutation(y.shape[0])
    shuffled_x = x[permutation, :]
    shuffled_y = y[permutation, :]
    return shuffled_x, shuffled_y    

In [24]:
## 1. Forward Propagation
# Given training images X, associated labels Y, and a vector of combined weights
# and bias terms w, compute and return the cross-entropy (CE) loss.

def fCE (X, Y, w):
    # print(X.shape)
    ## your code here
    W1, b1, W2, b2 = unpack(w)
    loss = 0.0
    ## W1(784,50) b1(50) W2(50,10) b2(10)
    ## X(N,784) Y(N,10)
    z1 = np.matmul(X,W1) + b1   #z1(N,50)
    h1 = ReLU(z1)               #h1(N,50)
    z2 = np.matmul(h1,W2) + b2  #z2(N,10)
    a2 = Softmax(z2)
    loss = CELoss(a2,Y)
    y_pred = np.argmax(a2,1).reshape(-1)
    y_act = np.argmax(Y,1).reshape(-1)
    acc = np.sum(y_pred==y_act)/len(y_pred)
    return loss,z1,h1,z2,a2,acc

## 2. Backward Propagation
# Given training images X, associated labels Y, and a vector of combined weights
# and bias terms w, compute and return the gradient of fCE. 
def gradCE (X, Y, w, z1, h1, z2, a2):
    W1, b1, W2, b2 = unpack(w)
    ## your code here
    delta_W_2 = (np.matmul(h1.T,(a2-Y)))/BATCH_SIZE     #(50,10)
    delta_b_2 = np.mean(a2-Y,0)                         #(10)
    delta_W_1 = (np.matmul(X.T,np.matmul((a2-Y),W2.T)*sgn(z1)))/BATCH_SIZE      #(784,50)
    delta_b_1 = np.mean((np.matmul((a2-Y),W2.T)*sgn(z1)),0)                     #(50)
    delta = pack(delta_W_1, delta_b_1, delta_W_2, delta_b_2)
    return delta

## 3. Parameter Update
# Given training and testing datasets and an initial set of weights/biases,
# train the NN.
def train(trainX, trainY, testX, testY, w):
    ## your code here
    for epoch in range(NUM_EPOCH):
        train_loss = 0
        train_acc = 0
        shuffled_trainX, shuffled_trainY = shuffleDataset(trainX, trainY)
        splits_x = np.array_split(shuffled_trainX, shuffled_trainX.shape[0] / BATCH_SIZE)
        splits_y = np.array_split(shuffled_trainY, shuffled_trainY.shape[0] / BATCH_SIZE)
        for x, y in zip(splits_x, splits_y):
            train_loss,z1,h1,z2,a2,train_acc = fCE(x, y, w)
            delta = gradCE(x, y, w, z1, h1, z2, a2)
            w -= LEARNING_RATE*delta
        print("Epoch {}: ".format(epoch + 1))
        print("train loss: {:.3f}, train accuracy: {:.3f}".format(train_loss, train_acc))
        test_loss, _, _, _, _, test_acc = fCE(testX, testY, w)
        print("test loss: {:.3f}, test accuracy: {:.3f}\n".format(test_loss, test_acc))
    pass


In [25]:
if __name__ == "__main__":
    # Load data
    start_time = time.time()
    trainX, trainY = loadData("train")
    testX, testY = loadData("test")

    print("shape(trainX): ", trainX.shape)
    print("shape(testX): ", testX.shape)
    print("shape(trainY): ", trainY.shape)
    print("shape(testY): ", testY.shape)

    # Initialize weights randomly
    W1 = 2*(np.random.random(size=(NUM_INPUT, NUM_HIDDEN))/NUM_INPUT**0.5) - 1./NUM_INPUT**0.5
    b1 = 0.01 * np.ones(NUM_HIDDEN)
    W2 = 2*(np.random.random(size=(NUM_HIDDEN, NUM_OUTPUT))/NUM_HIDDEN**0.5) - 1./NUM_HIDDEN**0.5
    b2 = 0.01 * np.ones(NUM_OUTPUT)

    w = pack(W1, b1, W2, b2)
    print("Shape of w:",w.shape)
    
    # # Train the network and report the accuracy on the training and test set.
    train(trainX, trainY, testX, testY, w)

shape(trainX):  (10000, 784)
shape(testX):  (5000, 784)
shape(trainY):  (10000, 10)
shape(testY):  (5000, 10)
Shape of w: (39760,)
Epoch 1: 
train loss: 1.417, train accuracy: 0.797
test loss: 57.339, test accuracy: 0.751

Epoch 2: 
train loss: 0.865, train accuracy: 0.805
test loss: 32.869, test accuracy: 0.817

Epoch 3: 
train loss: 0.619, train accuracy: 0.859
test loss: 24.437, test accuracy: 0.846

Epoch 4: 
train loss: 0.513, train accuracy: 0.875
test loss: 20.502, test accuracy: 0.866

Epoch 5: 
train loss: 0.461, train accuracy: 0.875
test loss: 18.326, test accuracy: 0.874

Epoch 6: 
train loss: 0.409, train accuracy: 0.875
test loss: 16.958, test accuracy: 0.884

Epoch 7: 
train loss: 0.305, train accuracy: 0.906
test loss: 15.883, test accuracy: 0.890

Epoch 8: 
train loss: 0.378, train accuracy: 0.891
test loss: 15.190, test accuracy: 0.893

Epoch 9: 
train loss: 0.295, train accuracy: 0.906
test loss: 14.671, test accuracy: 0.896

Epoch 10: 
train loss: 0.320, train accur

#### This is the hyperparameters I choose:

NUM_HIDDEN = 50

LEARNING_RATE = 0.04

BATCH_SIZE = 128

NUM_EPOCH = 80


#### In epoch 80, I get:

train loss: 0.081, train accuracy: 0.977

test loss: 8.617, test accuracy: 0.937