In [9]:
import numpy as np
import time

## Network architecture
NUM_INPUT = 784  # Number of input neurons
NUM_OUTPUT = 10  # Number of output neurons

## Hyperparameters
NUM_HIDDEN = 50
LEARNING_RATE = 0.05
BATCH_SIZE = 64
NUM_EPOCH = 40
N = 0

print("NUM_HIDDEN: ", NUM_HIDDEN)
print("LEARNING_RATE: ", LEARNING_RATE)
print("BATCH_SIZE: ", BATCH_SIZE)
print("NUM_EPOCH: ", NUM_EPOCH)


class ForwardPropagationRet:
    Z1 = 0
    H1 = 0
    Z2 = 0
    Y_hat = 0

class Delta:
    deltaW2 = 0
    deltab2 = 0
    deltaW1 = 0
    deltab1 = 0

# Load the images and labels from a specified dataset (train or test).
def loadData (which):
    images = np.load("./data/mnist_{}_images.npy".format(which))
    labels = np.load("./data/mnist_{}_labels.npy".format(which))
    return images, labels

## 1. Forward Propagation
# Given training images X, associated labels Y, and a vector of combined weights
# and bias terms w, compute the cross-entropy (CE) loss.
def fCE (X, Y, W1, b1, W2, b2):
    # print(X.shape)
    ## your code here
    X = np.array(X).reshape(NUM_INPUT,1)
    Z1 = np.dot(W1,X) + b1
    H1 = ReLU(Z1)
    Z2 = np.dot(W2,H1) + b2
    Y_hat = Softmax(Z2)

    loss = LOSS(BATCH_SIZE, Y, Y_hat)
    ret = ForwardPropagationRet()
    ret.Y_hat = Y_hat
    ret.H1 = H1
    ret.Z2 = Z2
    ret.Z1 = Z1

    return loss,ret

    

## 2. Backward Propagation
# Given training images X, associated labels Y, and a vector of combined weights
# and bias terms w, compute the gradient of fCE. 
def gradCE (X, Y, W1, b1, W2, b2, fpRes):
    ## your code here
    deltaW2 = 0
    deltab2 = 0
    deltaW1 = 0
    deltab1 = 0
    Y_hat = fpRes.Y_hat
    H1 = fpRes.H1
    Z1 = fpRes.Z1

    for i in range(BATCH_SIZE):
        deltaW2 = deltaW2 + (Y_hat[i] - Y[i]) * H1[i].T
        deltab2 = deltab2 + Y_hat[i] - Y[i]
        deltaW1 = deltaW1 + W2.T * (Y_hat[i] - Y[i]) * np.sign(Z1[i]) * X[i].T
        deltab1 = deltab1 + W2.T * (Y_hat[i] - Y[i]) * np.sign(Z1[i])

    #axis = 0 表示列， axis = 1 表示行
    delta = Delta()
    delta.deltaW2 = 1/BATCH_SIZE * deltaW2
    delta.deltaW1 = 1/BATCH_SIZE * deltaW1
    delta.deltab2 = 1/BATCH_SIZE * deltab2
    delta.deltab1 = 1/BATCH_SIZE * deltab1

    return delta
    

## 3. Parameter Update
# Given training and testing datasets, train the NN.
def train(trainX, trainY, testX, testY):
    #  Initialize weights randomly
    W1 = np.random.randn(392,784)
    W2 = np.random.randn(10,392)
    b1 = np.random.randn(392,1)
    b2 = np.random.rand(10,1)
    N = len(trainX)
    iter = int((N/BATCH_SIZE))
    

    for i in range(NUM_EPOCH):
        #每次epoch之前打乱数据集
        index = np.arange(N)
        np.random.shuffle(index)
        trainX = trainX[index]
        trainY = trainY[index]
        for j in range(iter+1):
            start = j * BATCH_SIZE
            X = trainX[start:start+BATCH_SIZE]
            Y = trainY[start:start+BATCH_SIZE]
            # X = np.reshape()
            loss,fpRes = fCE(X,Y,W1,b1,W2,b2)
            print("epoch:"+i+", batch:" + j + ",loss:"+ loss)
            delta = gradCE(X,Y,W1,b1,W2,b2,fpRes)
            
            # update parameters
            W2 = W2 - LEARNING_RATE * delta.deltaW2
            W1 = W1 - LEARNING_RATE * delta.deltaW1
            b1 = b1 - LEARNING_RATE * delta.deltab1
            b2 = b2 - LEARNING_RATE * delta.deltab2
        
        #在测试集上测试结果
        test_loss, test_fp_res = fCE(testX,testY,W1,b1,W2,b2)
        test_y_hat = test_fp_res.Y_hat
        test_predict_label = np.argmax(test_y_hat, axis=1)
        test_truth_label = np.argmax(testY, axis=1)
        test_accurate_count = 0

        for j in range(len(test_predict_label)):
            if test_predict_label[j] == test_truth_label[j]:
                test_accurate_count += 1

        test_accuracy = test_accurate_count / len(testY)
        print("epoch:"+i+", test_accuracy:" + test_accuracy + "---------------")
    ## your code here

    print("completed!")
    pass

def ReLU(X):
    return np.maximum(X,0)

def Softmax(X):
    size = len(X)
    Y_hat = np.zeros(len(X))
    total = 0
    for x in X:
        total += np.exp(x)

    for i,x in X:
        Y_hat[i] = np.exp(x)/total

    return Y_hat

def LOSS(n, Y, Y_hat):
    sum = 0
    for i in n:
        for k in NUM_OUTPUT:
            sum = sum + Y[k] * np.log(Y_hat[k])

    return -1/n * sum

if __name__ == "__main__":
    # Load data
    start_time = time.time()
    trainX, trainY = loadData("train")
    testX, testY = loadData("test")

    print("len(trainX): ", len(trainX))
    print("len(testX): ", len(testX))

    # # Train the network and report the accuracy on the training and test set.
    train(trainX, trainY, testX, testY)

NUM_HIDDEN:  50
LEARNING_RATE:  0.05
BATCH_SIZE:  64
NUM_EPOCH:  40
len(trainX):  10000
len(testX):  5000


ValueError: cannot reshape array of size 50176 into shape (784,1)