In [1]:
import os
import urllib.request

import numpy as np
np.random.seed(0)

# Remote archive and the local file it is cached in.
url = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz"
mnist_path = "mnist.npz"

# Only hit the network when there is no local copy yet, so re-running the
# notebook does not download the archive again every time.
if not os.path.exists(mnist_path):
    # Download under a temporary name and rename afterwards: an interrupted
    # transfer then never leaves a truncated "mnist.npz" that would later be
    # mistaken for a valid cached copy.
    partial_path = mnist_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, mnist_path)

# The individual arrays are pulled out of this archive object by key in the
# next cell, so it is intentionally left open here.
mnist = np.load(mnist_path)

In [2]:
# Pull the four arrays out of the loaded archive, in the same order as the
# names on the left-hand side.
x_train, x_test, y_train, y_test = (
    mnist[key] for key in ("x_train", "x_test", "y_train", "y_test")
)

In [3]:
np.random.seed(0)

# layer_dims = [nx, h1, ... nL]
def initialize_parameters(layer_dims):
    """Build the initial weights and biases for every layer of the network.

    Args:
        layer_dims: layer widths ordered from input to output.

    Returns:
        dict with, for each layer k = 1 .. len(layer_dims) - 1:
          "W{k}": array of shape (layer_dims[k], layer_dims[k-1]) drawn from
                  np.random.randn and scaled by sqrt(2 / layer_dims[k-1])
                  (He initialisation);
          "b{k}": zeros of shape (layer_dims[k], 1).

    Draws from NumPy's global RNG, so results depend on the current seed.
    """
    params = {}
    # Pair each layer's input width with its output width; layers are 1-based.
    width_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for layer, (fan_in, fan_out) in enumerate(width_pairs, start=1):
        scale = np.sqrt(2. / fan_in)
        params[f"W{layer}"] = np.random.randn(fan_out, fan_in) * scale
        params[f"b{layer}"] = np.zeros((fan_out, 1))
    return params

In [4]:
np.random.seed(0)
# Activation functions
def relu(Z):
    """Element-wise rectifier: the larger of 0 and each entry of Z."""
    rectified = np.maximum(0, Z)
    return rectified

def softmax(Z):
    """Softmax taken along axis 0, so every column of the result sums to 1.

    The per-column maximum is subtracted before exponentiating; this leaves
    the result unchanged mathematically but keeps np.exp from overflowing on
    large inputs.
    """
    column_max = np.max(Z, axis=0, keepdims=True)
    exps = np.exp(Z - column_max)
    normaliser = np.sum(exps, axis=0, keepdims=True)
    return exps / normaliser

In [5]:
np.random.seed(0)
# Forward pass
def forward_propagation(A_prev, W, b, func):
    """Compute one layer's pre-activation and activation.

    Args:
        A_prev: activations feeding this layer (second operand of the dot
            product with W).
        W: weight matrix of the layer.
        b: bias, added to the product by broadcasting.
        func: name of the activation to apply, "relu" or "softmax".

    Returns:
        (A, Z) where Z = np.dot(W, A_prev) + b and A is the activation of Z.

    Raises:
        ValueError: if func is not one of the supported activation names.
            (Previously an unknown name fell through both checks and failed
            with an UnboundLocalError on the return statement.)
    """
    Z = np.dot(W, A_prev) + b
    if func == "relu":
        A = relu(Z)
    elif func == "softmax":
        A = softmax(Z)
    else:
        raise ValueError(
            f'unknown activation {func!r}; expected "relu" or "softmax"'
        )
    return A, Z

In [6]:
np.random.seed(0)
def lost(Y, Y_hat):
    """Average cross-entropy between targets Y and predictions Y_hat.

    Both arrays are indexed (class, sample); the sum over all entries of
    Y * log(Y_hat) is divided by the number of columns of Y. A small constant
    (1e-8) is added inside the log so a predicted probability of exactly 0
    does not produce -inf.
    """
    n_samples = Y.shape[1]
    log_probs = np.log(Y_hat + 1e-8)
    total = np.sum(Y * log_probs)
    return -1/n_samples * total

In [7]:
np.random.seed(0)
# Backward pass
def backward_propagation(dZ,A_prev,W,Z_prev, m):
    """Back-propagate the gradient through one layer.

    Args:
        dZ: gradient of the loss w.r.t. this layer's pre-activation.
        A_prev: activations that were fed into this layer.
        W: this layer's weight matrix.
        Z_prev: pre-activation of the previous layer, or None when there is
            no previous layer to propagate into.
        m: divisor applied to the summed gradients (the caller passes the
            number of samples in the batch).

    Returns:
        (dZ_prev, dW, db). dZ_prev is None when Z_prev is None; otherwise it
        is W.T @ dZ masked by (Z_prev > 0), i.e. the ReLU derivative.
    """
    scale = 1/m
    dW = scale * np.dot(dZ, A_prev.T)
    db = scale * np.sum(dZ, axis=1, keepdims=True)
    # Only propagate further when there is an earlier layer to receive it.
    dZ_prev = None if Z_prev is None else np.dot(W.T, dZ) * (Z_prev > 0)
    return dZ_prev, dW, db

In [8]:
np.random.seed(0)
def update_parameters(dW,db, alpha, W, b):
    """Apply one gradient-descent step to W and b in place.

    Each parameter has alpha times its gradient subtracted from it. The
    arrays passed as W and b are modified directly; nothing is returned, so
    callers must hold on to their own references to see the update.
    """
    for param, grad in ((W, dW), (b, db)):
        # Augmented assignment mutates the caller's array rather than
        # creating a new one.
        param -= alpha * grad

In [9]:
np.random.seed(0)
def train_model(n, X, Y, learning_rate, parameters):
    """Run one forward/backward pass on a batch and update the parameters.

    Args:
        n: number of entries in the layer-size list (the caller passes
            len(layer_dims)), so the network has n - 1 weight layers.
        X: batch inputs, one column per sample.
        Y: targets for the batch, one column per sample.
        learning_rate: step size handed to update_parameters.
        parameters: dict of "W{k}" / "b{k}" arrays; updated in place.

    Returns:
        The softmax output for the batch, computed before the parameters
        were updated.
    """
    out_layer = n - 1
    cache = {"A0": X}

    # Layers 1 .. n-2 are hidden layers with ReLU (n - 2 of them).
    for layer in range(1, out_layer):
        activation, pre_activation = forward_propagation(
            cache[f"A{layer-1}"], parameters[f"W{layer}"], parameters[f"b{layer}"], "relu"
        )
        cache[f"A{layer}"] = activation
        cache[f"Z{layer}"] = pre_activation

    # Layer n-1 is the softmax output layer.
    activation, pre_activation = forward_propagation(
        cache[f"A{out_layer - 1}"], parameters[f"W{out_layer}"], parameters[f"b{out_layer}"], "softmax"
    )
    cache[f"A{out_layer}"] = activation
    cache[f"Z{out_layer}"] = pre_activation

    # For softmax combined with cross-entropy: dL/dZ = Y_hat - Y.
    dZ = cache[f"A{out_layer}"] - Y
    batch_size = Y.shape[1]

    # Walk the layers from output to input, collecting every gradient before
    # any parameter is touched.
    grads = {}
    for layer in reversed(range(1, n)):
        # The first layer has nothing before it to propagate into.
        Z_prev = cache[f"Z{layer-1}"] if layer > 1 else None
        dZ, dW, db = backward_propagation(
            dZ, cache[f"A{layer-1}"], parameters[f"W{layer}"], Z_prev, batch_size
        )
        grads[f"dW{layer}"] = dW
        grads[f"db{layer}"] = db

    for layer in range(1, n):
        update_parameters(
            grads[f"dW{layer}"], grads[f"db{layer}"], learning_rate,
            parameters[f"W{layer}"], parameters[f"b{layer}"],
        )
    return cache[f"A{out_layer}"]

In [10]:
np.random.seed(0)
def test_model(n, X, parameters):
    """Forward pass only: return the network's softmax output for X.

    Args:
        n: number of entries in the layer-size list, so the network has
            n - 1 weight layers.
        X: inputs, one column per sample.
        parameters: dict of "W{k}" / "b{k}" arrays; only read here.

    Returns:
        Output of the final (softmax) layer.
    """
    activation = X
    # Layers 1 .. n-2 are hidden layers with ReLU (n - 2 of them).
    for layer in range(1, n - 1):
        activation, _ = forward_propagation(
            activation, parameters[f"W{layer}"], parameters[f"b{layer}"], "relu"
        )

    # Layer n-1 is the softmax output layer.
    Y_hat, _ = forward_propagation(
        activation, parameters[f"W{n - 1}"], parameters[f"b{n - 1}"], "softmax"
    )
    return Y_hat

In [11]:
def get_accuracy(Y_hat, Y):
    """Percentage of samples whose predicted class matches the target class.

    Both arguments are indexed (class, sample); the class of a column is the
    row index of its largest entry.
    """
    is_correct = np.argmax(Y_hat, axis=0) == np.argmax(Y, axis=0)
    return np.mean(is_correct) * 100

In [12]:
np.random.seed(0)
def _one_hot(labels, n_classes):
    """Return an (n_classes, labels.size) matrix with a 1 in each sample's label row."""
    encoded = np.zeros((n_classes, labels.size))
    encoded[labels.flatten(), np.arange(labels.size)] = 1
    return encoded


def nn_engine(x_train, y_train, x_test, y_test, iterations, learning_rate, layer_dims, batch_size=32):
    """Train the network with mini-batch gradient descent and report test metrics.

    Args:
        x_train, x_test: input arrays with samples along the first axis; each
            sample is flattened into one column.
        y_train, y_test: integer class labels, used as row indices for the
            one-hot targets (so they must be smaller than layer_dims[-1]).
        iterations: number of full passes over the training set.
        learning_rate: step size for every parameter update.
        layer_dims: layer widths from input to output; layer_dims[-1] is the
            number of classes.
        batch_size: number of samples per update step. Defaults to 32, the
            value that used to be hard-coded. The last batch of an epoch may
            be smaller.

    Returns:
        The trained parameter dict ("W{k}" / "b{k}").

    Raises:
        ValueError: if batch_size is not positive.

    Prints the loss and accuracy on the test set once training has finished.
    Uses NumPy's global RNG for the initial weights and the per-epoch shuffle,
    so seed it beforehand for reproducible results.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Initialise first so the RNG is consumed in the same order as before
    # (weights, then one permutation per epoch).
    parameters = initialize_parameters(layer_dims)
    n_layers = len(layer_dims)
    n_classes = layer_dims[-1]

    # Scale inputs by 1/255 (assumes 8-bit intensities) and flatten every
    # sample into a column: X has shape (features, samples).
    X = (x_train / 255.).reshape(x_train.shape[0], -1).T
    X_test = (x_test / 255.).reshape(x_test.shape[0], -1).T
    Y = _one_hot(y_train, n_classes)
    Y_test = _one_hot(y_test, n_classes)

    m = X.shape[1]
    for _ in range(iterations):
        # Reshuffle every epoch so the batches differ between passes.
        permutation = np.random.permutation(m)
        X_shuffled = X[:, permutation]
        Y_shuffled = Y[:, permutation]
        for start in range(0, m, batch_size):
            # Slicing clamps at the array end, so the last batch is simply shorter.
            stop = start + batch_size
            train_model(
                n_layers,
                X_shuffled[:, start:stop],
                Y_shuffled[:, start:stop],
                learning_rate,
                parameters,
            )

    # Evaluate once on the held-out data.
    result = test_model(n_layers, X_test, parameters)
    print(f"Test: loss = {lost(Y_test, result)}; accuracy = {get_accuracy(result, Y_test)}")
    return parameters

In [13]:
# Reseed right before training: nn_engine draws the initial weights and the
# per-epoch shuffles from NumPy's global RNG.
np.random.seed(0)
model_params = nn_engine(
    x_train,
    y_train,
    x_test,
    y_test,
    iterations=20,
    learning_rate=0.01,
    layer_dims=[784, 512, 256, 128, 10],
)

Test: loss = 0.0769989685406087; accuracy = 97.86


In [14]:
# Debugging.

# above 0.05
# above 0.1 [784, 392, 10] 0.2 got 90%
# changing layer = [784, 500, 10] 0.2
# above 0.2
# above 0.4
# above 0.8 had 93%
# above 1.6 (it had 94.98 test accuracy)
# tried 3.2: saw oscillation in the 13 - 22 range; this is too high; need lower than 3.2 because it gave 12% accuracy
# 1.6 < a < 3.2; trying 2.4 (32-19-33-27), need to go lower; accuracy = 30%
# 1.6 < a < 2.4; trying 2 (16-49-35-34-34-35), lower than 2 needed; accuracy = 35%
# 1.6 < a < 2; trying 1.8 got 89%; maybe closer to 1.6 is better
# 1.6 < a < 1.8; trying 1.7 (94.39); 1.6 was better
# maybe 0.8 < a < 1.6; trying 1.2 (94.87), trying 1.4 (95.12), trying 1.5 ()

# using batches:
# nn_engine(x_train, y_train, x_test, y_test, 20, 0.01, [784, 256, 128, 64, 32, 16, 10]) -> Test: loss = 0.09361683529738042; accuracy = 97.6
# nn_engine(x_train, y_train, x_test, y_test, 20, 0.01, [784, 512, 256, 128, 10]) -> Test: loss = 0.0769989685406087; accuracy = 97.86