In [25]:
import gzip
import sys
import time
import numpy as np
from sklearn.utils import shuffle

In [26]:
def get_data(sample_size, pathX, pathY):
    """Load the first ``sample_size`` images and labels from gzip-compressed files.

    The image file is expected to hold a 16-byte header followed by
    28 * 28 unsigned bytes per image; the label file an 8-byte header followed
    by one unsigned byte per label (this matches the MNIST ``idx3``/``idx1``
    files the notebook passes in).

    Parameters
    ----------
    sample_size : int
        Number of examples to read from the start of each file.
    pathX : str
        Path to the gzip-compressed image file.
    pathY : str
        Path to the gzip-compressed label file.

    Returns
    -------
    list
        ``[X, Y]`` where ``X`` is a float32 array of shape
        ``(sample_size, 784)`` with pixel values rescaled from 0..255 to
        -1..1, and ``Y`` is a float32 one-hot array of shape
        ``(sample_size, 10)``.

    Raises
    ------
    ValueError
        If either file contains fewer than ``sample_size`` examples.
    IndexError
        If a label byte is 10 or larger.
    """
    image_size = 28 * 28
    num_classes = 10
    image_header_length = 16
    label_header_length = 8

    # Context managers guarantee both gzip handles are closed, even on error.
    with gzip.open(pathX, 'rb') as image_file:
        image_file.read(image_header_length)  # skip the header bytes
        raw_pixels = image_file.read(image_size * sample_size)

    with gzip.open(pathY, 'rb') as label_file:
        label_file.read(label_header_length)  # skip the header bytes
        raw_labels = label_file.read(sample_size)

    if len(raw_pixels) != image_size * sample_size:
        raise ValueError(
            f"{pathX} holds fewer than {sample_size} images "
            f"(read {len(raw_pixels)} pixel bytes)"
        )
    if len(raw_labels) != sample_size:
        raise ValueError(
            f"{pathY} holds fewer than {sample_size} labels "
            f"(read {len(raw_labels)} label bytes)"
        )

    # frombuffer yields a read-only view; astype makes a writable float32 copy.
    X = np.frombuffer(raw_pixels, dtype=np.uint8).astype(np.float32)
    X = X.reshape(sample_size, image_size)

    labels = np.frombuffer(raw_labels, dtype=np.uint8)

    # One-hot encode every label in a single indexed assignment.
    Y = np.zeros([sample_size, num_classes], dtype='f')
    Y[np.arange(sample_size), labels] = 1.0

    # Map 0..255 to -1..1 (127.5 is half of 255); stays float32 throughout.
    return [X / 127.5 - 1.0, Y]

In [27]:
class Network:
    """Fully connected feed-forward network trained by mini-batch gradient descent.

    Hidden layers use ``activation`` (tanh); the output layer always uses the
    sigmoid. Training minimises a quadratic cost, so the output error term is
    ``(a - y) * sigmoid'(z)``.

    All data is laid out with one sample per column: inputs have shape
    ``(layers[0], m)`` and targets ``(layers[-1], m)``.
    """

    def __init__(self, layers):
        """Create weights and biases drawn uniformly from [-1, 1).

        Parameters
        ----------
        layers : sequence of int
            Layer widths, input layer first and output layer last.
        """
        self.layers = layers
        # self.w[k] has shape (layers[k + 1], layers[k]).
        self.w = []
        # self.b[k] is a column vector of shape (layers[k + 1], 1).
        self.b = []
        for fan_in, fan_out in zip(layers[:-1], layers[1:]):
            self.w.append(np.random.uniform(-1.0, 1.0, size=(fan_out, fan_in)))
            self.b.append(np.random.uniform(-1.0, 1.0, size=(fan_out, 1)))

    def sigmoid(self, data):
        """Return the logistic function 1 / (1 + exp(-data)), element-wise.

        Only exp of a non-positive number is ever evaluated, so large
        magnitudes of either sign cannot overflow.
        """
        decay = np.exp(-np.abs(data))
        # data >= 0: 1 / (1 + e^-x); data < 0: e^x / (1 + e^x) -- same value.
        return np.where(data >= 0, 1.0, decay) / (1.0 + decay)

    def sigmoid_derivative(self, data):
        """Return the derivative of the sigmoid evaluated at ``data``."""
        value = self.sigmoid(data)
        return value * (1 - value)

    def tanh(self, data):
        """Return the element-wise hyperbolic tangent of ``data``."""
        return np.tanh(data)

    def tanh_derivative(self, data):
        """Return the derivative of tanh evaluated at ``data``."""
        return 1 - np.tanh(data) ** 2

    def activation(self, data):
        """Hidden-layer activation function (tanh)."""
        return self.tanh(data)

    def activation_derivative(self, data):
        """Derivative of the hidden-layer activation function."""
        return self.tanh_derivative(data)

    def _feed_layer(self, layer, inputs):
        """Return ``(z, a)``: pre-activation and activation of layer ``layer``."""
        z = np.dot(self.w[layer], inputs) + self.b[layer].reshape(-1, 1)
        is_output_layer = layer == len(self.layers) - 2
        a = self.sigmoid(z) if is_output_layer else self.activation(z)
        return z, a

    def backPropFast(self, train_X, train_Y, eta, lambd):
        """Run one gradient-descent step on a single batch, updating in place.

        Parameters
        ----------
        train_X : ndarray, shape (layers[0], m)
            Batch inputs, one sample per column.
        train_Y : ndarray, shape (layers[-1], m)
            Batch targets, one sample per column.
        eta : float
            Learning rate.
        lambd : float
            L2 regularisation strength. The penalty ``lambd * w / m`` is added
            to the summed gradient and the total is divided by ``m`` again in
            the update, so the effective weight decay per step is
            ``eta * lambd / m**2`` with ``m`` the batch size.
        """
        m = len(train_X[0])
        if m == 0:
            # Nothing to learn from, and dividing by m would produce NaNs.
            return

        num_weight_layers = len(self.layers) - 1

        # Forward pass: activations[k] feeds weight layer k, zs[k] is its output.
        activations = [train_X]
        zs = []
        for layer in range(num_weight_layers):
            z, a = self._feed_layer(layer, activations[-1])
            zs.append(z)
            activations.append(a)

        # Gradients are assigned (never accumulated into float32 buffers) so
        # every layer keeps the full precision of the computation.
        gradients_w = [None] * num_weight_layers
        gradients_b = [None] * num_weight_layers

        delta_l = (activations[-1] - train_Y) * self.sigmoid_derivative(zs[-1])
        gradients_b[-1] = delta_l.sum(axis=1, keepdims=True)
        gradients_w[-1] = np.dot(delta_l, np.transpose(activations[-2])) + lambd * self.w[-1] / float(m)

        # Propagate the error backwards through the hidden layers.
        for layer in range(num_weight_layers - 2, -1, -1):
            delta_l = np.dot(np.transpose(self.w[layer + 1]), delta_l) * self.activation_derivative(zs[layer])
            gradients_b[layer] = delta_l.sum(axis=1, keepdims=True)
            gradients_w[layer] = np.dot(delta_l, np.transpose(activations[layer])) + lambd * self.w[layer] / float(m)

        # All gradients were computed from the old weights; apply them now.
        for layer in range(num_weight_layers):
            self.w[layer] -= eta * gradients_w[layer] / float(m)
            self.b[layer] -= eta * gradients_b[layer] / float(m)

    def train(self, train_X, train_Y, eta, batch_size, lambd=1.0):
        """Run one pass over the data in consecutive mini-batches.

        Only full batches are used: when the number of samples is not a
        multiple of ``batch_size`` the trailing samples are skipped for this
        pass.

        Raises
        ------
        ValueError
            If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        m = len(train_X[0])
        for batch in range(m // batch_size):
            start = batch * batch_size
            stop = start + batch_size
            self.backPropFast(train_X[:, start:stop], train_Y[:, start:stop], eta, lambd)

    def getAccuracy(self, X, Y):
        """Return the fraction of columns of ``X`` classified correctly.

        The predicted class is the index of the largest output unit; the true
        class is the index of the largest entry in the matching column of ``Y``.
        """
        outputs = X
        for layer in range(len(self.layers) - 1):
            _, outputs = self._feed_layer(layer, outputs)
        predictions = np.argmax(outputs, 0)
        labels = np.argmax(Y, 0)
        return (predictions == labels).sum() / float(len(labels))

In [28]:
# Number of examples taken from the start of each split.
train_size, test_size = 1000, 10000

# Training arrays stay one-sample-per-row so they can be shuffled row-wise;
# they are transposed at the point of use.
train_X, train_Y = get_data(
    train_size,
    'MNIST_Dataset/train-images-idx3-ubyte.gz',
    'MNIST_Dataset/train-labels-idx1-ubyte.gz',
)

# Test arrays are never shuffled, so transpose them once to the
# one-sample-per-column layout the network consumes.
test_X, test_Y = map(
    np.transpose,
    get_data(
        test_size,
        'MNIST_Dataset/t10k-images-idx3-ubyte.gz',
        'MNIST_Dataset/t10k-labels-idx1-ubyte.gz',
    ),
)

In [53]:
# Seed NumPy's global random generator before anything stochastic runs so a
# fresh Restart & Run All reproduces the same initial weights. sklearn's
# shuffle, called without random_state, is documented to draw from this same
# global generator, so the per-epoch ordering becomes repeatable as well.
SEED = 0
np.random.seed(SEED)

# Layer widths: 784 inputs (28 * 28 pixels), two hidden layers, 10 outputs.
layers = [784, 256, 64, 10]

# Optimisation settings.
batch_size = 100  # samples per gradient step
eta = 0.5         # learning rate
epochs = 100      # passes over the training set
lambd = 25.0      # L2 regularisation strength

network = Network(layers)

In [54]:
for epoch in range(epochs):
    print("\nEpoch:", epoch + 1)

    # Reorder the samples (rows) so each epoch forms different mini-batches.
    train_X, train_Y = shuffle(train_X, train_Y)

    # Timestamp for the optional per-epoch timing report below.
    start_time = time.perf_counter()

    # The network expects one sample per column, hence the transposes.
    network.train(train_X.T, train_Y.T, eta, batch_size, lambd)

    print(f"Train Accuracy = {network.getAccuracy(train_X.T, train_Y.T):0.3f}")
    print(f"Test Accuracy = {network.getAccuracy(test_X, test_Y):0.3f}")
    # Optional timing report (disabled):
    # print(f"Time taken in this epoch = {time.perf_counter() - start_time:0.1f} seconds")

# Optional diagnostic (disabled): mean absolute weight across all layers.
# print("average weight:", sum([np.sum(np.abs(network.w[layer])) for layer in range(len(layers) - 1)]) / sum([np.size(network.w[layer]) for layer in range(len(layers) - 1)]))


Epoch: 1
Train Accuracy = 0.220
Test Accuracy = 0.157

Epoch: 2
Train Accuracy = 0.295
Test Accuracy = 0.215

Epoch: 3
Train Accuracy = 0.416
Test Accuracy = 0.296

Epoch: 4
Train Accuracy = 0.540
Test Accuracy = 0.441

Epoch: 5
Train Accuracy = 0.626
Test Accuracy = 0.534

Epoch: 6
Train Accuracy = 0.698
Test Accuracy = 0.595

Epoch: 7
Train Accuracy = 0.686
Test Accuracy = 0.626

Epoch: 8
Train Accuracy = 0.669
Test Accuracy = 0.606

Epoch: 9
Train Accuracy = 0.716
Test Accuracy = 0.654

Epoch: 10
Train Accuracy = 0.671
Test Accuracy = 0.644

Epoch: 11
Train Accuracy = 0.657
Test Accuracy = 0.596

Epoch: 12
Train Accuracy = 0.694
Test Accuracy = 0.644

Epoch: 13
Train Accuracy = 0.549
Test Accuracy = 0.490

Epoch: 14
Train Accuracy = 0.582
Test Accuracy = 0.545

Epoch: 15
Train Accuracy = 0.520
Test Accuracy = 0.461

Epoch: 16
Train Accuracy = 0.502
Test Accuracy = 0.465

Epoch: 17
Train Accuracy = 0.438
Test Accuracy = 0.405

Epoch: 18
Train Accuracy = 0.485
Test Accuracy = 0.446

