In [20]:
import numpy as np
from neural_network import MLP
from typing import Sequence, List, Optional

In [69]:
from keras.datasets import mnist

(train_X, train_y), (test_X, test_y) = mnist.load_data()

print(f'X_train: {train_X.shape}')
print(f'Y_train: {train_y.shape}')
print(f'X_test:  {test_X.shape}')
print(f'Y_test:  {test_y.shape}')


X_train: (60000, 28, 28)
Y_train: (60000,)
X_test:  (10000, 28, 28)
Y_test:  (10000,)


In [63]:
def logistic(x: np.ndarray) -> np.ndarray: # (B, C) -> (B -> C)
    """
    Applies the logistic function to an batched sums
    :param x: an input array
    :return: An array of the same size
    """
    return 1.0 / (1.0 + np.exp(-1 * x))


def logistic_prime(x: np.ndarray) -> np.ndarray: # (B, C) -> (B ->C)
    """
    Applies the derivative of the logistic function to batched sums
    :param x: an input array
    :return: An array of the same size
    """
    normal_logistic = 1.0 / (1.0 + np.exp(-1 * x))  # efficiency
    return normal_logistic * (1 - normal_logistic)


class MLP:
    def __init__(self, layer_sizes: Sequence[int]):
        self.layer_sizes = layer_sizes
        self.biases = [np.random.randn(y) for y in layer_sizes[1:]]
        # x is the size of the output from the previous layer, y is the size of the layer the weight outputs are going to
        self.weights = [np.random.randn(y, x) for x, y in zip(layer_sizes[:-1], layer_sizes[1:])]

    def feedforward(self, x: np.ndarray) -> np.ndarray: # (B, input_layer_size) -> (B, output_layer_size)
        """
        Performs a feedforward run of the network and returns the output
        :param x: An array of equal size to the first layer in the MLP, with shape (B, I)
        :return: An array of equal size to the last layer in the MLP, with shape (B, O)
        """
        activation = x # (B, I)
        for weight, bias in zip(self.weights, self.biases):
            activation = logistic((activation @ weight.T) + bias) # (B, previous layer size) -> (B, current layer size)
        return activation

    def sgd(self, train_x: List[np.ndarray], train_y: List[np.ndarray], test_x: List[np.ndarray],
            test_y: List[np.ndarray], epochs: int, batch_size: int, learning_rate: float = 0.2) -> None:
        """
        Performs stochastic gradient descent using training data, while
        :param train_x: Training inputs
        :param train_y: Training target
        :param test_x: Testing inputs
        :param test_y: Testing targets
        :param epochs: The number of training runs to perform
        :param batch_size: Size of each training batch before update
        :param learning_rate: Learning rate to use when updating the model's parameters
        """
        for i in range(epochs):
            # training
            batches = ((train_x[j: j + batch_size, :], train_y[j: j + batch_size, :])
                       for j in range(0, len(train_x), batch_size))
            for x, y in batches:
                delta_weights, delta_biases = self.backprop(x, y)
                # performing the updates
                self.weights = [weight - (learning_rate / batch_size) * weight_update.sum(0)
                                for weight, weight_update in zip(self.weights, delta_weights)]
                self.biases = [bias - (learning_rate / batch_size) * bias_update.sum(0)
                               for bias, bias_update in zip(self.biases, delta_biases)]
            # running tests to track performance based on accuracy
            prediction = self.feedforward(test_x) # (B, o), for predictions and y
            correct = np.sum(np.argmax(prediction, 1) == np.argmax(test_y, 1))
            print(f"Accuracy for epoch {i}: {correct / len(test_x)}")

    def backprop(self, x: np.ndarray, y: np.ndarray) -> (List[np.ndarray], List[np.ndarray]):
        """
        Runs backprop for a single example through the network
        :param x: An input with shape (B, I), where I is the size of the first layer
        :param y: A target output of the input with shape (B, O), where O is the size of the last layer
        :return: The weight and bias rates of change with respect to the cost averaged over the full batch
        """
        activations = [x] # list[(B, I)] to start each entry will be (B, l), where l is the size of the current layer
        zs = []
        delta_weights = [] # list of (B, l, l-1)
        delta_biases = [] # list of (B, l)
        # feedforward to collect activations as the neuron sums (zs)
        for weight, bias in zip(self.weights, self.biases): # weight (l, l-1), bias (l) where l is the size of the current layer
            # for the first layer I = l-1, so for each layer we do (B, l-1) @ (l, l-1).T = (B, l)
            zs.append(activations[-1] @ weight.T + bias)
            activations.append(logistic(zs[-1]))
        # backprop step
        delta_a = activations[-1] - y  # taking the derivative of the cost for each activation
        delta_z = delta_a * logistic_prime(activations[-1])  # back propagating from the last output to the last sum
        delta_biases.append(delta_z)  # the differentials for biases are equal to the delta for the sum
        # for weights we need (B, L, 1) (B, 1, L-1) @ (B, L, L-1)
        delta_weights.append(delta_z[:, :, np.newaxis] @ activations[-2][:, np.newaxis, :])  # calculating differentials for weights
        # continuing to backpropagation through each layer
        for i in range(2, len(self.weights) + 1):
            delta_a = delta_z @ self.weights[-i + 1] # (B, l + 1) @ (l+1, l).T = (B, l)
            delta_z = delta_a * logistic_prime(activations[-i])  # backpropagation from the activation to the sum
            delta_biases.append(delta_z)
            delta_weights.append(delta_z[:, :, np.newaxis] @ activations[-i - 1][:, np.newaxis, :]) # (B, l, 1) @ (B, 1, l-1) = (B, l, l-1)
        delta_weights.reverse()
        delta_biases.reverse()
        return delta_weights, delta_biases


In [66]:
my_nn = MLP([28*28, 30, 30, 10])

In [24]:
train_x_flattened = train_X.reshape(-1, 28*28)
test_x_flattened = test_X.reshape(-1, 28*28)

In [25]:
train_y_vectorized = np.eye(10)[train_y]
test_y_vectorized = np.eye(10)[test_y]

In [67]:
my_nn.sgd(train_x_flattened[:10000], train_y_vectorized[:10000], test_x_flattened, test_y_vectorized, epochs=20, batch_size=100)

  import sys


Accuracy for epoch 0: 0.347
Accuracy for epoch 1: 0.424
Accuracy for epoch 2: 0.4757
Accuracy for epoch 3: 0.4714
Accuracy for epoch 4: 0.518
Accuracy for epoch 5: 0.5476
Accuracy for epoch 6: 0.553
Accuracy for epoch 7: 0.5791
Accuracy for epoch 8: 0.5952
Accuracy for epoch 9: 0.6019
Accuracy for epoch 10: 0.6045
Accuracy for epoch 11: 0.6142
Accuracy for epoch 12: 0.6201
Accuracy for epoch 13: 0.6377
Accuracy for epoch 14: 0.6407
Accuracy for epoch 15: 0.6545
Accuracy for epoch 16: 0.6564
Accuracy for epoch 17: 0.6625
Accuracy for epoch 18: 0.6581
Accuracy for epoch 19: 0.6617
