### Batch Gradient Descent:
A method for increasing the speed of training and rate of convergence.

The following neural network implements both batch gradient descent and dropout regularisation.

In [23]:
import numpy as np
import keras

# === Preparing the datasets ===
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
# Flatten the training images to an array of 784 pixels. Take 1000 sample datapoints
images, labels = (x_train[0 : 1000].reshape(1000, 28 * 28) / 255, y_train[0 : 1000])
one_hot_labels = np.zeros((len(labels), 10))
for i, label in enumerate(labels):
    one_hot_labels[i][label] = 1
labels = one_hot_labels
test_images = x_test.reshape(len(x_test), 28 * 28) / 255
test_labels = np.zeros((len(y_test), 10))
for i, label in enumerate(y_test):
    test_labels[i][label] = 1

# Relu functions
def relu(x):
    return (x > 0) * x

def reluDerivative(x):
    return (x > 0)

alpha, iterations = (0.001, 300)
pixels_per_image, num_labels, hidden_size = (784, 10, 100)

# Initialise weights (scaled from random weights in [0, 1] to random weights in [-0.1, 0.1])
np.random.seed(1)
weights_0_1 = 0.2 * np.random.random((pixels_per_image, hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, num_labels)) - 0.1

# Batch size of 100
batch_size = 100
for i in range(iterations):
    train_error, correct_count = (0.0, 0)
    for j in range(int(len(images) / batch_size)):
        # Getting the left and right slice bounds for the current batch
        batch_start = j * batch_size
        batch_end = (j + 1) * batch_size
        
        layer_0 = images[batch_start : batch_end]
        layer_1 = relu(np.dot(layer_0, weights_0_1))
        layer_1_dropout_mask = np.random.randint(2, size=layer_1.shape)
        layer_1 *= layer_1_dropout_mask
        layer_2 = np.dot(layer_1, weights_1_2)
        
        train_error += np.sum((layer_2 - labels[batch_start : batch_end]) ** 2)
        for k in range(batch_size):
            correct_count += int(np.argmax(layer_2[k : k + 1]) == np.argmax(labels[batch_start + k : batch_start + k + 1]))
            layer_2_delta = (labels[batch_start : batch_end] - layer_2) / batch_size
            layer_1_delta = np.dot(layer_2_delta, weights_1_2.T) * reluDerivative(layer_1)
            layer_1_delta *= layer_1_dropout_mask
            weights_1_2 += alpha * np.dot(layer_1.T, layer_2_delta)
            weights_0_1 += alpha * np.dot(layer_0.T, layer_1_delta)
    if i % 10 == 0:
        test_error, test_correct_count = (0.0, 0)
        for m in range(len(test_images)):
            layer_0 = test_images[m : m + 1]
            layer_1 = relu(np.dot(layer_0, weights_0_1))
            layer_2 = np.dot(layer_1, weights_1_2)
            test_correct_count += int(np.argmax(layer_2) == np.argmax(test_labels[m : m + 1]))
            test_error += np.sum((layer_2 - test_labels[m : m + 1]) ** 2)
        print("\r"+ \
              " Iteration: {}".format(i) + \
              " | Train Error: " + str(train_error / float(len(images))) + \
              " | Train Acc: " + str(correct_count / float(len(images))) + \
              " | Test Error: " + str(test_error / float(len(test_images)))[0:5] + \
              " | Test Acc: " + str(test_correct_count / float(len(test_images))))
        

 Iteration: 0 | Train Error: 1.0583443728359492 | Train Acc: 0.126 | Test Error: 0.936 | Test Acc: 0.3081
 Iteration: 10 | Train Error: 0.6012323748188507 | Train Acc: 0.672 | Test Error: 0.664 | Test Acc: 0.7214
 Iteration: 20 | Train Error: 0.5026413085978018 | Train Acc: 0.773 | Test Error: 0.640 | Test Acc: 0.7723
 Iteration: 30 | Train Error: 0.44342354131984413 | Train Acc: 0.814 | Test Error: 0.629 | Test Acc: 0.8
 Iteration: 40 | Train Error: 0.40893057260581783 | Train Acc: 0.827 | Test Error: 0.600 | Test Acc: 0.8194
 Iteration: 50 | Train Error: 0.37951504874263114 | Train Acc: 0.855 | Test Error: 0.620 | Test Acc: 0.8321
 Iteration: 60 | Train Error: 0.3536579963111288 | Train Acc: 0.863 | Test Error: 0.595 | Test Acc: 0.8435
 Iteration: 70 | Train Error: 0.33587070217264886 | Train Acc: 0.887 | Test Error: 0.602 | Test Acc: 0.8509
 Iteration: 80 | Train Error: 0.3186871719757355 | Train Acc: 0.884 | Test Error: 0.619 | Test Acc: 0.858
 Iteration: 90 | Train Error: 0.306079

With a batch_size of 100, the CPU is doing 100 vector dot products at once for each batch of 100 datapoints. This is faster than individually computing dot products. Alpha is also allowed to be larger because taking the average weight change over 100 samples generally settles on a consistent value.