Lets go back to the mnist dataset and try it with a three layer network

In [1]:
import sys, numpy as np
from keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()

images, labels = (x_train[0:1000].reshape(1000, 28*28) / 255, y_train[0:1000])

one_hot_labels = np.zeros((len(labels), 10))

for i,l in enumerate(labels):
    one_hot_labels[i][l] = 1
labels = one_hot_labels

test_images = x_test.reshape(len(x_test), 28*28) / 255
test_labels = np.zeros((len(y_test), 10))
for i,l in enumerate(y_test):
    test_labels[i][l] = 1

Using TensorFlow backend.


In [2]:
np.random.seed(1)
relu = lambda x: (x>=0) * x
relu2deriv = lambda x: x>=0
alpha, iterations, hidden_size, pixels_per_image, num_labels = (0.005, 35, 40, 784, 10)

weights_0_1 = 0.2*np.random.random((pixels_per_image, hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size, num_labels)) - 0.1

In [3]:
# Training iterations through the entire dataset
for j in range(iterations):
    error, correct_cnt = (0.0, 0)
    
    for i in range(len(images)):
        layer_0 = images[i:i+1]
        layer_1 = relu(layer_0.dot(weights_0_1))
        dropout_mask = np.random.randint(2,size=layer_1.shape)
        layer_1 *= dropout_mask * 2 # Multiply by 1/%turnoff 
        layer_2 = layer_1.dot(weights_1_2)
        
        error += np.sum((labels[i:i+1] - layer_2) ** 2)
        correct_cnt += int(np.argmax(layer_2) == np.argmax(labels[i:i+1]))
        
        layer_2_delta = labels[i:i+1] - layer_2
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)
        
        layer_1_delta *= dropout_mask
        
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)
    
    sys.stdout.write("\r" + \
                    " I: " + str(j) + \
                    " Error: " + str(error/float(len(images)))[0:5] + \
                    " Correct: " + str(correct_cnt/float(len(images))))

 I: 34 Error: 0.480 Correct: 0.716

In [4]:
error, correct_cnt = (0.0, 0)

for i in range(len(test_images)):
    layer_0 = test_images[i:i+1]
    layer_1 = relu(layer_0.dot(weights_0_1))
    layer_2 = layer_1.dot(weights_1_2)
    
    error += np.sum((test_labels[i:i+1] - layer_2) ** 2)
    correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels[i:i+1]))

print(" Test-Error: ", str(error/float(len(test_images)))[0:5], " Test-Acc: ", correct_cnt/float(len(test_images)))


 Test-Error:  0.445  Test-Acc:  0.795


The simplest regularization is early stopping. Regularixation is a subset of methods used to encourage generalization in learned models, often by incresing the difficulty for a model to learn the fine-grained details of the training data.

Its helps the neural network learn the signal and ignore the noise of a dataset.

Validation set is a set of data that isnt inside of the training set, or the test set, used to validate the accuracy of the network as it trains. 

Another regularization method is dropout. This method works by turning off neurons at random during training, i.e setting them to 0. This might be one of the most generally accepted go-to regularization methods. On a simple level, this works because smaller networks can capture less detail and thus captures less noise and more expressive details. 

At the same time, smaller networks can overfit to a dataset. But its highly unlikely that two networks will overfit to the same noise and the average between every network will be more generalized. 

Neural networks, even though they're randomly generated, still start by learning the biggest, most broadly sweeping features before learning much about the noise. 

In [5]:
dropout_mask = np.random.randint(2,size=layer_1.shape)
print(dropout_mask)

[[0 1 1 1 1 0 0 1 1 0 1 0 1 1 0 0 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 0 0 1 0 1
  0 0 1 1]]


Lets rewrite the neural network with batch gradient descent

In [6]:
import numpy as np
np.random.seed(1)

def relu(x):
    return (x >= 0) * x

def relu2deriv(output):
    return output >= 0

batch_size = 100
alpha, iterations = (0.001, 300)
pixels_per_image, num_labels, hidden_size = (784, 10, 100)

weights_0_1 = 0.2*np.random.random((pixels_per_image, hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size, num_labels)) - 0.1

for j in range(iterations):
    error, correct_cnt = (0.0, 0)
    for i in range(len(images) // batch_size):
        batch_start, batch_end = (i * batch_size, (i+1) * batch_size)
        
        layer_0 = images[batch_start:batch_end]
        layer_1 = layer_0.dot(weights_0_1)
        dropout_mask = np.random.randint(2, size=layer_1.shape)
        layer_1 *= dropout_mask * 2
        layer_2 = layer_1.dot(weights_1_2)
        
        error += np.sum((labels[batch_start,batch_end] - layer_2) ** 2)
        
        for k in range(batch_size):
            correct_cnt += int(np.argmax(layer_2[k,k+1]) == np.argmax(labels[batch_start+k:batch_start+k+1]))
            
            layer_2_delta = (labels[batch_start:batch_end] - layer_2) / batch_size
            layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)
            layer_1_delta *= dropout_mask
            
            weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
            weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)
            
    if j%10 == 0:
        test_error = 0.0
        test_correct_cnt = 0
        
        for i in range(len(test_images)):
            layer_0 = test_images[i:i+1]
            layer_1 = relu(layer_0.dot(weights_0_1))
            layer_2 = layer_1.dot(weights_1_2)
            
            test_error += np.sum((test_labels[i:i+1] - layer_2) ** 2)
            test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels[i:i+1]))
            
        print("Err: ", error, " Cnt: ", correct_cnt, " Test Err: ", test_error, " Test Cnt: ", test_correct_cnt)

IndexError: index 100 is out of bounds for axis 1 with size 10