# Neural Networks

In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat

data = loadmat('machine-learning-ex4/ex4/ex4data1.mat')
data

{'X': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 '__globals__': [],
 '__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16 13:09:09 2011',
 '__version__': '1.0',
 'y': array([[10],
        [10],
        [10],
        ...,
        [ 9],
        [ 9],
        [ 9]], dtype=uint8)}

In [90]:
X = data['X']
y = data['y']

X.shape, y.shape

((5000, 400), (5000, 1))

Turn class label n(out of k classes) into a vector of length k where index n is "hot"(1) while the rest are zero, 
One-hot encode y labels

In [91]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)
y_onehot = encoder.fit_transform(y)
y_onehot.shape, y[0], y_onehot[0, :]

((5000, 10),
 array([10], dtype=uint8),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]))

Neural network with input layer matching the size of our instance data(400 + 1), a hidden layer with 25(+1) units and an output layer with 10 units corresponding to each class label.

In [92]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [93]:
# computes the gradient of the sigmoid function
def sigmoidGradient(z):
    return np.multiply(sigmoid(z), (1 - sigmoid(z)))

In [94]:
def nnCostFunction(nn_params, 
                  input_layer_size, 
                  hidden_layer_size,
                  num_labels,
                  X, y, lam):
    m = X.shape[0]
    X = np.matrix(X) # (m, n+1)
    y = np.matrix(y) # (m, k)
    
    # unroll and reshape nn_params into theta1 and theta2
    theta1 = np.matrix(np.reshape(nn_params[:hidden_layer_size * (input_layer_size + 1)],
                                 (hidden_layer_size, (input_layer_size + 1))))
    theta2 = np.matrix(np.reshape(nn_params[(hidden_layer_size * (input_layer_size + 1)):],
                                 (num_labels, (hidden_layer_size + 1))))
    
    # grad values
    theta1_grad = np.zeros(theta1.shape) # (25, n+1)
    theta2_grad = np.zeros(theta2.shape) # (k, 25+1)
    
    # run feed-forward pass
    a1 = np.insert(X, 0, 1, axis=1) # (m, n+1)
    z2 = a1 * theta1.T
    a2 = np.insert(sigmoid(z2), 0, 1, axis=1) # (m, 25+1)
    z3 = a2 * theta2.T
    h = sigmoid(z3) # (m, k)
    
    # compute the cost
    term1 = np.multiply(-y, np.log(h))
    term2 = np.multiply((1 - y), np.log(1 - h))
    J = np.sum(term1 - term2) / m
    
    # add the cost regularization
    J += (lam/(2*m)) * (np.sum(np.power(theta1[:,1:], 2)) + np.sum(np.power(theta2[:,1:], 2)))
    
    
    # back-propagation (vectorized)
    delta3 = h - y # (m, k)
    theta2_grad = (delta3.T * a2) / m
    
    # (m, 25)
    delta2 = np.multiply((delta3 * theta2[:, 1:]), sigmoidGradient(z2))
    theta1_grad = (delta2.T * a1) / m
    
    # regularise the gradients
    theta1_grad[:,1:] = theta1_grad[:,1:] + (lam/m) * theta1[:,1:]
    theta2_grad[:,1:] = theta2_grad[:,1:] + (lam/m) * theta2[:,1:]
    
    # unroll the gradient matrices into single array
    grad = np.concatenate((np.ravel(theta1_grad), np.ravel(theta2_grad)))
    
    return J, grad

In [95]:
# initial setup
n = 400
hidden_size = 25
k = 10
lam = 1
epsilon_init = 0.12

# randomly initialize a parameter array of the size of the full network's parameters
nn_params = np.random.random(size=hidden_size * (n + 1) + k * (hidden_size + 1)) * 2 * epsilon_init - epsilon_init

In [96]:
# test the nnCostFunction
J, grad = nnCostFunction(nn_params, n, hidden_size, k, X, y, lam)
J, grad.shape

(-205.9441712493779, (10285,))

In [97]:
# Train the network
from scipy.optimize import minimize

fmin = minimize(fun=nnCostFunction, x0=nn_params, args=(n, hidden_size, k, X, y_onehot, lam), 
               method='TNC', jac=True, options={'maxiter': 250})
fmin

     fun: 0.33368182385038
     jac: array([ 1.05309591e-04,  1.86369043e-06,  1.67382359e-06, ...,
        1.80675175e-04, -1.80303129e-04, -5.39350949e-05])
 message: 'Max. number of function evaluations reached'
    nfev: 250
     nit: 20
  status: 3
 success: False
       x: array([-2.00382484,  0.00931845,  0.00836912, ..., -0.39113819,
        0.2902884 ,  2.0402121 ])

In [100]:
X = np.matrix(X)
theta1 = np.matrix(np.reshape(fmin.x[:hidden_size * (n + 1)],
                             (hidden_size, (n + 1))))
theta2 = np.matrix(np.reshape(fmin.x[(hidden_size * (n + 1)):],
                             (k, (hidden_size + 1))))

# run feed-forward pass
a1 = np.insert(X, 0, 1, axis=1) # (m, n+1)
z2 = a1 * theta1.T
a2 = np.insert(sigmoid(z2), 0, 1, axis=1) # (m, 25+1)
z3 = a2 * theta2.T
h = sigmoid(z3)

y_pred = np.array(np.argmax(h, axis=1) + 1)
y_pred

array([[10],
       [10],
       [10],
       ...,
       [ 9],
       [ 9],
       [ 9]])

In [101]:
correct = [1 if a == b else 0 for (a, b) in zip(y_pred, y)]
accuracy = sum(correct) / len(correct)
(accuracy * 100)

99.28