
# Multi Layer Perceptron (MLP)

The quintessential example of a deep learning model is the deep feedforward network, also known
as the multilayer perceptron (MLP). A multilayer perceptron is simply a mathematical function that
maps a set of input values to output values.

The function is formed by putting many simpler functions together. We can think of each
application of a different mathematical function as a new representation of the input.

First, we are going to develop our own library to carry out the definition, training and prediction of
our own MLP deep neural network.

In [None]:
import numpy as np


Define the MLP class, including the activation functions to be used and their derivatives:

* sigmoid
* sigmoid_derivative
* relu
* relu_derivative

In [1]:

class Mlp():
    '''
    Fully-connected Multi-Layer Perceptron (MLP).

    The network is described by a list of layer sizes. One weight matrix
    (theta) is kept per pair of consecutive layers and every layer, the
    output one included, applies the same activation function.
    Training is plain full-batch gradient descent driven by backpropagation.
    '''

    # Activation names accepted by the constructor
    SUPPORTED_ACTIVATIONS = ('sigmoid', 'relu')

    def __init__(self, size_layers, act_funct='sigmoid', reg_lambda=0, bias_flag=True):
        '''
        Constructor method. Defines the characteristics of the MLP
        Arguments:
            size_layers : Sequence with the number of Units for:
                [Input, Hidden1, Hidden2, ... HiddenN, Output] Layers.
            act_funct   : Activation function for all the Units in the MLP,
                either 'sigmoid' or 'relu'
                default = 'sigmoid'
            reg_lambda  : Value of the regularization parameter Lambda
                default = 0, i.e. no regularization
            bias_flag   : Indicates if the bias element is added to the input
                of every layer (the output layer itself gets no bias column)
                default = True
        Raises:
            ValueError  : if act_funct is not supported or fewer than two
                layers are given
        '''
        if act_funct not in self.SUPPORTED_ACTIVATIONS:
            raise ValueError(
                "Unsupported activation function %r, expected one of %r"
                % (act_funct, self.SUPPORTED_ACTIVATIONS))
        # Stored as a list so that tuples (or other sequences) are accepted
        self.size_layers = list(size_layers)
        if len(self.size_layers) < 2:
            raise ValueError("size_layers needs at least an input and an output layer")
        self.n_layers    = len(self.size_layers)
        self.act_f       = act_funct
        self.lambda_r    = reg_lambda
        self.bias_flag   = bias_flag

        # Randomly initialize theta (MLP weights)
        self.initialize_theta_weights()

    def _activation_pair(self):
        '''
        Return (activation, derivative) for the configured activation name.
        Raises ValueError if self.act_f was set to an unsupported value.
        '''
        if self.act_f == 'sigmoid':
            return self.sigmoid, self.sigmoid_derivative
        if self.act_f == 'relu':
            return self.relu, self.relu_derivative
        raise ValueError(
            "Unsupported activation function %r, expected one of %r"
            % (self.act_f, self.SUPPORTED_ACTIVATIONS))

    def _layer_pairs(self):
        '''
        Iterate over (units in current layer, units in next layer) pairs.
        '''
        return zip(self.size_layers[:-1], self.size_layers[1:])

    def initialize_theta_weights(self):
        '''
        Initialize theta_weights, initialization method depends
        on the Activation Function and the Number of Units in the current layer
        and the next layer.
        The weights for each layer are of the size [next_layer, current_layer + 1]
        when the bias is used, [next_layer, current_layer] otherwise.
        Output:
            The list of freshly initialized weight matrices (also stored in
            self.theta_weights)
        '''
        # Validates self.act_f before any weight is drawn
        self._activation_pair()
        extra_item = 1 if self.bias_flag else 0

        self.theta_weights = []
        for size_layer, size_next_layer in self._layer_pairs():
            shape = (size_next_layer, size_layer + extra_item)
            if self.act_f == 'sigmoid':
                # Method presented in "Understanding the difficulty of training deep feedforward neural networks"
                # Xavier Glorot and Yoshua Bengio, 2010
                epsilon = 4.0 * np.sqrt(6) / np.sqrt(size_layer + size_next_layer)
                # Weights from a uniform distribution [-epsilon, epsilon]
                theta_tmp = epsilon * ((np.random.rand(*shape) * 2.0) - 1)
            else:
                # Method presented in "Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification"
                # He et al. 2015: std = sqrt(2 / fan_in), fan_in being the
                # number of units feeding the layer
                epsilon = np.sqrt(2.0 / size_layer)
                # Weights from a Normal distribution mean = 0, std = epsilon
                theta_tmp = epsilon * np.random.randn(*shape)
            self.theta_weights.append(theta_tmp)
        return self.theta_weights

    def train(self, X, Y, iterations=400, reset=False, learning_rate=1.0):
        '''
        Given X (feature matrix) and Y (class matrix)
        Updates the Theta Weights by running Backpropagation N times
        Arguments:
            X             : Feature matrix [n_examples, n_features]
            Y             : One-hot class matrix [n_examples, classes]
            iterations    : Number of times Backpropagation is performed
                default = 400
            reset         : If set, initialize Theta Weights before training
                default = False
            learning_rate : Factor applied to the gradient on every update
                default = 1.0, i.e. the raw gradient is subtracted
        '''
        if reset:
            self.initialize_theta_weights()
        for _ in range(iterations):
            # The intermediate values are kept as attributes so that they
            # can be inspected after training
            self.gradients = self.backpropagation(X, Y)
            self.gradients_vector = self.unroll_weights(self.gradients)
            self.theta_vector = self.unroll_weights(self.theta_weights)
            self.theta_vector = self.theta_vector - learning_rate * self.gradients_vector
            self.theta_weights = self.roll_weights(self.theta_vector)

    def predict(self, X):
        '''
        Given X (feature matrix), Y_hat is computed
        Arguments:
            X      : Feature matrix [n_examples, n_features]
        Output:
            Y_hat  : Activations of the output layer [n_examples, classes]
        '''
        A, _ = self.feedforward(X)
        return A[-1]

    def backpropagation(self, X, Y):
        '''
        Implementation of the Backpropagation algorithm with regularization
        Arguments:
            X : Feature matrix [n_examples, n_features]
            Y : One-hot class matrix [n_examples, classes]
        Output:
            gradients : List with one gradient matrix per weight matrix,
                each of the same shape as the matching theta
        '''
        _, g_dz = self._activation_pair()

        n_examples = X.shape[0]
        # Feedforward
        A, Z = self.feedforward(X)

        # Backpropagation
        deltas = [None] * self.n_layers
        # Output error is the plain difference (no activation derivative)
        deltas[-1] = A[-1] - Y
        # From the second last layer down to the second one
        for ix_layer in range(self.n_layers - 2, 0, -1):
            theta_tmp = self.theta_weights[ix_layer]
            if self.bias_flag:
                # Removing weights for bias (first column)
                theta_tmp = theta_tmp[:, 1:]
            deltas[ix_layer] = np.matmul(deltas[ix_layer + 1], theta_tmp) * g_dz(Z[ix_layer])

        # Compute gradients
        gradients = [None] * (self.n_layers - 1)
        for ix_layer in range(self.n_layers - 1):
            grads_tmp = np.matmul(deltas[ix_layer + 1].transpose(), A[ix_layer])
            grads_tmp = grads_tmp / n_examples
            if self.bias_flag:
                # Regularize weights, except for bias weights
                grads_tmp[:, 1:] = grads_tmp[:, 1:] + (self.lambda_r / n_examples) * self.theta_weights[ix_layer][:, 1:]
            else:
                # Regularize ALL weights
                grads_tmp = grads_tmp + (self.lambda_r / n_examples) * self.theta_weights[ix_layer]
            gradients[ix_layer] = grads_tmp
        return gradients

    def feedforward(self, X):
        '''
        Implementation of the Feedforward
        Arguments:
            X : Feature matrix [n_examples, n_features]
        Output:
            A : List with the activations of every layer. For all layers but
                the last one the bias column is included when bias_flag is set
            Z : List with the pre-activation values of every layer
                (Z[0] is None, the input layer has none)
        '''
        g, _ = self._activation_pair()

        A = [None] * self.n_layers
        Z = [None] * self.n_layers
        input_layer = X

        for ix_layer in range(self.n_layers - 1):
            n_examples = input_layer.shape[0]
            if self.bias_flag:
                # Add bias element to every example in input_layer
                input_layer = np.concatenate((np.ones([n_examples, 1]), input_layer), axis=1)
            A[ix_layer] = input_layer
            # Multiplying input_layer by theta_weights for this layer
            Z[ix_layer + 1] = np.matmul(input_layer, self.theta_weights[ix_layer].transpose())
            # Output of this layer will be the input of the next one
            input_layer = g(Z[ix_layer + 1])

        A[self.n_layers - 1] = input_layer
        return A, Z

    def unroll_weights(self, rolled_data):
        '''
        Unroll a list of matrices to a single vector
        Each matrix represents the Weights (or Gradients) from one layer to the next
        Matrices are flattened in column-major ('F') order.
        '''
        # The leading empty array keeps the result a float vector and makes
        # an empty list unroll to an empty vector
        pieces = [np.array([])]
        pieces.extend(one_layer.flatten('F') for one_layer in rolled_data)
        return np.concatenate(pieces)

    def roll_weights(self, unrolled_data):
        '''
        Rolls a single vector back into a list of matrices
        Each matrix represents the Weights (or Gradients) from one layer to the next
        The vector is expected in the layout produced by unroll_weights.
        '''
        extra_item = 1 if self.bias_flag else 0
        rolled_list = []
        offset = 0
        for size_layer, size_next_layer in self._layer_pairs():
            n_weights = size_next_layer * (size_layer + extra_item)
            # Copy the slice so the matrices do not alias the input vector
            data_tmp = np.array(unrolled_data[offset:offset + n_weights])
            data_tmp = data_tmp.reshape(size_next_layer, (size_layer + extra_item), order='F')
            rolled_list.append(data_tmp)
            offset += n_weights
        return rolled_list

    def sigmoid(self, z):
        '''
        Sigmoid function
        z can be a numpy array or scalar
        '''
        result = 1.0 / (1.0 + np.exp(-z))
        return result

    def relu(self, z):
        '''
        Rectified Linear function
        z can be a numpy array or scalar
        '''
        # Element-wise max(z, 0); works for scalars and arrays alike
        return np.maximum(z, 0.0)

    def sigmoid_derivative(self, z):
        '''
        Derivative for Sigmoid function
        z can be a numpy array or scalar
        '''
        sig = self.sigmoid(z)
        return sig * (1 - sig)

    def relu_derivative(self, z):
        '''
        Derivative for Rectified Linear function
        z can be a numpy array or scalar
        The derivative at z = 0 is taken as 0.
        '''
        result = 1 * (z > 0)
        return result




#  Practical example

This time, an MLP network will be used, built with the code developed in the previous practical
example, which is included here as a library.

In [3]:
import numpy as np
import pickle,gzip
import matplotlib.pyplot as plt
import os
import urllib.request

In [None]:
# Path of the pickled MNIST data set
mnist_filename = 'mnist.pkl'

# The original 'mnist.pkl.gz' was created with Python 2, so encoding='latin1'
# is needed to load it in Python 3.
# NOTE(review): pickle can execute arbitrary code while loading; only open
# files that come from a trusted source.
with open(mnist_filename, mode='rb') as mnist_file:
    mnist_splits = pickle.load(mnist_file, encoding='latin1')
train_set, valid_set, test_set = mnist_splits

Show 8 random examples from the dataset we will work with.

In [None]:
# Draw 8 random indices (below 10000) and show the matching training images
# side by side, each one titled with its label.
examples = np.random.randint(10000, size=8)
n_examples = len(examples)
plt.figure()

images, targets = train_set[0], train_set[1]
for ix_example in range(n_examples):
    ix_sample = examples[ix_example]
    # Every row holds a flattened 28x28 image
    tmp = np.reshape(images[ix_sample, :], [28, 28])
    ax = plt.subplot(1, n_examples, ix_example + 1)
    # Hide the tick labels, only the picture matters
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    plt.title(str(targets[ix_sample]))
    plt.imshow(tmp, cmap='gray')

Split the data into data used for training and data used for testing.

In [None]:
# Training data
# NOTE(review): the *validation* split is what gets used for training here
# (presumably to keep the example small) -- confirm this is intended.
train_X = valid_set[0]
train_y = valid_set[1]
print('Shape of training set: ' + str(train_X.shape))

# Change y [1D] to Y [2D] one-hot array coding the class:
# row i has a 1 in the column of the label of example i
n_examples = len(train_y)
labels = np.unique(train_y)
train_Y = np.zeros((n_examples, len(labels)))
for ix_label, label in enumerate(labels):
    # Mark the examples whose label is labels[ix_label]
    train_Y[train_y == label, ix_label] = 1

# Test data
test_X = test_set[0]
test_y = test_set[1]

print('Shape of test set: ' + str(test_X.shape))

# Change y [1D] to Y [2D] one-hot array coding the class
n_examples = len(test_y)
labels = np.unique(test_y)

test_Y = np.zeros((n_examples, len(labels)))

for ix_label, label in enumerate(labels):
    # Mark the examples whose label is labels[ix_label]
    test_Y[test_y == label, ix_label] = 1

Create the multilayer perceptron network with 4 layers (input, two hidden and output) and use the
“relu” function as the activation function in all of them.

In [None]:
# Build the MLP object: 784 inputs, hidden layers of 25 and 10 units and
# 10 outputs, with 'relu' activations, lambda = 6 and bias units enabled.
# Creating the object also initializes its weights.
mlp_classifier = Mlp(
    size_layers=[784, 25, 10, 10],
    act_funct='relu',
    reg_lambda=6,
    bias_flag=True,
)

Train the network with the training data.

In [None]:
# Training with Backpropagation and 400 iterations.
# The network is trained one iteration at a time so that the loss can be
# recorded after every update.
iterations = 400
loss = np.zeros([iterations, 1])
for ix in range(iterations):
    mlp_classifier.train(train_X, train_Y, 1)
    Y_hat = mlp_classifier.predict(train_X)
    # Predicted class = label of the output unit with the highest activation
    y_tmp = np.argmax(Y_hat, axis=1)
    y_hat = labels[y_tmp]
    # Half the mean squared difference between predicted and true labels
    loss[ix] = (0.5) * np.square(y_hat - train_y).mean()


Plot the evolution of the cost function.

In [None]:
# Plot the recorded loss against the iteration number
ix = np.arange(iterations)
plt.figure()
plt.plot(ix, loss)

Show some of the results obtained.

In [None]:
# Some test samples, [T]rue labels and [P]redicted labels
# The predictions must come from the test matrix itself so that every shown
# image is paired with the label predicted for that very image.
Y_hat_test = mlp_classifier.predict(test_X)
y_hat_test = labels[np.argmax(Y_hat_test, axis=1)]

# Draw the random indices from the actual size of the test set
examples = np.random.randint(test_X.shape[0], size=8)
n_examples = len(examples)
plt.figure()

for ix_example in range(n_examples):
    ix_sample = examples[ix_example]
    # Every row holds a flattened 28x28 image
    tmp = np.reshape(test_X[ix_sample, :], [28, 28])
    ax = plt.subplot(1, n_examples, ix_example + 1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    plt.title('T' + str(test_y[ix_sample]) + ', P' + str(y_hat_test[ix_sample]))
    plt.imshow(tmp, cmap='gray')