In [19]:
import sys

import numpy as np

### Questions 1

#### A: Principle of backpropagation algorithm:
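Backpropagation is the algorithm used to compute the gradients needed to train a neural network. After a forward pass, the error (loss) between the prediction and the target is computed and 
propagated backward through the network layers using the chain rule, which gives the gradient of the loss with respect to every weight; these gradients are then used (e.g. by gradient descent) to fine-tune the weights. 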
#### B: The meaning and the role of the Softmax function:
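It converts the neural network's raw output scores into a probability distribution over the classes: each score is exponentiated and divided by the sum of all exponentiated scores, so every output is positive and the outputs sum to 1. 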

#### C: Name typically used non-linear output functions and implications of choosing one or another for implementation:
- **ReLu**: It is very efficient computationally
- **Sigmoid**: It is used for probabilistic predictions because of its range (between 0 and 1)
- **SoftMax**: Similar to Sigmoid in that it gives probabilities, but because its outputs are normalized to sum to 1 across all units, it is usually used only in the last layer, where the outputs are interpreted as class probabilities
- **Hyperbolic Tangent**: Very good for mapping outputs to "negative", "neutral" or "positive" states (its range is -1 to 1) and is solid for hidden layers as an activation function. Since tanh is zero-centered (meaning its outputs are centered around 0), it can help in reducing the bias shift effect during training. If the activations are not zero-centered, the gradients can consistently be all positive or all negative in certain layers, which can lead to inefficient gradient descent.

In [20]:
#functions of non-linear activations
def f_sigmoid(X, deriv=False):
    """Logistic sigmoid, applied element-wise.

    Args:
        X: scalar or NumPy array of pre-activation values.
        deriv: when True, return the derivative of the sigmoid evaluated
            at ``X`` instead of the sigmoid itself.

    Returns:
        ``1 / (1 + exp(-X))``, or ``s * (1 - s)`` with ``s`` the sigmoid of
        ``X`` when ``deriv`` is set.
    """
    activated = 1 / (1 + np.exp(-X))
    if deriv:
        # The derivative is expressed through the sigmoid value itself.
        return activated * (1 - activated)
    return activated


def f_softmax(X):
    """Row-wise softmax of a 2-D array of scores.

    Each row of ``X`` is exponentiated and divided by the sum of its
    exponentials, so every row of the result is non-negative and sums to 1.

    The row maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing
    (which would give inf/inf = NaN) on large scores and from underflowing
    to 0/0 when every score in a row is very negative.

    Args:
        X: 2-D array with one example per row and one score per column.

    Returns:
        Array of the same shape as ``X`` holding the per-row probabilities.
    """
    shifted = X - np.max(X, axis=1, keepdims=True)
    exps = np.exp(shifted)
    # keepdims keeps the normalizer as a column so it broadcasts per row.
    return exps / np.sum(exps, axis=1, keepdims=True)

def f_relu(X, deriv=False):
    """Rectified linear unit, applied element-wise.

    Args:
        X: NumPy array of pre-activation values.
        deriv: when True, return the derivative instead of the activation.

    Returns:
        ``max(0, X)`` element-wise, or, when ``deriv`` is set, a float array
        that is 1.0 where ``X > 0`` and 0.0 elsewhere.
    """
    if deriv:
        positive = X > 0
        return positive.astype(float)
    return np.maximum(0, X)

In [21]:
def exit_with_err(err_str):
    """Print ``err_str`` to standard error and terminate with exit status 1.

    Args:
        err_str: message to write to ``sys.stderr``.

    Raises:
        SystemExit: always, with exit code 1.
    """
    # Python 3 form; the Python 2 statement ``print >> sys.stderr, ...``
    # raises a TypeError under Python 3 instead of printing.
    print(err_str, file=sys.stderr)
    sys.exit(1)

In [22]:
#Functionality of a single hidden layer
class Layer:
    """One layer of the network: its activations, weights and backprop buffers.

    ``size`` is a two-element sequence. ``size[0]`` is used as the width of
    this layer's buffers and the whole of ``size`` as the shape of the
    outgoing weight matrix.
    """

    def __init__(self, size, batch_size, is_input=False, is_output=False,
                 activation=f_sigmoid):
        is_hidden = not is_input and not is_output
        width = size[0]

        self.is_input = is_input
        self.is_output = is_output
        # Externally defined activation function; hidden layers also call
        # it with deriv=True to obtain its derivative.
        self.activation = activation

        # Z holds the output values of this layer.
        self.Z = np.zeros((batch_size, width))
        # W is the outgoing weight matrix; the output layer has none.
        self.W = None if is_output else np.random.normal(size=size, scale=1E-4)
        # S holds the inputs to this layer and D its deltas; the input
        # layer has neither.
        self.S = None if is_input else np.zeros((batch_size, width))
        self.D = None if is_input else np.zeros((batch_size, width))
        # Fp holds the activation derivatives; only hidden layers keep it.
        self.Fp = np.zeros((width, batch_size)) if is_hidden else None

    def forward_propagate(self):
        """Compute this layer's output and return what is fed to the next layer.

        The input layer returns ``Z.dot(W)`` directly. Other layers first
        apply the activation to ``S``; the output layer returns that
        activation, while a hidden layer appends a bias column of ones,
        stores the transposed activation derivative in ``Fp`` and returns
        ``Z.dot(W)``.
        """
        if self.is_input:
            return self.Z.dot(self.W)

        activated = self.activation(self.S)
        if self.is_output:
            self.Z = activated
            return self.Z

        # Hidden layer: add the bias values as an extra column of ones.
        bias_column = np.ones((activated.shape[0], 1))
        self.Z = np.append(activated, bias_column, axis=1)
        self.Fp = self.activation(self.S, deriv=True).T
        return self.Z.dot(self.W)


In [32]:
class MultiLayerPerceptron:
    """Fully connected feed-forward network trained by mini-batch backpropagation.

    The network is a list of ``Layer`` objects: one input layer, one hidden
    layer per interior entry of ``layer_config`` and a softmax output layer.
    Bias terms are handled by giving every non-output layer one extra unit
    whose value is fixed to 1.
    """

    def __init__(self, layer_config, batch_size=100, hidden_activation=None):
        """Create the layers described by ``layer_config``.

        Args:
            layer_config: sequence of layer sizes, input first and output
                last, e.g. ``[784, 100, 100, 10]``.
            batch_size: number of examples per mini-batch; forwarded to
                every ``Layer``.
            hidden_activation: activation used by the hidden layers,
                callable as ``f(X)`` and ``f(X, deriv=True)``. Defaults to
                ``f_relu`` when omitted.

        Raises:
            ValueError: if ``layer_config`` has fewer than two entries.
        """
        if len(layer_config) < 2:
            raise ValueError("layer_config needs at least an input and an "
                             "output size, got {0!r}".format(layer_config))
        if hidden_activation is None:
            hidden_activation = f_relu

        self.layers = []
        self.num_layers = len(layer_config)
        self.minibatch_size = batch_size

        for i in range(self.num_layers-1):
            # One additional unit per layer carries the bias weight.
            size = [layer_config[i]+1, layer_config[i+1]]
            if i == 0:
                print ("Initializing input layer with size {0}.".format(layer_config[i]))
                self.layers.append(Layer(size, batch_size, is_input=True))
            else:
                print ("Initializing hidden layer with size {0}.".format(layer_config[i]))
                self.layers.append(Layer(size, batch_size,
                                         activation=hidden_activation))

        print ("Initializing output layer with size {0}.".format(layer_config[-1]))
        self.layers.append(Layer([layer_config[-1], None],
                                 batch_size,
                                 is_output=True,
                                 activation=f_softmax))
        print ("Done!")

    def forward_propagate(self, data):
        """Run ``data`` through the network and return the output activations.

        Args:
            data: 2-D array with one example per row.

        Returns:
            The value returned by the output layer's ``forward_propagate``.
        """
        # We need to be sure to add bias values to the input
        self.layers[0].Z = np.append(data, np.ones((data.shape[0], 1)), axis=1)

        for i in range(self.num_layers-1):
            self.layers[i+1].S = self.layers[i].forward_propagate()
        return self.layers[-1].forward_propagate()

    def backpropagate(self, yhat, labels):
        """Compute the deltas ``D`` of the output layer and every hidden layer.

        Args:
            yhat: network output for the current batch.
            labels: target values with the same shape as ``yhat``.
        """
        # Output delta: prediction minus target, transposed. For a softmax
        # output trained with cross-entropy loss this is the gradient of the
        # loss with respect to the output layer's pre-activations.
        self.layers[-1].D = (yhat - labels).T

        # Walk the hidden layers from the back of the network to the front,
        # deriving each layer's deltas from the deltas of the layer after it.
        for i in range(self.num_layers-2, 0, -1):
            # We do not calculate deltas for the bias values
            W_nobias = self.layers[i].W[0:-1, :]
            self.layers[i].D = W_nobias.dot(self.layers[i+1].D) * self.layers[i].Fp

    def update_weights(self, eta):
        """Apply one gradient-descent step to every weight matrix.

        Args:
            eta: learning rate; the gradient is multiplied by it and the
                product is subtracted from the weights.
        """
        for i in range(0, self.num_layers-1):
            W_grad = -eta*(self.layers[i+1].D.dot(self.layers[i].Z)).T
            self.layers[i].W += W_grad

    def _error_rate(self, batched_data, batched_labels):
        """Return the fraction of misclassified examples over all batches.

        ``batched_labels`` must hold one-hot rows. Returns NaN when the
        batches contain no examples.
        """
        errs = 0
        n_examples = 0
        for b_data, b_labels in zip(batched_data, batched_labels):
            output = self.forward_propagate(b_data)
            yhat = np.argmax(output, axis=1)
            # The one-hot entry at the predicted class is 1 for a hit and 0
            # for a miss, so 1 - entry counts the errors.
            errs += np.sum(1-b_labels[np.arange(len(b_labels)), yhat])
            n_examples += len(b_labels)
        if n_examples == 0:
            return float("nan")
        return float(errs)/n_examples

    def evaluate(self, train_data, train_labels, test_data, test_labels,
                 num_epochs=70, eta=0.05, eval_train=False, eval_test=True):
        """Train the network and print the error rate after every epoch.

        Args:
            train_data: list of training batches (2-D arrays).
            train_labels: list of one-hot label batches matching ``train_data``.
            test_data: list of test batches.
            test_labels: list of one-hot label batches matching ``test_data``.
            num_epochs: number of passes over the training batches.
            eta: learning rate passed to ``update_weights``.
            eval_train: when True, report the training error each epoch.
            eval_test: when True, report the test error each epoch.

        A split is only inspected when its ``eval_*`` flag is set, so an
        empty test split is accepted together with ``eval_test=False``.
        """
        print ("Training for {0} epochs...".format(num_epochs))
        for t in range(0, num_epochs):
            out_str = "[{0:4d}] ".format(t)

            for b_data, b_labels in zip(train_data, train_labels):
                output = self.forward_propagate(b_data)
                self.backpropagate(output, b_labels)
                # eta is the learning rate: the gradient is scaled by it and
                # subtracted from the weights.
                self.update_weights(eta=eta)

            if eval_train:
                out_str = ("{0} Training error: {1:.5f}".format(
                    out_str, self._error_rate(train_data, train_labels)))

            if eval_test:
                out_str = ("{0} Test error: {1:.5f}").format(
                    out_str, self._error_rate(test_data, test_labels))

            print (out_str)


In [24]:
def label_to_bit_vector(labels, nbits):
    """One-hot encode a 1-D array of integer class labels.

    Args:
        labels: array of integer labels; each must be a valid column index
            for a row of length ``nbits``.
        nbits: length of each one-hot row (number of classes).

    Returns:
        Float array of shape ``(labels.shape[0], nbits)`` with a single 1.0
        per row, at the column given by that row's label.
    """
    n_examples = labels.shape[0]
    one_hot = np.zeros((n_examples, nbits))
    for row, label in enumerate(labels):
        one_hot[row, label] = 1.0
    return one_hot

In [25]:
def create_batches(data, labels, batch_size, create_bit_vector=False,
                   num_classes=10):
    """Split ``data`` and ``labels`` into consecutive batches of equal size.

    Only full batches are produced: when ``batch_size`` does not evenly
    divide the number of examples, a warning is printed and the trailing
    examples that do not fill a batch are left out.

    Args:
        data: 2-D array with one example per row.
        labels: array of labels, one per row of ``data``.
        batch_size: number of examples per batch; must be positive.
        create_bit_vector: when True, each label batch is one-hot encoded
            with ``label_to_bit_vector``.
        num_classes: width of the one-hot rows when ``create_bit_vector``
            is set.

    Returns:
        ``(chunked_data, chunked_labels)``, two lists of equal length.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got {0}".format(batch_size))

    N = data.shape[0]
    print ("Batch size {0}, the number of examples {1}.".format(batch_size,N))

    if N % batch_size != 0:
        print ("Warning in create_batches(): Batch size {0} does not " \
              "evenly divide the number of examples {1}.".format(batch_size,N))
    chunked_data = []
    chunked_labels = []
    # Stop at the last start index that still leaves room for a full batch.
    for idx in range(0, N - batch_size + 1, batch_size):
        chunked_data.append(data[idx:idx+batch_size, :])
        batch_labels = labels[idx:idx+batch_size]
        if create_bit_vector:
            batch_labels = label_to_bit_vector(batch_labels, num_classes)
        chunked_labels.append(batch_labels)

    return chunked_data, chunked_labels


In [26]:
def prepare_for_backprop(batch_size, Train_images, Train_labels, Valid_images, Valid_labels):
    """Batch the training and validation sets with one-hot encoded labels.

    Both sets are passed to ``create_batches`` with
    ``create_bit_vector=True``.

    Returns:
        A 4-tuple ``(train_data, train_labels, valid_data, valid_labels)``
        of batch lists.
    """
    print ("Creating data...")
    train_batches = create_batches(Train_images, Train_labels, batch_size,
                                   create_bit_vector=True)
    valid_batches = create_batches(Valid_images, Valid_labels, batch_size,
                                   create_bit_vector=True)
    print ("Done!")

    train_data, train_labels = train_batches
    valid_data, valid_labels = valid_batches
    return train_data, train_labels, valid_data, valid_labels

def get_accuracy(model, X, y):
    """Print and return the fraction of examples ``model`` classifies correctly.

    Args:
        model: object whose ``forward_propagate(X)`` returns one row of
            class scores per example.
        X: input examples passed to ``model.forward_propagate``.
        y: integer class labels, one per example (not one-hot).

    Returns:
        The accuracy: number of rows whose arg-max equals the label,
        divided by ``len(y)``.
    """
    predicted = np.argmax(model.forward_propagate(X), axis=1)
    n_correct = np.sum(predicted == y)
    accuracy = n_correct / float(len(y))
    print("Accuracy: {0:.4f}".format(accuracy))
    return accuracy


In [27]:
from keras.datasets import mnist

In [28]:
# Load MNIST, flatten every image into a 784-element row, convert to
# float32 and divide the pixel values by 255.
(Xtr, Ltr), (X_test, L_test) = mnist.load_data()

Xtr = Xtr.reshape(60000, 784).astype('float32') / 255
X_test = X_test.reshape(10000, 784).astype('float32') / 255

print(Xtr.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')


60000 train samples
10000 test samples


In [29]:
# Baseline experiment: train a 784-100-100-10 network with evaluate()'s
# default learning rate and report the accuracy on the test set.
# NOTE(review): this cell has execution count 29 but the
# MultiLayerPerceptron cell has count 32, so the recorded output below was
# presumably produced by an earlier definition of the class -- confirm with
# Restart Kernel -> Run All.
batch_size=100

# Batches of one-hot labelled data; the test set doubles as validation set.
train_data, train_labels, valid_data, valid_labels=prepare_for_backprop(batch_size, Xtr, Ltr, X_test, L_test)

mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10], batch_size=batch_size)

# eval_train=True additionally prints the training error after every epoch.
mlp.evaluate(train_data, train_labels, valid_data, valid_labels,
             eval_train=True)

# get_accuracy takes the raw integer labels, not the one-hot batches.
get_accuracy(mlp, X_test, L_test)

print("Done:)\n")
    


Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...


[   0]  Training error: 0.46972 Test error: 0.47070
[   1]  Training error: 0.07760 Test error: 0.07590
[   2]  Training error: 0.04945 Test error: 0.05290
[   3]  Training error: 0.04372 Test error: 0.04740
[   4]  Training error: 0.03355 Test error: 0.03800
[   5]  Training error: 0.03038 Test error: 0.03970
[   6]  Training error: 0.02590 Test error: 0.03690
[   7]  Training error: 0.02168 Test error: 0.03310
[   8]  Training error: 0.02067 Test error: 0.03440
[   9]  Training error: 0.02007 Test error: 0.03360
[  10]  Training error: 0.02150 Test error: 0.03700
[  11]  Training error: 0.01632 Test error: 0.03440
[  12]  Training error: 0.01758 Test error: 0.03280
[  13]  Training error: 0.02112 Test error: 0.03510
[  14]  Training error: 0.01668 Test error: 0.03280
[  15]  Training error: 0.01080 Test error: 0.03160
[  16]  Training error: 0.01162 Test error: 0.03210
[  17]  Training error: 0.01190 Test error: 0.03230
[  18]  Training error: 0.01195 Test error: 0.03170
[  19]  Trai

In [30]:
# Same experiment as the baseline run with a larger learning rate (eta=0.5).
# NOTE(review): the recorded output stays near 0.90 error, which presumably
# means this step size is too large for training to converge -- confirm.
# NOTE(review): this cell has execution count 30 but the
# MultiLayerPerceptron cell has count 32, so the output was presumably
# produced by an earlier definition of the class -- confirm with
# Restart Kernel -> Run All.
batch_size=100

train_data, train_labels, valid_data, valid_labels=prepare_for_backprop(batch_size, Xtr, Ltr, X_test, L_test)

mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10], batch_size=batch_size)

# eta is the learning rate forwarded to update_weights() on every batch.
mlp.evaluate(train_data, train_labels, valid_data, valid_labels,
             eval_train=True, eta=0.5)

get_accuracy(mlp, X_test, L_test)

print("Done:)\n")


Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.89782 Test error: 0.89900
[   1]  Training error: 0.90085 Test error: 0.89910
[   2]  Training error: 0.90128 Test error: 0.90200
[   3]  Training error: 0.90085 Test error: 0.89910
[   4]  Training error: 0.90137 Test error: 0.90420
[   5]  Training error: 0.90128 Test error: 0.90200
[   6]  Training error: 0.88763 Test error: 0.88650
[   7]  Training error: 0.90137 Test error: 0.90420
[   8]  Training error: 0.89782 Test error: 0.89900
[   9]  Training error: 0.90263 Test error: 0.90180
[  10]  Training error: 0.90085 Test error: 0.89910
[  11]  Training error: 0.89782 Test error: 0.89900
[  12]  Training error: 0.89782 Test error: 0.89900
[  13]  Trainin

In [31]:
# Same experiment as the baseline run with a smaller learning rate
# (eta=0.005).
# NOTE(review): this cell has execution count 31 but the
# MultiLayerPerceptron cell has count 32, so the output was presumably
# produced by an earlier definition of the class -- confirm with
# Restart Kernel -> Run All.
batch_size=100

train_data, train_labels, valid_data, valid_labels=prepare_for_backprop(batch_size, Xtr, Ltr, X_test, L_test)

mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10], batch_size=batch_size)

# eta is the learning rate forwarded to update_weights() on every batch.
mlp.evaluate(train_data, train_labels, valid_data, valid_labels,
             eval_train=True, eta=0.005)

get_accuracy(mlp, X_test, L_test)

print("Done:)\n")

Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.70343 Test error: 0.70080
[   1]  Training error: 0.64710 Test error: 0.64380
[   2]  Training error: 0.59973 Test error: 0.59910
[   3]  Training error: 0.45603 Test error: 0.46660
[   4]  Training error: 0.20322 Test error: 0.19170
[   5]  Training error: 0.11390 Test error: 0.11040
[   6]  Training error: 0.09023 Test error: 0.08900
[   7]  Training error: 0.07532 Test error: 0.07420
[   8]  Training error: 0.06420 Test error: 0.06520
[   9]  Training error: 0.05513 Test error: 0.05750
[  10]  Training error: 0.04828 Test error: 0.05070
[  11]  Training error: 0.04290 Test error: 0.04760
[  12]  Training error: 0.03870 Test error: 0.04340
[  13]  Trainin

In [33]:
# Baseline configuration (default learning rate) run again, labelled
# "With ReLu" in its output.
# NOTE(review): the recorded output shows the same error (0.90137 / 0.90420)
# at every epoch, so the network presumably does not learn in this
# configuration -- investigate (e.g. weight initialisation scale, learning
# rate, numerical overflow in the output layer).
batch_size=100

print("With ReLu\n")

train_data, train_labels, valid_data, valid_labels=prepare_for_backprop(batch_size, Xtr, Ltr, X_test, L_test)

mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10], batch_size=batch_size)

# eval_train=True additionally prints the training error after every epoch.
mlp.evaluate(train_data, train_labels, valid_data, valid_labels,
             eval_train=True)

get_accuracy(mlp, X_test, L_test)

print("Done:)\n")

With ReLu

Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.90137 Test error: 0.90420
[   1]  Training error: 0.90137 Test error: 0.90420
[   2]  Training error: 0.90137 Test error: 0.90420
[   3]  Training error: 0.90137 Test error: 0.90420
[   4]  Training error: 0.90137 Test error: 0.90420
[   5]  Training error: 0.90137 Test error: 0.90420
[   6]  Training error: 0.90137 Test error: 0.90420
[   7]  Training error: 0.90137 Test error: 0.90420
[   8]  Training error: 0.90137 Test error: 0.90420
[   9]  Training error: 0.90137 Test error: 0.90420
[  10]  Training error: 0.90137 Test error: 0.90420
[  11]  Training error: 0.90137 Test error: 0.90420
[  12]  Training error: 0.90137 Test error: 0.90420
[  1