In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from sklearn import datasets

In [2]:
class ReLULayer(object):
    """Element-wise rectified linear unit, max(0, x); has no trainable parameters."""

    def forward(self, input):
        """Cache ``input`` for the backward pass and return its ReLU."""
        self.input = input
        return np.maximum(0, input)

    def backward(self, upstream_gradient):
        """Return the gradient with respect to the input cached by ``forward``.

        The derivative of ReLU is 1 where the input is strictly positive and
        0 everywhere else, so the upstream gradient passes through unchanged
        for active units and is masked to zero for inactive ones.
        """
        is_active = self.input > 0
        return upstream_gradient * is_active

    def update(self, learning_rate):
        """Do nothing: ReLU is parameter-free."""

<div style="color: green; font-weight: bold">Comment</div>
ReLU: Both solutions are basically the same.<br>
Downstream gradient: Both solutions are basically the same.

In [3]:
class OutputLayer(object):
    """Softmax output layer used together with a cross-entropy loss (parameter-free)."""

    def __init__(self, n_classes):
        self.n_classes = n_classes

    def forward(self, input):
        """Cache ``input`` and return its row-wise softmax.

        Each row of ``input`` is normalized independently (axis 1), i.e.
        softmax_j = exp(x_j) / sum_k exp(x_k).
        """
        self.input = input
        # subtracting the row maximum does not change the softmax,
        # but keeps exp() from overflowing for large preactivations
        row_max = np.max(input, axis=1, keepdims=True)
        unnormalized = np.exp(input - row_max)
        normalizer = np.sum(unnormalized, axis=1, keepdims=True)
        return unnormalized / normalizer

    def backward(self, predicted_posteriors, true_labels):
        """Return the loss derivative with respect to the cached input.

        For cross-entropy on top of softmax the chain rule collapses to
        ``predicted_posteriors - one_hot(true_labels)``. The argument is
        copied first, so the caller's array is left untouched.
        """
        gradient = predicted_posteriors.copy()
        row_index = np.arange(predicted_posteriors.shape[0])
        # subtract 1 at the column of the true class in every row
        gradient[row_index, true_labels] -= 1
        return gradient

    def update(self, learning_rate):
        """Do nothing: softmax is parameter-free."""

<div style="color: green; font-weight: bold">Comment</div>
Both solutions are the same.

In [4]:
class LinearLayer(object):
    """Fully connected affine layer computing ``input . B + b``."""

    def __init__(self, n_inputs, n_outputs):
        self.n_inputs  = n_inputs
        self.n_outputs = n_outputs
        # He-style random initialization: weights and intercepts are both
        # drawn from a zero-mean normal distribution with variance 2 / n_inputs
        he_std = np.sqrt(2.0 / n_inputs)
        self.B = np.random.normal(0, he_std, size=(n_inputs, n_outputs))
        self.b = np.random.normal(0, he_std, size=(1, n_outputs))

    def forward(self, input):
        """Cache ``input`` and return the preactivations ``input . B + b``.

        The result feeds the subsequent non-linear layer.
        """
        self.input = input
        return np.dot(input, self.B) + self.b

    def backward(self, upstream_gradient):
        """Store the parameter gradients and return the input gradient.

        Sets ``self.grad_b`` (shape ``(1, n_outputs)``, the upstream gradient
        summed over axis 0) and ``self.grad_B`` (shape
        ``(n_inputs, n_outputs)``, ``input^T . upstream_gradient``). Returns
        ``upstream_gradient . B^T`` for the preceding layer.
        """
        # gradients of the parameters, summed (not averaged) over the mini-batch
        self.grad_b = np.sum(upstream_gradient, axis=0, keepdims=True)
        self.grad_B = np.dot(self.input.T, upstream_gradient)

        # gradient with respect to this layer's input
        return np.dot(upstream_gradient, self.B.T)

    def update(self, learning_rate):
        """Take one gradient-descent step using the gradients stored by ``backward``."""
        weight_step = learning_rate * self.grad_B
        self.B = self.B - weight_step
        intercept_step = learning_rate * self.grad_b
        self.b = self.b - intercept_step

<div style="color: green; font-weight: bold">Comment</div>
Correctly used He initialization, which is also a valid choice. <br>
The rest is essentially the same as the provided solution.


In [5]:
class MLP(object):
    """Multi-layer perceptron for classification.

    Hidden layers are linear layers followed by ReLU; the last linear layer
    is followed by a softmax output layer, so ``forward`` returns the
    predicted posterior probability of every class.
    """

    def __init__(self, n_features, layer_sizes):
        """Construct the network.

        n_features: number of inputs
        len(layer_sizes): number of layers
        layer_sizes[k]: number of neurons in layer k
        (specifically: layer_sizes[-1] is the number of classes)
        """
        self.n_layers = len(layer_sizes)
        self.layers   = []

        # create interior layers (linear + ReLU)
        n_in = n_features
        for n_out in layer_sizes[:-1]:
            self.layers.append(LinearLayer(n_in, n_out))
            self.layers.append(ReLULayer())
            n_in = n_out

        # create last linear layer + output layer
        n_out = layer_sizes[-1]
        self.layers.append(LinearLayer(n_in, n_out))
        self.layers.append(OutputLayer(n_out))

    def forward(self, X):
        """Return the network output for the mini-batch ``X``.

        Everything but the first axis of ``X`` is flattened (in case the
        instances are images). Every layer stores its own input during the
        pass, which is what ``backward`` relies on afterwards.
        """
        batch_size = X.shape[0]
        X = X.reshape(batch_size, -1)

        result = X
        for layer in self.layers:
            result = layer.forward(result)
        return result

    def backward(self, predicted_posteriors, true_classes):
        """Backpropagate the loss of the latest mini-batch through all layers.

        Must be called after ``forward`` on the same mini-batch. The layers
        store their parameter gradients themselves; nothing is returned.
        """
        # the output layer turns posteriors and labels into the initial gradient
        downstream_gradient = self.layers[-1].backward(predicted_posteriors, true_classes)
        # the remaining layers are visited in reverse order
        for layer in reversed(self.layers[:-1]):
            downstream_gradient = layer.backward(downstream_gradient)

    def update(self, X, Y, learning_rate):
        """Run one forward pass, one backward pass and one parameter update on (X, Y)."""
        posteriors = self.forward(X)
        self.backward(posteriors, Y)
        for layer in self.layers:
            layer.update(learning_rate)

    def train(self, x, y, n_epochs, batch_size, learning_rate, print_after_num_epoch=5):
        """Train with mini-batch gradient descent.

        x, y: training instances and their integer class labels
        n_epochs: number of passes over the training set
        batch_size: number of instances per mini-batch (must be >= 1)
        learning_rate: step size handed to every layer's ``update``
        print_after_num_epoch: print the training error rate every this many
            epochs; 0 or None switches the report off

        Every epoch visits each instance exactly once. If ``len(x)`` is not a
        multiple of ``batch_size``, the last mini-batch of the epoch is
        smaller, so no instance is dropped and ``batch_size > len(x)`` still
        performs one update per epoch.

        Raises ValueError if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        N = len(x)
        for epoch in range(n_epochs):
            # reorder data for every epoch
            # (i.e. sample mini-batches without replacement)
            permutation = np.random.permutation(N)

            # stepping by batch_size also covers the trailing partial batch
            for start in range(0, N, batch_size):
                batch_indices = permutation[start:start + batch_size]
                # one forward and backward pass plus a parameter update
                self.update(x[batch_indices], y[batch_indices], learning_rate)

            # report the error rate on the full training set every
            # print_after_num_epoch epochs (skipped when that value is 0/None)
            if print_after_num_epoch and (epoch + 1) % print_after_num_epoch == 0:
                predicted_posteriors = self.forward(x)
                predicted_classes = np.argmax(predicted_posteriors, axis=1)
                error_rate = np.mean(predicted_classes != y)
                print(f"Epoch {epoch+1}, Training Error Rate: {error_rate}")


<div style="color: green; font-weight: bold">Comment</div>
Both solutions are essentially the same.

In [9]:
if __name__=="__main__":

    # Fix the seed of NumPy's global random number generator so that the data
    # sets, the weight initialization and the mini-batch order are reproducible.
    # make_moons is called without random_state below and therefore falls back
    # to that global generator as well.
    SEED = 0
    np.random.seed(SEED)

    # set training/test set size
    N = 2000

    # create training and test data
    X_train, Y_train = datasets.make_moons(N, noise=0.05)
    X_test,  Y_test  = datasets.make_moons(N, noise=0.05)
    n_features = 2
    n_classes  = 2

    # min-max scale the features to [-1, 1]; offset and scaling come from the
    # training set only, so test features may fall slightly outside that range
    offset  = X_train.min(axis=0)
    scaling = X_train.max(axis=0) - offset
    X_train = ((X_train - offset) / scaling - 0.5) * 2.0
    X_test  = ((X_test  - offset) / scaling - 0.5) * 2.0

    # set hyperparameters (play with these!)
    # they are shared by all architectures, so they are set once before the loop
    n_epochs = 100
    batch_size = 200
    learning_rate = 0.001  # 0.05 was tried as an alternative
    layer_sizes_list=[[2,2,n_classes], [3,3,n_classes], [5,5,n_classes], [30,30,n_classes]]

    for layer_sizes in layer_sizes_list:
        print(f"---layer size: {layer_sizes}")

        # create network
        network = MLP(n_features, layer_sizes)

        # train
        network.train(X_train, Y_train, n_epochs, batch_size, learning_rate, print_after_num_epoch=10)

        # test
        predicted_posteriors = network.forward(X_test)
        # determine class predictions from posteriors by winner-takes-all rule
        predicted_classes = np.argmax(predicted_posteriors, axis=1)
        # compute and output the error rate of predicted_classes
        error_rate = np.mean(predicted_classes != Y_test)
        print("error rate on val set:", error_rate)


---layer size: [2, 2, 2]
Epoch 10, Training Error Rate: 0.1215
Epoch 20, Training Error Rate: 0.1115
Epoch 30, Training Error Rate: 0.109
Epoch 40, Training Error Rate: 0.1125
Epoch 50, Training Error Rate: 0.1125
Epoch 60, Training Error Rate: 0.1085
Epoch 70, Training Error Rate: 0.112
Epoch 80, Training Error Rate: 0.116
Epoch 90, Training Error Rate: 0.113
Epoch 100, Training Error Rate: 0.117
error rate on val set: 0.1095
---layer size: [3, 3, 2]
Epoch 10, Training Error Rate: 0.111
Epoch 20, Training Error Rate: 0.1105
Epoch 30, Training Error Rate: 0.1125
Epoch 40, Training Error Rate: 0.116
Epoch 50, Training Error Rate: 0.1125
Epoch 60, Training Error Rate: 0.118
Epoch 70, Training Error Rate: 0.1125
Epoch 80, Training Error Rate: 0.109
Epoch 90, Training Error Rate: 0.113
Epoch 100, Training Error Rate: 0.113
error rate on val set: 0.107
---layer size: [5, 5, 2]
Epoch 10, Training Error Rate: 0.2055
Epoch 20, Training Error Rate: 0.107
Epoch 30, Training Error Rate: 0.1065
Ep

<div style="color: green; font-weight: bold">Comment</div>
The solutions are essentially the same.

We observe that, with n_epochs = 100, batch_size = 200 and learning_rate = 0.001 held fixed, the error rate on the test set decreases as the number of neurons in the hidden layers increases. We also noticed that with this set of hyperparameters, for the NN architecture [2,2,n_classes], training sometimes gets stuck and the error rate stays at 0.5.

