# ML Lab Exercise --- Neural Networks
This notebook exercise has been developed by Simon Pauw and Rein van den Boomgaard

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.datasets import load_digits



## Introduction

In this notebook, you are going to implement a feedforward neural network with an arbitrary number of layers (be gentle on your laptop...), and with an arbitrary number of nodes in each layer. We will use the modern way of deriving the equations but the classical way of implementation... (i.e. we are not going to use a machine learning framework like Tensorflow or PyTorch).

## Implementation

First, implement the activation function ``sigm`` and its derivative ``sigm_prime``.

In [None]:
def sigm(v):
    # Sigmoid activation function; used as the default activation `g` of NN.
    # v: scalar or NumPy array. The plotting cell calls sigm(x) with a 1-D array,
    #    so the implementation has to work elementwise on arrays.
    # return the sigmoid function value
    #. Your solution here ...

def sigm_prime(v):
    # Derivative of the sigmoid; used as the default `g_prime` of NN.
    # v: scalar or NumPy array. The plotting cell calls sigm_prime(x) on the same
    #    x values as sigm(x), i.e. the argument is the point at which the
    #    derivative is evaluated, and the result must be elementwise.
    # return the derivative of the sigmoid function
    #. Your solution here ...

In [None]:
# plot the activation function and its derivative on [-5, 5)
x = np.arange(-5,5,0.1)
plt.plot(x, sigm(x), label='sigm(x)')
plt.plot(x, sigm_prime(x), label='sigm_prime(x)')
# label the curves so the figure can be read without looking at the code
plt.xlabel('x')
plt.legend();

The `NN` class here below is not yet fully implemented. Your goal is to write all missing code. **Follow the steps outlined below the class.**

In [None]:
class NN(object):
    """Feedforward neural network with an arbitrary number of fully connected layers.

    Instance attributes:
        layerSizes      -- number of nodes per layer, including the input 'layer'
        numberOfLayers  -- len(layerSizes)
        W, b            -- one weight matrix of shape (s_out, s_in) and one bias
                           vector of shape (s_out,) per pair of consecutive layers
        g, g_prime      -- activation function and its derivative
        A, Z            -- intermediate results stored by predict(..., save=True)
        D               -- list filled by backprop (one slot per layer)

    The spots marked '#. Your solution here ...' are the parts of the exercise
    that still have to be written; until then they raise NotImplementedError so
    that the class can be defined and each step can be tested on its own.
    """

    def __init__(self, file=None, layerSizes=[2,2,1],
                 activation_functions=(sigm, sigm_prime),
                 seed=None):
        """Create a network, either from a saved file or with fresh weights.

        file                 -- if not None, W and b are loaded with loadNetwork
                                and layerSizes / seed are ignored
        layerSizes           -- nodes per layer (input layer first)
        activation_functions -- pair (g, g_prime)
        seed                 -- seed for the weight initialisation; None gives a
                                different initialisation on every run, and
                                seed == 0 is special-cased to all-zero weights
                                and biases (no random draw at all)
        """
        if file is not None:
            self.loadNetwork(file)
        else:
            self.layerSizes = layerSizes
            self.numberOfLayers = len(layerSizes)
            self.W = []
            self.b = []
            # default_rng(None) seeds itself from OS entropy, so one call
            # covers both the seeded and the unseeded case
            rng = np.random.default_rng(seed)

            for s_in, s_out in zip(layerSizes[:-1], layerSizes[1:]):
                if seed == 0:
                    W = np.zeros((s_out, s_in))
                    b = np.zeros(s_out)
                else:
                    # spread of the initial values shrinks with fan-in + fan-out
                    sigma = 2 * np.sqrt(6/(s_in+s_out))
                    W = sigma * rng.normal(size=(s_out, s_in))
                    b = sigma * rng.normal(size=s_out)

                self.W.append(W)
                self.b.append(b)

        self.g, self.g_prime = activation_functions

    def getLayerSizes(self):
        """get the sizes of layers (including the input 'layer')"""
        ls = [w.shape[1] for w in self.W]
        ls.append(self.W[-1].shape[0])
        return ls

    def predict(self, X, save=False):
        """Propagate X forward through all layers and return the last layer's output.

        X    -- input data; the test cells of this notebook pass one example per
                row, i.e. an array of shape (m, layerSizes[0])
        save -- if True, keep the intermediate results for backprop:
                self.A[0] is the input, self.A[i+1] the A computed in step i and
                self.Z[i] the Z computed in step i
        """
        A = X
        if save:
            # create empty lists for intermediate results
            self.A = self.numberOfLayers * [0]
            self.Z = self.numberOfLayers * [0]
            # store input values
            self.A[0] = A

        # now calculate output of all layers (all in variable A) and return last one
        for i in range(self.numberOfLayers - 1):
            # calculate Z and A
            #. Your solution here ...
            raise NotImplementedError("NN.predict: forward step not implemented yet")  # remove when done
            if save:
                # store intermediate results (for backprop)
                self.Z[i] = Z
                self.A[i+1] = A
        return A

    def backprop(self, Y):
        """Fill self.D with the derivatives needed by grad_descent.

        Must be called after predict(X, save=True) for the matching X.

        Y -- target values, one row per example (len(Y) is the number of examples)
        """
        L = self.numberOfLayers
        m = len(Y)
        self.D = L * [0]

        # Compute loss of final layer and assign to self.D[L-1]
        #. Your solution here ...
        raise NotImplementedError("NN.backprop: final-layer step not implemented yet")  # remove when done

        # iterate backwards through layers
        for i in range(L-2,0,-1):
            # Calculate the derivatives of the loss
            # with respect to output of layers
            # and assign to self.D[i]
            #. Your solution here ...
            raise NotImplementedError("NN.backprop: hidden-layer step not implemented yet")  # remove when done

    def grad_descent(self, alpha):
        """Take one gradient-descent step on every W[i] and b[i].

        alpha -- learning rate
        """
        L = self.numberOfLayers
        for i in range(0,L-1):
            # Update the values of self.b[i] and self.W[i]
            #. Your solution here ...
            raise NotImplementedError("NN.grad_descent: update step not implemented yet")  # remove when done

    def fit(self, X, Y, niter, alpha, verbose=False, testset=None):
        """Train the network with niter steps of batch gradient descent.

        X, Y    -- learning set (inputs and targets, one example per row)
        niter   -- number of iterations
        alpha   -- learning rate passed to grad_descent
        verbose -- if True, record the cost 1/(2m) * sum((A-Y)**2) per iteration
                   and print it about ten times during the run
        testset -- optional pair (X_test, Y_test); only used when verbose is True

        Returns (cost_learn, cost_test). Both are None when verbose is False;
        cost_test is also None when no testset was given.
        """
        track_test = verbose and testset is not None
        cost_learn = np.zeros((niter)) if verbose else None
        cost_test = np.zeros((niter)) if track_test else None
        # niter // 10 is 0 for niter < 10, which would make the modulo below fail
        report_every = max(1, niter // 10)

        for i in range(niter):
            A = self.predict(X, save=True)
            self.backprop(Y)
            self.grad_descent(alpha)
            if verbose:
                # A was computed before this iteration's update of W and b
                cost_learn[i] = 1 / 2 / len(Y) * np.sum((A-Y)**2)
                if track_test:
                    A_test = self.predict(testset[0], save=False)
                    cost_test[i] = 1 / 2/ len(testset[1]) * np.sum((A_test-testset[1])**2)
                if i % report_every == 0:
                    if track_test:
                        print(f"iteration {i}, cost on learning set = {cost_learn[i]:10.4f}, cost on test set {cost_test[i]:10.4f}")
                    else:
                        print(f"iteration {i}, cost on learning set = {cost_learn[i]:10.4f}")

        return cost_learn, cost_test

    def loadNetwork(self, file):
        """Load W and b from a file written by saveNetwork and derive the layer sizes."""
        # NOTE: pickle can execute arbitrary code while loading -- only open trusted files
        with open(file, 'rb') as fh:
            (self.W, self.b) = pickle.load(fh)
        self.layerSizes = self.getLayerSizes()
        # predict and backprop rely on numberOfLayers, so it has to be set here too
        self.numberOfLayers = len(self.layerSizes)

    def saveNetwork(self, file):
        """Write the pair (W, b) to file with pickle."""
        with open(file, 'wb') as fh:
            pickle.dump((self.W, self.b), fh)

    def __str__(self):
        """Human-readable dump of the layer sizes and of every W and b."""
        s = ""
        s += "Neural Network\n"
        s += "=" * 70 + "\n"
        s += "nodes in layers: " + str(self.layerSizes) + "\n"
        for l, (W, b) in enumerate(zip(self.W, self.b), start=1):
            s += "Layer " + str(l) + " to " + str(l+1) + "\n"
            s += "\t W = " + str(W) + "\n"
            s += "\t b = " + str(b) + "\n"
        s += "=" * 70 + "\n"
        return s

### Step 1: prediction

Finish the implementation of the `predict` method. You can use the code below to test if it works correctly.

In [None]:
# manually create network
nn1 = NN(layerSizes=(2,2,1))

# overwrite the random initialisation with hand-picked weights;
# each W has shape (nodes out, nodes in), each b has one entry per output node
W1 = np.array([[20.0, 20.0], [-20.0, -20.0]])
W2 = np.array([[20.0, 20.0]])
b1 = np.array([-10,30])
b2 = np.array([-30])

nn1.W = [W1, W2]
nn1.b = [b1, b2]

# create test data
X = np.array([[0,0], [0,1], [1,0], [1,1]]).astype(float)

# predict output
Y_output = nn1.predict(X)
print(Y_output)

# With the sigmoid activation these weights make the network an (almost exact)
# XOR gate: the outputs differ from 0 / 1 by less than 1e-4.
np.testing.assert_allclose(Y_output, [[0], [1], [1], [0]], atol=1e-3)

### Step 2: backprop

Finish the implementation of the method `backprop`. You can use the code below to test if it works correctly.

In [None]:
# create network (with fixed seed ---needed for automatic grading---)
nn2 = NN(layerSizes=(2,2,1), seed=42)

print(nn2)

# create test data: the four XOR input rows and their targets, one example per row
X = np.array([[0,0], [0,1], [1,0], [1,1]]).astype(float)
Y = np.array([[0], [1], [1], [0]])

# save the results of the forward propagation
# (backprop needs the intermediate values that predict stores when save=True)
nn2.predict(X, save = True)

# run back propagation to compute the derivatives
# of the (intermediary) activations
nn2.backprop(Y)

# nn2.D has one slot per layer; slot 0 (the input layer) is never filled by
# backprop, so only the entries for layer 2 and layer 3 are inspected
D_layer2 = nn2.D[1]  # these values will be tested
D_layer3 = nn2.D[2]  # these values will be tested

# print the losses
print(D_layer2)
print(D_layer3)

### Step 3: descent

Finish the implementation of the method `grad_descent`. You can use the code below to test if it works correctly. There is no correct answer defined anymore. It's up to you to decide if it works well enough.

In [None]:
# create data
X = np.array([[0,0], [0,1], [1,0], [1,1]]).astype(float)
Y = np.array([[0], [1], [1], [0]]).astype(float)

# define network
# np.random.seed only seeds NumPy's legacy global generator. NN draws its initial
# weights from its own np.random.default_rng, so the seed has to be handed to NN
# to get the same initialisation on every run. The legacy seed call is kept because
# the noisy-XOR cell further down still draws its data with np.random.randn.
# If this particular initialisation does not converge, try another seed value.
np.random.seed(73478)
nn3 = NN(layerSizes=(2,2,1), activation_functions=(sigm, sigm_prime), seed=73478)

# show initial state
print("Network before learning:")
print(nn3)
print("Prediction before learning")
# predictions are shown in thousandths to keep the printout compact
print((1000 * nn3.predict(X)).astype(int))
print()

# train network: 10000 iterations with learning rate 10
nn3.fit(X,Y,10000, 10)

# results
print("Network after learning:")
print(nn3)
print("Prediction after learning")
print((1000 * nn3.predict(X)).astype(int))

## Testing on Noisy XOR Problem

First we generate the noisy 'XOR' data, run the neural net classifier on this problem, and then visualize the results. In the cell below you only need to check if your implementation works correctly.

In [None]:
# Build the noisy 'XOR' data set from four Gaussian blobs of m//2 points each.
# The points are drawn from NumPy's legacy global generator (np.random.randn), so the
# data are only reproducible if np.random.seed was called earlier in the session.
m = 100
# class 0: blobs around (9, 9) and (1, 1), standard deviation 1.7
X0 = 1.7 * np.random.randn(m//2, 2) + (9, 9)
y0 = np.zeros((m//2))
X1 = 1.7 * np.random.randn(m//2, 2) + (1, 1)
y1 = np.zeros((m//2))
X = np.vstack((X0, X1))
y = np.hstack((y0, y1))
# class 1: blobs around (1, 9) and (9, 1), standard deviation 1.5
# (X0, X1, y0 and y1 are reused for the second pair of blobs)
X0 = 1.5 * np.random.randn(m//2, 2) + (1, 9)
y0 = np.ones((m//2))
X1 = 1.5 * np.random.randn(m//2, 2) + (9, 1)
y1 = np.ones((m//2))
# final data set: X has 2*m rows and 2 columns, y has 2*m labels
X = np.vstack((X,X0,X1))
y = np.hstack((y,y0,y1))
plt.scatter(X[:,0], X[:,1], c=y, edgecolors='k', cmap=plt.cm.Paired);

In [None]:
# the network expects one target row per example, so turn the label vector into a single column
Y = y.reshape(-1, 1)

In [None]:
# 2 inputs, one hidden layer with 8 nodes, 1 output node.
# No seed is passed, so the initial weights (and therefore the result) differ per run.
nnXORnoisy = NN(layerSizes=(2,8,1))
# 10000 iterations of gradient descent with learning rate 10
nnXORnoisy.fit(X, Y, 10000, 10)
print(nnXORnoisy)

In [None]:
# bounding box of the data, padded by 0.5 on every side
xmin, ymin = X.min(axis=0) - 0.5
xmax, ymax = X.max(axis=0) + 0.5
# regular grid (step 0.1) covering the bounding box
mx, my = np.meshgrid(np.arange(xmin, xmax, 0.1), np.arange(ymin, ymax, 0.1))
# classify every grid point and threshold the network output at 0.5
Z = nnXORnoisy.predict(np.column_stack((mx.ravel(), my.ravel())))
Z = (Z > 0.5).astype(int).reshape(mx.shape)
# predicted class as background colour, the data points drawn on top
plt.pcolormesh(mx, my, Z, cmap=plt.cm.Paired, shading='auto');
plt.scatter(X[:,0], X[:,1], c=y, edgecolors='k', cmap=plt.cm.Paired);

If all went well, you should see that the classification only went wrong for one point.

## The MNIST Data Set

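The MNIST data set consists of small images of handwritten digits and is considered the "hello world!" for classification programs in machine learning. (The `load_digits` function of scikit-learn that is used below loads a small data set of 8×8 pixel digit images in the spirit of MNIST, not the original 28×28 pixel MNIST images.)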

In [None]:
# load the digits data set that ships with scikit-learn and print its description;
# the cells below use its .data, .target and .images attributes
dataset_mnist = load_digits()
print(dataset_mnist.DESCR)

It is important to normalize the dataset before using it for learning and testing. For the learning set, we do the normalization by subtracting the mean of a feature and dividing by the standard deviation of that feature (like the z-score). **Note that we have to use the mean and standard deviation of the learning set to normalize the test set as well!**

In [None]:
# feature matrix (one example per row) as floats, and the digit labels
X_mnist = dataset_mnist.data.astype(float)
y_mnist = dataset_mnist.target
m_mnist = len(y_mnist)
# two thirds of the examples are used for learning, the rest for testing
m_mnist_learn = 2 * m_mnist // 3
m_mnist_test = m_mnist - m_mnist_learn

# First we divide the data into learning and test set
idx = np.arange(m_mnist)
rng = np.random.default_rng(seed=487)  # needed to have everybody work with the same learning and test set
rng.shuffle(idx)
# fancy indexing with the shuffled indices creates new arrays (copies of the rows)
X_mnist_learn = X_mnist[idx[:m_mnist_learn]]
y_mnist_learn = y_mnist[idx[:m_mnist_learn]]
X_mnist_test = X_mnist[idx[m_mnist_learn:]]
y_mnist_test = y_mnist[idx[m_mnist_learn:]]

# Normalize both the learning set and the test set
# do the normalization 'in place'
# NOTE(review): a feature that is constant over the learning set has standard
# deviation 0 -- presumably this needs a guard against division by zero; verify.
#. Your solution here ...

The target vector has values from 0 to 9 indicating the digits. We could use this as the target value for the neural network as well, but then we are using a neural network as a regression system. For classification, it is far better to have as many nodes in the final layer as there are classes (10). The first value in the last layer then indicates whether the example presented at the input belongs to class '0', etc. A value of 1 then indicates certainty that it is indeed a '0'. A value of 0 indicates it certainly is not a '0'. In a sense, the output of node i (numbering starting at 0) then is proportional to $P(Y=i|\bf{X}=\bf{x})$, the a posteriori probability that the image characterized by data vector $\bf x$ belongs to class $i$.

The target vector $\bf y$ is therefore changed into a target matrix $Y$, where each example in the dataset has as its target value a vector of 10 elements that are all zero except for the $i$-th element, which is 1 if that example belongs to class $i$. A simple example:
$$\bf y = \begin{bmatrix}0\\3\\5\\2\\7\end{bmatrix}$$
leads to
$$Y = \begin{bmatrix}
1& 0& 0& 0& 0& 0& 0& 0& 0& 0\\
0&0&0&1&0&0&0&0&0&0\\
0&0&0&0&0&1&0&0&0&0\\
0&0&1&0&0&0&0&0&0&0\\
0&0&0&0&0&0&0&1&0&0\end{bmatrix}$$
Such an encoding of a target vector is called **one-hot encoding.** In the cell below, complete the `one_hot_encoding` function. (Note that this can be done without writing a loop in Python.)

In [None]:
def one_hot_encoding(y):
    # Turn a vector of class labels into a one-hot target matrix.
    # y: 1-D array of integer labels; the notebook passes the digit targets.
    # Expected result (per the text above): one row per label with 10 columns,
    # all zeros except a 1 in the column given by the label.
    #. Your solution here ...

In [None]:
# one-hot encode the labels of the learning set and of the test set (in that order)
Y_mnist_learn, Y_mnist_test = map(one_hot_encoding, (y_mnist_learn, y_mnist_test))

In [None]:
i = 500  # index of the example to show; pick any number in the data set
# draw the image first, then switch the colormap to gray
plt.imshow(dataset_mnist.images[i], interpolation='nearest')
plt.gray()
# put the target digit in a yellow box in the lower-left corner
plt.text(0, 7, str(dataset_mnist.target[i]), bbox={'facecolor': 'yellow', 'alpha': 0.9})
plt.axis('off')
# the raw pixel values of the same image
print(dataset_mnist.images[i])

To show a somewhat nicer image, change the interpolation flag from 'nearest' into 'gaussian'. Interpolation (aka estimating values in between the pixels) is a subject in the course on image processing and computer vision (ICV).

In [None]:
print(X_mnist_learn.shape)
print(Y_mnist_learn.shape)
nn = NN(layerSizes=(64, 10), seed=77) # this is (almost) just 10 logistic regression units in parallel
# verbose=True makes fit record (and regularly print) the cost on both sets
cost_mnist_learn, cost_mnist_test = nn.fit(X_mnist_learn, Y_mnist_learn,
                                           5000, 10, verbose=True,
                                           testset=(X_mnist_test, Y_mnist_test))
plt.plot(cost_mnist_learn, label='learning cost')
plt.plot(cost_mnist_test, label='test cost')
plt.xlabel('iteration')
plt.ylabel('cost')
plt.ylim(0,1)
plt.legend();

In [None]:
# same curves as in the previous cell, zoomed in on costs below 0.1
plt.plot(cost_mnist_learn, label='learning cost')
plt.plot(cost_mnist_test, label='test cost')
plt.xlabel('iteration')
plt.ylabel('cost')
plt.ylim(0, 0.1)
plt.legend();

In [None]:
# predicted digit = index of the output node with the largest value
yp = nn.predict(X_mnist_test).argmax(axis=1)
# percentage of test examples whose predicted digit equals the true label
accuracy = 100 * np.mean(yp == y_mnist_test)
print(f"accuracy = {accuracy:5.2f}")