<a href="https://colab.research.google.com/github/PiotrGrabysz/GSN/blob/master/Exercise_6_P3_Backpropagation_fully_vectorized.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import random
import numpy as np
from torchvision import datasets, transforms

In [3]:
# Let's read the mnist dataset

def load_mnist(path='.'):
    """Fetch MNIST through torchvision and hand it back as NumPy arrays.

    Args:
        path: Root directory passed to ``datasets.MNIST`` (the dataset is
            downloaded there when it is not already present).

    Returns:
        ``((x_train, y_train), (x_test, y_test))``. Each ``x`` holds the image
        data divided by 255; each ``y`` is the one-hot encoding of the labels
        with 10 columns.
    """
    splits = []
    # Training split first, then the test split -- same order as the return value.
    for is_train in (True, False):
        mnist = datasets.MNIST(path, train=is_train, download=True)
        pixels = mnist.data.numpy()
        labels = mnist.targets.numpy()

        # Rescale the raw pixel values by 255.
        pixels = pixels / 255.

        # One-hot encode: row i gets a single 1 in column labels[i].
        n_samples = labels.shape[0]
        one_hot = np.zeros((n_samples, 10))
        one_hot[np.arange(n_samples), labels] = 1

        splits.append((pixels, one_hot))

    train_split, test_split = splits
    return train_split, test_split

# Load both splits into notebook-level variables: images scaled by 1/255,
# labels one-hot encoded (see load_mnist). Uses the default path '.'.
(x_train, y_train), (x_test, y_test) = load_mnist()

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw
Processing...
Done!


In this exercise your task is to fill in the gaps in this code by implementing the backpropagation algorithm.
Once this is done, you can run the network on the MNIST example and see how it performs. Feel free to play with the parameters.

If you found this task too easy, try to implement a "fully vectorized" version, i.e. one using matrix operations instead of going over examples one by one.

In [32]:
def sigmoid(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, evaluated without overflow.

    The naive formula calls ``exp(-z)``, which overflows (and emits a
    RuntimeWarning) for large negative ``z``. Here ``exp`` is only ever
    applied to a non-positive argument.

    Args:
        z: A real scalar or NumPy array.

    Returns:
        The element-wise sigmoid of ``z``; a scalar for scalar input, an
        array of the same shape for array input.
    """
    # exp(-|z|) lies in (0, 1], so it can underflow to 0 but never overflow.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)).
    # Both are algebraically the same function.
    # The trailing [()] turns a 0-d result back into a scalar and leaves
    # proper arrays untouched.
    return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))[()]

def sigmoid_prime(z):
    """Derivative of the sigmoid at ``z``, computed as ``s * (1 - s)`` with ``s = sigmoid(z)``."""
    s = sigmoid(z)
    return s * (1 - s)

class Network(object):
    """Fully connected feed-forward network with sigmoid activations.

    Training is plain mini-batch gradient descent; gradients come from a
    vectorized backpropagation pass that handles a whole mini-batch at once.
    Inside the network, samples are stored column-wise: an input batch has
    shape ``(sizes[0], n_samples)``.
    """

    def __init__(self, sizes):
        """Create a network with ``sizes[i]`` units in layer ``i``.

        Biases and weights are drawn from a standard normal distribution.
        Weights are indexed by target node first, i.e. ``weights[l]`` has
        shape ``(sizes[l+1], sizes[l])``; ``biases[l]`` has shape
        ``(sizes[l+1], 1)`` so it broadcasts over the columns of a batch.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Return the output activations for input ``a``.

        ``a`` holds one sample per column, shape ``(sizes[0], n_samples)``;
        a single sample is the ``n_samples == 1`` case.
        """
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a

    def update_mini_batch(self, x_mini_batch, y_mini_batch, eta):
        """Apply one gradient-descent step computed on a mini-batch.

        Args:
            x_mini_batch: Inputs with one sample per leading index; each
                sample is flattened to ``sizes[0]`` values.
            y_mini_batch: Targets, one row of ``sizes[-1]`` values per sample.
            eta: Learning rate. The summed batch gradient is scaled by
                ``eta / len(x_mini_batch)``.
        """
        # Flatten every sample and transpose so that samples become columns.
        # The layer widths come from self.sizes, so any architecture works.
        x = np.asarray(x_mini_batch).reshape(-1, self.sizes[0]).T
        y = np.asarray(y_mini_batch).reshape(-1, self.sizes[-1]).T
        nabla_b, nabla_w = self.backprop(x, y)

        step = eta/len(x_mini_batch)
        self.weights = [w-step*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-step*nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return the cost gradients summed over a mini-batch.

        Args:
            x: Inputs, shape ``(sizes[0], n_samples)``.
            y: Targets, shape ``(sizes[-1], n_samples)``.

        Returns:
            ``(nabla_b, nabla_w)``: two lists with one array per layer,
            shaped like ``self.biases`` and ``self.weights`` respectively.
        """
        # Forward pass, remembering all values before (zs) and after
        # (activations) the activation function.
        zs = []
        activations = [x]
        for b, w in zip(self.biases, self.weights):
            zs.append(np.dot(w, activations[-1]) + b)
            activations.append(sigmoid(zs[-1]))

        nabla_b = [None] * len(self.biases)
        nabla_w = [None] * len(self.weights)

        # Backward pass, starting from the derivative of the cost with
        # respect to the output activations.
        dC_da = self.cost_derivative(activations[-1], y)
        for l in range(self.num_layers-2, -1, -1):
            # dC/dz for this layer, one column per sample; computed once
            # and reused for all three quantities below.
            delta = sigmoid_prime(zs[l]) * dC_da
            # The matrix product sums the per-sample outer products.
            nabla_w[l] = delta @ activations[l].T
            nabla_b[l] = np.sum(delta, axis=1, keepdims=True)
            if l > 0:
                # Propagate to the previous layer's activations. Skipped for
                # l == 0 because the gradient w.r.t. the input is never used.
                dC_da = np.dot(self.weights[l].T, delta)

        return nabla_b, nabla_w

    def evaluate(self, x_test_data, y_test_data):
        """Return the fraction of samples whose arg-max output matches the target.

        All samples are pushed through the network in one batched forward
        pass instead of one ``feedforward`` call per sample.
        """
        inputs = np.asarray(x_test_data).reshape(-1, self.sizes[0]).T
        targets = np.asarray(y_test_data).reshape(-1, self.sizes[-1])
        predicted = np.argmax(self.feedforward(inputs), axis=0)
        expected = np.argmax(targets, axis=1)
        # return accuracy
        return np.mean(predicted == expected)

    def cost_derivative(self, output_activations, y):
        """Return the difference between the output activations and the target ``y``."""
        return (output_activations-y)

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """Train with mini-batch gradient descent.

        Mini-batches are consecutive slices taken in the stored order of the
        data (no shuffling). When the number of samples is not a multiple of
        ``mini_batch_size``, the final, shorter batch is trained on as well.

        Args:
            training_data: Pair ``(x_train, y_train)`` of arrays.
            epochs: Number of passes over the training data.
            mini_batch_size: Number of samples per update; must be >= 1.
            eta: Learning rate.
            test_data: Optional pair ``(x_test, y_test)``; when given, the
                accuracy on it is printed after every epoch.

        Raises:
            ValueError: If ``mini_batch_size`` is smaller than 1.
        """
        if mini_batch_size < 1:
            raise ValueError("mini_batch_size must be a positive integer")
        x_train, y_train = training_data
        if test_data:
            x_test, y_test = test_data
        n_samples = x_train.shape[0]
        for j in range(epochs):
            # Stepping by start offset (rather than counting full batches)
            # keeps the trailing partial batch.
            for start in range(0, n_samples, mini_batch_size):
                stop = start + mini_batch_size
                self.update_mini_batch(x_train[start:stop], y_train[start:stop], eta)
            if test_data:
                print("Epoch: {0}, Accuracy: {1}".format(j, self.evaluate(x_test, y_test)))
            else:
                print("Epoch: {0}".format(j))


# 784 input units, one hidden layer of 30 units, 10 output units.
network = Network(sizes=[784, 30, 10])
# Train for 50 epochs with mini-batches of 100 samples and learning rate 3,
# printing the accuracy on the test split after every epoch.
network.SGD(
    training_data=(x_train, y_train),
    epochs=50,
    mini_batch_size=100,
    eta=3.,
    test_data=(x_test, y_test),
)



Epoch: 0, Accuracy: 0.7019
Epoch: 1, Accuracy: 0.7333
Epoch: 2, Accuracy: 0.7998
Epoch: 3, Accuracy: 0.8148
Epoch: 4, Accuracy: 0.8236
Epoch: 5, Accuracy: 0.8287
Epoch: 6, Accuracy: 0.8326
Epoch: 7, Accuracy: 0.8356
Epoch: 8, Accuracy: 0.9065
Epoch: 9, Accuracy: 0.9148
Epoch: 10, Accuracy: 0.9183
Epoch: 11, Accuracy: 0.9214
Epoch: 12, Accuracy: 0.9235
Epoch: 13, Accuracy: 0.9254
Epoch: 14, Accuracy: 0.9263
Epoch: 15, Accuracy: 0.9278
Epoch: 16, Accuracy: 0.9294
Epoch: 17, Accuracy: 0.9302
Epoch: 18, Accuracy: 0.9313
Epoch: 19, Accuracy: 0.9321
Epoch: 20, Accuracy: 0.9328
Epoch: 21, Accuracy: 0.9332
Epoch: 22, Accuracy: 0.9342
Epoch: 23, Accuracy: 0.9349
Epoch: 24, Accuracy: 0.9355
Epoch: 25, Accuracy: 0.9364
Epoch: 26, Accuracy: 0.9372
Epoch: 27, Accuracy: 0.9377
Epoch: 28, Accuracy: 0.9384
Epoch: 29, Accuracy: 0.9385
Epoch: 30, Accuracy: 0.9388
Epoch: 31, Accuracy: 0.939
Epoch: 32, Accuracy: 0.9394
Epoch: 33, Accuracy: 0.9389
Epoch: 34, Accuracy: 0.9388
Epoch: 35, Accuracy: 0.9392
Epo