# Computational Cognitive Neuroscience Practical Assignment 2
## Training an MLP on MNIST
### Tommy Clausner (s4836219) and Steven Smits (s4237263)

In [None]:
from __future__ import print_function
import numpy as np
import chainer
from chainer.functions.evaluation import accuracy
from chainer.functions.loss import softmax_cross_entropy
from chainer import link
from chainer import reporter
from chainer import optimizers
import chainer.functions as F
import chainer.links as L
from chainer.datasets import TupleDataset
import matplotlib.pyplot as plt

## Before the sub assignments, we first define all given functions to be used.
Since this week n_train, batchsize, epochs etc were NOT specified, we take full freedom in changing these parameters (in the convolutional network). 

In [None]:
def get_mnist(n_train=500, n_test=100, n_dim=1, with_label=True, classes = None):
    """Load a class-balanced subset of MNIST through chainer.

    For every requested class the first ``n_train`` (resp. ``n_test``)
    examples found in the training (resp. test) split are kept, grouped
    class by class in the order given by ``classes``.

    :param n_train: nr of training examples per class
    :param n_test: nr of test examples per class
    :param n_dim: 1 or 3 (for convolutional input); forwarded to chainer as ``ndim``
    :param with_label: whether or not to also provide labels
    :param classes: if not None, then it selects only those classes, e.g. [0, 1].
        May be a list or a numpy array; None or an empty sequence selects all ten digits.
    :return: ``(train_data, test_data)``. With ``with_label=True`` both are
        ``TupleDataset(images, labels)`` where the int32 labels are the
        *position* of the class inside ``classes`` (0 .. len(classes)-1), not
        the original digit. With ``with_label=False`` both are plain image arrays.
    """

    # Labels are always needed to select per class, so load the labelled
    # version once and drop the labels afterwards if they were not requested.
    train_full, test_full = chainer.datasets.get_mnist(ndim=n_dim, withlabel=True)

    # `not classes` would raise for a multi-element numpy array, so test explicitly.
    if classes is None or len(classes) == 0:
        classes = np.arange(10)

    def _subset(dataset, n_per_class):
        # Select up to n_per_class examples of each requested class.
        images, labels = dataset._datasets
        picked = [np.where(labels == c)[0][:n_per_class] for c in classes]
        idx = np.concatenate(picked)

        if not with_label:
            return images[idx]

        # Build the labels from the number of examples actually found, so data
        # and labels stay aligned even if a class has fewer than n_per_class.
        new_labels = np.concatenate(
            [np.full(len(p), i) for i, p in enumerate(picked)]).astype('int32')
        return TupleDataset(images[idx], new_labels)

    return _subset(train_full, n_train), _subset(test_full, n_test)

class RandomIterator(object):
    """
    Generates random subsets of data

    Each pass over the iterator draws a fresh random permutation of the
    sample indices and yields ``n_batches`` batches of ``batch_size`` samples.
    Samples that do not fill a complete batch are dropped for that pass.
    """

    def __init__(self, data, batch_size=1):
        """

        Args:
            data (TupleDataset or numpy.ndarray): labeled or unlabeled data;
                must support ``len()`` and indexing with an index array.
            batch_size (int): number of samples per batch.
        """

        self.data = data

        self.batch_size = batch_size
        self.n_batches = len(self.data) // batch_size

    def __iter__(self):
        # Restart the pass: reset the batch counter and reshuffle, keeping only
        # as many indices as fit into complete batches.
        self.idx = -1
        self._order = np.random.permutation(len(self.data))[:(self.n_batches * self.batch_size)]

        return self

    def __next__(self):
        """Return the next batch.

        Returns:
            The indexed array for ``numpy.ndarray`` data, otherwise a list
            built from the result of indexing the dataset.

        Raises:
            StopIteration: when all batches of the current pass were produced.
        """

        self.idx += 1

        # `>=` keeps raising if the exhausted iterator is advanced again.
        if self.idx >= self.n_batches:
            raise StopIteration

        i = self.idx * self.batch_size
        batch_indices = self._order[i:(i + self.batch_size)]

        # handles unlabeled and labeled data
        if isinstance(self.data, np.ndarray):
            return self.data[batch_indices]
        else:
            return list(self.data[batch_indices])

    # Python 2 looks up `next`, Python 3 looks up `__next__`; provide both.
    next = __next__

class Classifier(link.Chain):

    """Chain that wraps a predictor network and scores its output.

    Calling the classifier runs the predictor on the features, evaluates the
    loss function (and optionally the accuracy function) against the labels,
    reports both through ``reporter`` and returns the loss.

    Args:
        predictor (~chainer.Link): Predictor network.
        lossfun (function): Loss function, called as ``lossfun(y, t)``.
        accfun (function): Accuracy function, called as ``accfun(y, t)``.

    Attributes:
        predictor (~chainer.Link): Predictor network.
        lossfun (function): Loss function.
        accfun (function): Function that computes accuracy.
        y (~chainer.Variable): Prediction for the last minibatch.
        loss (~chainer.Variable): Loss value for the last minibatch.
        accuracy (~chainer.Variable): Accuracy for the last minibatch.
        compute_accuracy (bool): When ``True`` (the default) the accuracy is
            computed and reported on every forward pass.
    """

    compute_accuracy = True

    def __init__(self, predictor,
                 lossfun=softmax_cross_entropy.softmax_cross_entropy,
                 accfun=accuracy.accuracy):
        super(Classifier, self).__init__()
        self.lossfun, self.accfun = lossfun, accfun
        # Results of the most recent forward pass; empty until first call.
        self.y = self.loss = self.accuracy = None

        # The predictor is registered as a child link of this chain.
        with self.init_scope():
            self.predictor = predictor

    def __call__(self, *args):
        """Run the predictor and compute the loss for one minibatch.

        Args:
            args (list of ~chainer.Variable): Input minibatch. Every element
                except the last is passed to the predictor as a feature; the
                last element holds the ground-truth labels.

        Returns:
            ~chainer.Variable: Loss value.
        """

        assert len(args) >= 2
        features, labels = args[:-1], args[-1]

        # Drop the previous results first so stale values never survive a
        # forward pass that fails part-way through.
        self.y = self.loss = self.accuracy = None

        self.y = self.predictor(*features)
        self.loss = self.lossfun(self.y, labels)
        reporter.report({'loss': self.loss}, self)

        if not self.compute_accuracy:
            return self.loss

        self.accuracy = self.accfun(self.y, labels)
        reporter.report({'accuracy': self.accuracy}, self)
        return self.loss

###### Train a deep neural network for 20 epochs consisting of N fully connected layers and 10 units per layer. Compare the performance on training and validation data using networks consisting of N=1, 2 and 3 layers (note: a two-layer model would be equivalent to an MLP). Visualize and interpret the results. Report your conclusions.

The perceptron (N=1) seems to do best of all three networks. The N=2 and N=3 networks perform similarly to each other, even after several attempts. Thus, adding more layers to such a simple model doesn't provide a benefit in the current context. 

![figure1](http://preview.ibb.co/muYTdw/figure_1.png)
![figure2](http://preview.ibb.co/bHPqrG/figure_2.png)
![figure3](http://preview.ibb.co/b3VVrG/figure_3.png)


###### Create a network consisting of a convolutional layer, a max pooling layer and one fully connected layer. For the convolutional layers, use 5 output channels, a kernel size of 5, stride of 1 and padding of 0. Again plot the loss. Report your conclusions. 

The convolutional network learns quicker and converges at a higher accuracy than the perceptron and "deep" networks. It is also a lot slower to run than the other networks, which raises the question of whether there is any benefit in using a convolutional layer for data as simple as MNIST. 

![figure4](http://preview.ibb.co/gqmQQb/figure_4.png)

###### Read the Chainer documentation. Add additional components to your model (e.g. one of dropout, batch normalization, other activation functions, etc.). Report if your new architecture outperforms the original convnet architecture. Provide a plot and a written explanation of your observed (better/worse) results.

Dropout was added to the training algorithm. This augmentation is used to reduce overfitting to the data. It works by preventing complex co-adaptations on the training data.  
In reality, adding the dropout results in a slight accuracy increase. 

Additionally, weight decay was added to the training algorithm. This regularization penalizes large weights and is intended to reduce overfitting.

![figure5](http://preview.ibb.co/bVjZJw/figure_5.png)
![figure6](http://preview.ibb.co/fPOGWG/figure_6.png)
![figure7](http://preview.ibb.co/iywEJw/figure_7.png)

In the next cell, we provide the code. First, the MLP (0, 1 or 2 hidden layers), the convolutional networks (dropout rates .0, .2 and .5) and an additional convolutional network (dropout rate .5 plus weight decay of .0005) are defined; then the function that trains and evaluates them is executed.

In [None]:
class MLP(chainer.Chain):
    """Fully connected network with a configurable number of layers.

    Args:
        n_units: number of units in each hidden layer.
        n_out: number of output units (10 for the ten MNIST digit classes).
        numl: total number of fully connected layers:
            1 (or less) -> output layer only (perceptron),
            2 -> one ReLU hidden layer + output layer,
            3 (or more) -> two ReLU hidden layers + output layer.

    The output is the raw activation of the last layer (no softmax); the loss
    function is expected to do the normalisation.
    """

    def __init__(self, n_units, n_out,numl):
        super(MLP, self).__init__()
        with self.init_scope():
            # Input size is None so chainer infers it from the first batch.
            self.l1 = L.Linear(None, n_units)  # Input to layer 1
            self.l2 = L.Linear(None, n_units)  # Input to layer 2
            # The output layer must have n_out units, not n_units.
            self.lo = L.Linear(None, n_out)  # Layer out
        self.numlayers=numl

    def __call__(self, x):
        """Forward pass through the first ``numlayers - 1`` hidden layers and the output layer."""
        h = x
        if self.numlayers > 1:
            h = F.relu(self.l1(h))
        if self.numlayers > 2:
            h = F.relu(self.l2(h))
        return self.lo(h)

class MLPConv(chainer.Chain):
    """Small convolutional network: convolution -> ReLU -> max pooling -> linear.

    The convolution takes 1 input channel and produces 5 output channels with
    a 5x5 kernel, stride 1 and no padding. The linear layer maps the pooled
    feature maps to 10 outputs.
    """

    def __init__(self):
        super(MLPConv, self).__init__()
        with self.init_scope():
            self.l1 = L.Convolution2D(in_channels=1, out_channels=5, ksize=5, stride=1, pad=0)
            # Input size is None so chainer infers it from the first batch.
            self.l2 = L.Linear(None, 10)

    def __call__(self, x):
        """Return the raw (pre-softmax) class scores for a batch of images."""
        x2 = F.relu(self.l1(x))
        x3 = F.max_pooling_2d(x2, 2, 2)
        # Return logits: softmax_cross_entropy normalises internally, so an
        # extra softmax here would be applied twice and flatten the gradients.
        y = self.l2(x3)
        return y

# Settings of the convolutional conditions, keyed by condition index:
# (input dropout ratio, weight decay rate, plot title).
_CONV_CONDITIONS = {
    4: (0.0, 0.0, 'CCNS - Assignment 2: Conv Layer MLP without dropout - accuracy and loss'),
    5: (0.2, 0.0, 'CCNS - Assignment 2: Conv Layer MLP + dropout rate .2 - accuracy and loss'),
    6: (0.5, 0.0, 'CCNS - Assignment 2: Conv Layer MLP + dropout rate .5 - accuracy and loss'),
    7: (0.5, 0.0005, 'CCNS - Assignment 2: Conv Layer MLP + dropout rate .5 + weight decay (.0005)- accuracy and loss'),
}


def _make_batch(dataset, indices, as_image):
    """Return (input, target) Variables for the samples at ``indices``."""
    x, t = dataset[indices]
    x = np.asarray(x)
    if as_image:
        # The convolutional layer needs (batch, channel, height, width) input.
        x = x.reshape(x.shape[0], 1, 28, 28)
    return chainer.Variable(x), chainer.Variable(np.asarray(t))


def _run_epoch(classifier_model, dataset, batchsize, as_image, optimizer=None, dropout_ratio=0.0):
    """Run one pass over ``dataset`` and return (mean loss, mean accuracy).

    With an optimizer the pass trains the model on shuffled batches (applying
    input dropout if requested); without one it only evaluates.
    """
    n_samples = len(dataset)
    training = optimizer is not None
    # Shuffle for training only; the order is irrelevant for evaluation.
    order = np.random.permutation(n_samples) if training else np.arange(n_samples)

    sum_loss = 0.0
    sum_accuracy = 0.0
    for start in range(0, n_samples, batchsize):
        x, t = _make_batch(dataset, order[start:start + batchsize], as_image)
        if training:
            if dropout_ratio > 0:
                x = F.dropout(x, ratio=dropout_ratio)
            # update() runs the forward pass, backprop and the parameter step.
            optimizer.update(classifier_model, x, t)
        else:
            # No gradients are needed for evaluation.
            with chainer.no_backprop_mode():
                classifier_model(x, t)

        # Weight by the batch length so a smaller last batch counts proportionally.
        n_batch = len(t.data)
        sum_loss += float(classifier_model.loss.data) * n_batch
        sum_accuracy += float(classifier_model.accuracy.data) * n_batch

    return sum_loss / n_samples, sum_accuracy / n_samples


def _plot_history(title, n_epoch, xticks, acc_test, acc_train, loss_test, loss_train):
    """Plot accuracy (left axis, red) and loss (right axis, blue) per epoch."""
    epochs = list(range(1, n_epoch + 1))

    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()

    # Dotted lines are the test set, solid lines the training set.
    ax1.plot(epochs, acc_test, 'r:', epochs, acc_train, 'r-')
    ax1.set_xlabel('Training Epoch')
    ax1.set_xticks(xticks)
    ax1.set_ylabel('Accuracy')
    ax1.tick_params(axis='y', colors='red')
    ax2.plot(epochs, loss_test, 'b:', epochs, loss_train, 'b-')
    ax2.set_ylabel('Loss')
    ax2.tick_params(axis='y', colors='blue')
    ax2.legend(['Test set loss', 'Training set loss'], loc=2)
    ax1.legend(['Test set accuracy', 'Training set accuracy'], loc=3)
    plt.title(title)
    plt.show()


def DoMLP():
    """Train, evaluate and plot all networks of the assignment. Takes no input.

    Seven conditions are run one after the other on the same MNIST subset:

    * 1-3: fully connected networks with 1, 2 and 3 layers (20 epochs),
    * 4:   convolutional network without dropout (50 epochs),
    * 5-6: convolutional network with input dropout of .2 and .5 (50 epochs),
    * 7:   convolutional network with dropout .5 and weight decay .0005 (50 epochs).

    For every epoch the mean loss and accuracy on the training and test data
    are printed, and after each condition they are plotted against the epoch.
    """
    # Stable factors specification
    batchsize = 32  # Training batchsize, blackboard specified 32
    unit = 10  # Hidden layer units, blackboard specified 10

    # The data are identical for every condition, so load them only once.
    train, test = get_mnist()

    for num_layers in range(1, 8):
        is_conv = num_layers > 3
        if is_conv:
            dropout_ratio, weight_decay, title = _CONV_CONDITIONS[num_layers]
            n_epoch = 50  # the convolutional network is trained for 50 epochs
            xticks = list(range(10, n_epoch + 1, 10))
            model = MLPConv()
        else:
            dropout_ratio, weight_decay = 0.0, 0.0
            title = 'CCNS - Assignment 2: N = '+str(num_layers)+ ' MLP - accuracy and loss'
            n_epoch = 20  # Training epochs, blackboard specified 20
            xticks = list(range(1, n_epoch + 1))
            model = MLP(unit, 10, num_layers)

        # Classifier that calculates the loss and accuracy of the model
        classifier_model = Classifier(model)

        # Setup an optimizer
        optimizer = optimizers.SGD()  # Using Stochastic Gradient Descent
        optimizer.setup(classifier_model)
        if weight_decay > 0:
            optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))

        # Per-epoch results for the plot
        acc_test, loss_test = [], []
        acc_train, loss_train = [], []

        for epoch in range(1, n_epoch + 1):  # start with epoch 1 (instead of 0)
            print('epoch ', epoch)

            # Training pass
            mean_loss, mean_acc = _run_epoch(classifier_model, train, batchsize, is_conv,
                                             optimizer=optimizer, dropout_ratio=dropout_ratio)
            print('mean loss =', mean_loss, ', Accuracy =', mean_acc)  # To check values during process.
            loss_train.append(mean_loss)
            acc_train.append(mean_acc)

            # Test pass
            mean_loss, mean_acc = _run_epoch(classifier_model, test, batchsize, is_conv)
            print('mean loss =', mean_loss, ', Accuracy =', mean_acc)  # To check values during process.
            loss_test.append(mean_loss)
            acc_test.append(mean_acc)

        # Plot the accuracy and loss at the end per epoch
        _plot_history(title, n_epoch, xticks, acc_test, acc_train, loss_test, loss_train)

# Run every condition defined in DoMLP (num_layers 1..7); each one ends with a plt.show() figure.
DoMLP()

###### Explain in which ways convolution is biologically plausible and biologically implausible.

The convolutional network may mimic the way the visual stream is organized throughout the cortex. To elucidate, the neural network has several receptive fields which pass on information to the next layer, which again consists of receptive fields. Similarly, the early visual cortex has receptive fields which are sensitive to very specific input (e.g. orientation, shape). This information is then passed on to higher receptive fields, which integrate it into another sensitivity (e.g. angle, object). Lastly, information about what a visual input actually is, is represented in the temporal lobe. Here, it has been shown that even single neurons may represent an entire object. Synaptic strengths are represented by the weights in the convolutional network. Thus, the convolutional network and the visual stream have some structure in common. However, it doesn't seem likely that higher receptive fields (such as in a ConvNet) are all of a sudden fully connected to every possible representation of an object in the brain. It is more likely that one receptive field is only connected to the target that represents the final object. 