# Exercise 3
## Introduction
We install PyTorch using the command `conda install pytorch`. Then we run the following cell.

In [15]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.optim

import torchvision
import torchvision.datasets as dset
import torchvision.transforms as transforms

from torch.nn.functional import conv2d, max_pool2d



mb_size = 100  # number of samples per mini-batch

# preprocessing: image -> tensor, then normalise with mean 0.5 and std 0.5
trans = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# MNIST training and test splits, stored in (and downloaded to) the working directory
dataset = dset.MNIST("./", train=True, download=True, transform=trans)
test_dataset = dset.MNIST("./", train=False, download=True, transform=trans)

# both loaders are configured identically, so the options are written only once
_loader_options = dict(batch_size=mb_size, shuffle=True,
                       num_workers=1, pin_memory=True)

dataloader = torch.utils.data.DataLoader(dataset, **_loader_options)
test_loader = torch.utils.data.DataLoader(test_dataset, **_loader_options)


def init_weights(shape):
    """
    Create a trainable weight tensor using Kaiming He initialization.

    Entries are drawn from a normal distribution with mean zero and
    standard deviation sqrt(2 / shape[0]), see
    https://arxiv.org/abs/1502.01852 (a good initialization is important).

    Parameters
    ----------
    shape : tuple of int
        Shape of the weight tensor; shape[0] is used as the fan-in.

    Returns
    -------
    A Tensor
        Randomly initialized weights with requires_grad set to True.
    """
    fan_in = shape[0]
    scale = np.sqrt(2. / fan_in)
    weights = torch.randn(size=shape) * scale
    # requires_grad_ flips the flag in place and hands back the same tensor
    return weights.requires_grad_(True)


def rectify(X):
    """
    Rectified linear unit: element-wise maximum of X and zero.

    Parameters
    ----------
    X : A Tensor

    Returns
    -------
    A Tensor
        Same shape as X, with every negative entry replaced by zero.
    """
    floor = torch.zeros_like(X)
    return torch.max(floor, X)


# this is an example as a reduced version of the pytorch internal RMSprop optimizer
class RMSprop(torch.optim.Optimizer):
    """
    Minimal RMSprop optimizer.

    Every parameter keeps a running average of its squared gradient; each
    update divides the gradient by the square root of that average.

    Parameters
    ----------
    params : iterable
        Tensors (or parameter-group dicts) to optimize.
    lr : float
        Learning rate.
    alpha : float
        Decay factor of the running average of the squared gradients.
    eps : float
        Constant added to the denominator for numerical stability.
    """

    def __init__(self, params, lr=1e-3, alpha=0.5, eps=1e-8):
        defaults = dict(lr=lr, alpha=alpha, eps=eps)
        super(RMSprop, self).__init__(params, defaults)

    def step(self, closure=None):
        """
        Perform a single optimization step.

        Parameters
        ----------
        closure : callable, optional
            Re-evaluates the model and returns the loss. Accepted for
            compatibility with the torch.optim.Optimizer.step interface.

        Returns
        -------
        The value returned by closure, or None if no closure was given.
        """
        loss = None
        if closure is not None:
            # the closure may call backward(), so gradients must be enabled
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            alpha = group['alpha']
            for p in group['params']:
                # parameters that took no part in the last backward pass
                # have no gradient; skip them instead of crashing
                if p.grad is None:
                    continue

                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['square_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']

                # update running averages
                square_avg.mul_(alpha).addcmul_(grad, grad, value=1-alpha)
                avg = square_avg.sqrt().add_(group['eps'])

                # gradient update
                p.data.addcdiv_(grad, avg, value=-group['lr'])

        return loss


def model(X, w_h, w_h2, w_o):
    """
    Fully connected network with two rectified hidden layers.

    Parameters
    ----------
    X : A Tensor
        Input batch, one flattened sample per row.
    w_h, w_h2 : Tensors
        Weights of the first and second hidden layer.
    w_o : A Tensor
        Weights of the output layer.

    Returns
    -------
    A Tensor
        The output scores before the softmax.
    """
    hidden = X
    for weights in (w_h, w_h2):
        hidden = rectify(hidden @ weights)
    return hidden @ w_o

In [16]:
w_h = init_weights((784, 625))
w_h2 = init_weights((625, 625))
w_o = init_weights((625, 10))

optimizer = RMSprop([w_h, w_h2, w_o])


# training loop: range(101) runs 101 epochs so that the losses, which are
# reported every 10th iteration, are printed for epochs 1, 11, ..., 101
for i in range(101):
    print("Epoch: {}".format(i+1))
    avg_train_loss = 0.
    for (j, (X, y)) in enumerate(dataloader):
        # flatten every image to a row vector; -1 infers the batch size so
        # a smaller final batch would not break the reshape
        noise_py_x = model(X.reshape(-1, 784), w_h, w_h2, w_o)
        optimizer.zero_grad()
        # the cross-entropy loss function already contains the softmax
        cost = torch.nn.functional.cross_entropy(noise_py_x, y, reduction="mean")
        # accumulate a plain Python float; adding the tensor itself would
        # keep autograd history attached to the running sum
        avg_train_loss += cost.item()
        cost.backward()
        optimizer.step()

    if i % 10 == 0:
        print("Average Train Loss: {}".format(avg_train_loss / (j + 1)))

        # no need to calculate gradients for validation
        with torch.no_grad():
            avg_test_loss = 0.
            for (k, (X, y)) in enumerate(test_loader):
                noise_py_x = model(X.reshape(-1, 784), w_h, w_h2, w_o)
                cost = torch.nn.functional.cross_entropy(noise_py_x, y, reduction="mean")
                avg_test_loss += cost.item()

            print("Average Test Loss: {}".format(avg_test_loss / (k + 1)))




Epoch: 1
Average Train Loss: 0.40162765979766846
Average Test Loss: 0.2319663017988205
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Average Train Loss: 0.16438473761081696
Average Test Loss: 0.3202357888221741
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Average Train Loss: 0.09217559546232224
Average Test Loss: 0.3900783061981201
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Epoch: 31
Average Train Loss: 0.0667547807097435
Average Test Loss: 0.5949299931526184
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Epoch: 41
Average Train Loss: 0.049366917461156845
Average Test Loss: 0.7628595232963562
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45
Epoch: 46
Epoch: 47
Epoch: 48
Epoch: 49
Epoch: 50
Epoch: 51
Average Train Loss: 0.03868158161640167
Average Test Loss: 0.7551425695419312
Epoch: 52
Epoch: 53
Epoch: 54
E

## Dropout
In the following cell we implement the dropout function and a dropout model. Dropout keeps the network from "overestimating" the importance of any single input: because each unit is dropped from time to time, the network tends to learn features that do not depend on only one particular input or combination of inputs. The training set can contain "random" similarities that have nothing to do with a property of the distribution we draw our instances from. Dropout helps to suppress the influence of these similarities. For evaluating the test loss, we do not apply dropout anymore (this would mean that we throw away information unnecessarily), but use the same model configuration without dropout. This works without further changes because the dropout function already rescales the surviving entries by 1/(1 - p_drop) during training.

In [17]:
def dropout(X, p_drop = .5):
    """
    Implements (inverted) dropout.

    Sets entries with a probability of p_drop to zero and scales the
    remaining entries by 1 / (1 - p_drop). The input tensor itself is
    never modified; a new tensor is returned whenever entries are dropped.

    Parameters
    ----------
    X : A Tensor
        Floating point tensor of arbitrary shape.

    p_drop : dropout probability
        Values <= 0 disable dropout, values >= 1 drop every entry.


    Returns
    -------
    A Tensor
        X after dropout. For p_drop <= 0 this is X itself.

    """

    # nothing to drop
    if p_drop <= 0:
        return X

    # everything is dropped; handled separately to avoid dividing by zero
    if p_drop >= 1:
        return torch.zeros_like(X)

    # torch.rand_like samples uniformly from [0, 1) with the dtype and device
    # of X, so an entry survives with probability 1 - p_drop and the mask
    # follows torch's random seed
    active = (torch.rand_like(X) >= p_drop).to(X.dtype)

    # out-of-place multiplication leaves the caller's tensor untouched
    return X * active / (1 - p_drop)
    
    

def dropout_model(X, w_h, w_h2, w_o, p_drop_input=0.5, p_drop_hidden=0.5):
    """
    Two-hidden-layer rectifier network with dropout on the input and on
    both hidden activations.

    Parameters
    ----------
    X : A Tensor
        Input batch, one flattened sample per row.
    w_h, w_h2 : Tensors
        Weights of the first and second hidden layer.
    w_o : A Tensor
        Weights of the output layer.
    p_drop_input : dropout probability applied to X
    p_drop_hidden : dropout probability applied to each hidden activation

    Returns
    -------
    A Tensor
        The output scores before the softmax.
    """
    hidden = dropout(X, p_drop_input)
    for weights in (w_h, w_h2):
        hidden = dropout(rectify(hidden @ weights), p_drop_hidden)
    return hidden @ w_o

In [18]:
w_h = init_weights((784, 625))
w_h2 = init_weights((625, 625))
w_o = init_weights((625, 10))

optimizer = RMSprop([w_h, w_h2, w_o])


# training loop: range(101) runs 101 epochs so that the losses, which are
# reported every 10th iteration, are printed for epochs 1, 11, ..., 101
for i in range(101):
    print("Epoch: {}".format(i+1))
    avg_train_loss = 0.
    for (j, (X, y)) in enumerate(dataloader):
        # flatten every image to a row vector; -1 infers the batch size so
        # a smaller final batch would not break the reshape
        noise_py_x = dropout_model(X.reshape(-1, 784), w_h, w_h2, w_o)
        optimizer.zero_grad()
        # the cross-entropy loss function already contains the softmax
        cost = torch.nn.functional.cross_entropy(noise_py_x, y, reduction="mean")
        # accumulate a plain Python float; adding the tensor itself would
        # keep autograd history attached to the running sum
        avg_train_loss += cost.item()
        cost.backward()
        optimizer.step()

    if i % 10 == 0:
        print("Average Train Loss: {}".format(avg_train_loss / (j + 1)))

        # no need to calculate gradients for validation
        with torch.no_grad():
            avg_test_loss = 0.
            for (k, (X, y)) in enumerate(test_loader):
                # evaluation uses the same weights but no dropout
                noise_py_x = model(X.reshape(-1, 784), w_h, w_h2, w_o)
                cost = torch.nn.functional.cross_entropy(noise_py_x, y, reduction="mean")
                avg_test_loss += cost.item()

            print("Average Test Loss: {}".format(avg_test_loss / (k + 1)))

Epoch: 1
Average Train Loss: 1.0189917087554932
Average Test Loss: 0.32450130581855774
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Average Train Loss: 1.0233594179153442
Average Test Loss: 0.3054808974266052
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Average Train Loss: 1.2779194116592407
Average Test Loss: 0.36670178174972534
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Epoch: 31
Average Train Loss: 1.5280224084854126
Average Test Loss: 0.45627361536026
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Epoch: 41
Average Train Loss: 1.7229119539260864
Average Test Loss: 0.5553186535835266
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45
Epoch: 46
Epoch: 47
Epoch: 48
Epoch: 49
Epoch: 50
Epoch: 51
Average Train Loss: 1.8203853368759155
Average Test Loss: 0.656729519367218
Epoch: 52
Epoch: 53
Epoch: 54
Epoch: 5

Now we compare the test loss of the dropout model with that of the model without dropout. After epoch 1, the dropout model starts with a higher test loss than the model without dropout. However, its test loss increases much more slowly over the epochs (average test loss of 0.78 vs. 0.80 after 101 epochs).
## Parametric ReLU
In the next cell, we define the parametric ReLU (PReLU) activation function; its learnable slope parameters `a` are later added to the parameter list of the optimizer. We define a PReLU model that uses the PReLU activation instead of ReLU and applies dropout. For evaluating the test loss, a second PReLU model without dropout is defined.

In [11]:
def PRelu(X,a):
    """
    Parametric ReLU activation.

    Non-negative entries of X pass through unchanged, negative entries
    are multiplied by the slope a.

    Parameters
    ----------
    X : A Tensor
    a : A Tensor
        Slopes for the negative part; must broadcast against X.

    Returns
    -------
    A Tensor
        X after the PReLU activation.
    """
    zeros = torch.zeros_like(X)
    positive_part = torch.max(zeros, X)
    negative_magnitude = torch.max(zeros, -X)
    return positive_part - negative_magnitude * a

def dropout_PRelu_model(X, w_h, w_h2, w_o, a_h, a_h2, p_drop_input=0.5, p_drop_hidden=0.5):
    """
    Two-hidden-layer PReLU network with dropout on the input and on both
    hidden activations.

    Parameters
    ----------
    X : A Tensor
        Input batch, one flattened sample per row.
    w_h, w_h2 : Tensors
        Weights of the first and second hidden layer.
    w_o : A Tensor
        Weights of the output layer.
    a_h, a_h2 : Tensors
        PReLU slopes of the first and second hidden layer.
    p_drop_input : dropout probability applied to X
    p_drop_hidden : dropout probability applied to each hidden activation

    Returns
    -------
    A Tensor
        The output scores before the softmax.
    """
    hidden = dropout(X, p_drop_input)
    for weights, slopes in ((w_h, a_h), (w_h2, a_h2)):
        hidden = dropout(PRelu(hidden @ weights, slopes), p_drop_hidden)
    return hidden @ w_o

def PRelu_model(X, w_h, w_h2, w_o, a_h, a_h2):
    """
    Two-hidden-layer PReLU network without dropout (used for evaluation).

    Parameters
    ----------
    X : A Tensor
        Input batch, one flattened sample per row.
    w_h, w_h2 : Tensors
        Weights of the first and second hidden layer.
    w_o : A Tensor
        Weights of the output layer.
    a_h, a_h2 : Tensors
        PReLU slopes of the first and second hidden layer.

    Returns
    -------
    A Tensor
        The output scores before the softmax.
    """
    hidden = X
    for weights, slopes in ((w_h, a_h), (w_h2, a_h2)):
        hidden = PRelu(hidden @ weights, slopes)
    return hidden @ w_o

In [12]:
w_h = init_weights((784, 625))
w_h2 = init_weights((625, 625))
w_o = init_weights((625, 10))

#initialize the weights for the PRelus as 0.25
a_h = torch.ones(625) * 0.25
a_h.requires_grad = True
a_h2 = torch.ones(625) * 0.25
a_h2.requires_grad = True

#add a_h and a_h2 to the params list
optimizer = RMSprop([w_h, w_h2, w_o, a_h, a_h2])


# training loop: range(101) runs 101 epochs so that the losses, which are
# reported every 10th iteration, are printed for epochs 1, 11, ..., 101
for i in range(101):
    print("Epoch: {}".format(i+1))
    avg_train_loss = 0.
    for (j, (X, y)) in enumerate(dataloader):
        # flatten every image to a row vector; -1 infers the batch size so
        # a smaller final batch would not break the reshape
        noise_py_x = dropout_PRelu_model(X.reshape(-1, 784), w_h, w_h2, w_o, a_h, a_h2)
        optimizer.zero_grad()
        # the cross-entropy loss function already contains the softmax
        cost = torch.nn.functional.cross_entropy(noise_py_x, y, reduction="mean")
        # accumulate a plain Python float; adding the tensor itself would
        # keep autograd history attached to the running sum
        avg_train_loss += cost.item()
        cost.backward()
        optimizer.step()

    if i % 10 == 0:
        print("Average Train Loss: {}".format(avg_train_loss / (j + 1)))

        # no need to calculate gradients for validation
        with torch.no_grad():
            avg_test_loss = 0.
            for (k, (X, y)) in enumerate(test_loader):
                # evaluation uses the same weights and slopes but no dropout
                noise_py_x = PRelu_model(X.reshape(-1, 784), w_h, w_h2, w_o, a_h, a_h2)
                cost = torch.nn.functional.cross_entropy(noise_py_x, y, reduction="mean")
                avg_test_loss += cost.item()

            print("Average Test Loss: {}".format(avg_test_loss / (k + 1)))



Epoch: 1
Average Train Loss: 1.0492351055145264
Average Test Loss: 0.3687511384487152
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Average Train Loss: 0.516887366771698
Average Test Loss: 0.1784283071756363
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Average Train Loss: 0.5752396583557129
Average Test Loss: 0.18248169124126434
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Epoch: 31
Average Train Loss: 0.5795792937278748
Average Test Loss: 0.18883413076400757
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Epoch: 41
Average Train Loss: 0.5838701725006104
Average Test Loss: 0.15499158203601837
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45
Epoch: 46
Epoch: 47
Epoch: 48
Epoch: 49
Epoch: 50
Epoch: 51
Average Train Loss: 0.5781242251396179
Average Test Loss: 0.1776753067970276
Epoch: 52
Epoch: 53
Epoch: 54
Epoch

We compare the results to the ReLU model and the dropout ReLU model. As with the dropout ReLU model, the test loss after the first epoch is larger for the dropout PReLU model than for the ReLU model without dropout. Another similarity is that the test loss of both dropout models increases only slowly with the number of epochs (compared to the ReLU model without dropout). However, the test loss of the PReLU model decreases at the beginning, for more than ten epochs. This is new compared to all the models before: none of the other models showed any decrease of the test loss (when the test loss is displayed every 10 epochs).
## Convolutional layers

In [14]:
from torch.nn.functional import conv2d, max_pool2d
