In [None]:
!pip install pycm livelossplot
%pylab inline

# ACSE Module 8 - Practical - Morning Session 3:
----
# Training Deep Neural Networks - Crossvalidation

## Task 1: Training a deep neural network on MNIST using Pytorch

In this exercise you will use cross-validation to estimate the hyperparameters of a deep neural network trained on the MNIST dataset and create predictions on the MNIST dataset's public test set.

#### A few imports before we get started

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit

from livelossplot import PlotLosses
from pycm import *

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import MNIST


def set_seed(seed):
    """
    Seed every random number generator the notebook uses and switch off cuDNN.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch's CPU and CUDA generators, then disables the cuDNN auto-tuner and
    cuDNN itself.

    Args:
        seed (int): value used to seed all generators.

    Returns:
        bool: always ``True``.
    """
    # Import locally instead of relying on notebook globals: this file never
    # imports the standard-library ``random`` module, and the ``random`` name
    # injected by ``%pylab inline`` is expected to be NumPy's, which would leave
    # Python's own generator unseeded.
    import random as py_random
    import numpy as np

    py_random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # benchmark=True would enable the cuDNN auto-tuner, which picks the fastest
    # convolution algorithm at run time; keep it off.
    torch.backends.cudnn.benchmark = False
    # Disable cuDNN altogether.
    torch.backends.cudnn.enabled = False

    return True

# Run on the GPU when PyTorch reports at least one usable CUDA device,
# otherwise fall back to the CPU.
cuda_ready = torch.cuda.device_count() > 0 and torch.cuda.is_available()
if cuda_ready:
    print("Cuda installed! Running on GPU!")
    device = 'cuda'
else:
    print("No GPU available!")
    device = 'cpu'

## 1.1: Mounting the google drive for later storage

In [None]:
# Mount Google Drive at /content/gdrive/ (Colab only). The model checkpoint in
# section 2.2 is written to a path below this mount point.
from google.colab import drive
drive.mount('/content/gdrive/')

## 1.2: The MNIST Dataset - Hello World of Deep-Learning

In [None]:
# Download MNIST into the working directory if needed and load both the
# training split and the public test split.
mnist_root = "./"
mnist_train = MNIST(mnist_root, train=True, download=True)
mnist_test = MNIST(mnist_root, train=False, download=True)

## 1.3: Instantiate and create a ```StratifiedShuffleSplit``` using sklearn.
1. Create a ```sklearn.model_selection.StratifiedShuffleSplit``` object with 1-split and a test-size of 10%.
2. Get the training and validation indices from the shuffle-split

## 1.4: Split and normalise the data
The original mnist data is given in gray-scale values between 0 and 255.
You will need to write a normalisation method that takes in a ```torch.Tensor``` and performs normalisation.
The mean of MNIST is 0.1307 and its standard deviation is 0.3081 (after division by 255).

Finally, torch likes all categorical data to be in a ```.long()``` format.
Therefore, convert the label tensors with ```.long()``` before building the datasets.


## 1.5: Instantiate a ```torch.utils.data.TensorDataset``` for training, validation and test data

Let's visualise an example of the images and check whether the data is normalised properly (compute .mean() and .std() on the training set.)

## 1.6: Create a SimpleNet as a ```nn.Module```
Create a simple feed-forward neural network with the following architecture:
- Input Layer: 28*28 neurons
- Hidden Layer: 25 neurons
- Output Layer: 10 neurons
- Hidden Layer Activation: ReLU
- Output Layer Activation: None


## 1.7: Sanity checks on our SimpleNet


In [None]:
# Push a single training image through an untrained network to confirm that
# the forward pass, the loss and the prediction all run end to end.
model = SimpleNet().to(device)
criterion = nn.CrossEntropyLoss()

# One flattened image and a dummy target of class 0.
X_ = X_train[0].view(-1, 28*28).to(device)
y_ = torch.zeros(1, dtype=torch.long, device=device)

a2 = model(X_)
loss = criterion(a2, y_)

# Log-probabilities are computed once and reused for the prediction and the print.
log_probs = F.log_softmax(a2, dim=1)
y_pred = log_probs.max(1)[1]

print(log_probs)
print(y_pred)

## 1.8: Create simple train function

The function should perform the following tasks:
1. Set the model into training mode
2. Iterate over all the mini-batches
3. Send the batches to the GPU / CPU
4. Zero all the gradients
5. Perform the forward-pass
6. Compute the loss
7. Perform the backward-pass
8. Keep a running measure of training loss and accuracy
9. Perform a step of gradient-descent
10. Once done with all batches, return average training loss and accuracy

In [None]:
def train(model, optimizer, criterion, data_loader):
    """
    Train ``model`` for one pass over ``data_loader``.

    Args:
        model: network to optimise; it is fed one flattened vector per sample.
        optimizer: optimiser holding ``model``'s parameters.
        criterion: loss taking ``(logits, targets)``.
            NOTE(review): the averaging below assumes a batch-mean loss, as
            ``nn.CrossEntropyLoss()`` is used elsewhere in this notebook.
        data_loader: iterable of ``(X, y)`` mini-batches exposing ``.dataset``.

    Returns:
        tuple: ``(average loss, accuracy)`` over ``data_loader.dataset``, both
        0-dim tensors so that callers can use ``.item()``.
    """
    model.train()
    # Tensor accumulators: the training cells call ``.item()`` on both results.
    train_loss = torch.zeros((), device=device)
    train_accuracy = torch.zeros((), device=device)
    for X, y in data_loader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        # The sanity-check cell feeds the network flattened images, so flatten here too.
        logits = model(X.view(X.size(0), -1))
        loss = criterion(logits, y)
        loss.backward()

        # Weight by batch size so a smaller final batch is averaged correctly;
        # detach so the running total does not keep the graph alive.
        train_loss += loss.detach() * X.size(0)
        train_accuracy += (logits.argmax(dim=1) == y).float().sum()

        optimizer.step()

    n_samples = len(data_loader.dataset)
    return train_loss/n_samples, train_accuracy/n_samples

## 1.9 Create simple validate function

The function should perform the following tasks:
1. Set the model into evaluation mode
2. Iterate over all the mini-batches
3. Send the batches to the GPU / CPU
4. Perform the forward-pass
5. Compute the loss
6. Keep a running measure of validation loss and accuracy
7. Once done with all batches, return average validation loss and accuracy

In [None]:
def validate(model, criterion, data_loader):
    """
    Compute loss and accuracy of ``model`` on ``data_loader`` without training.

    Args:
        model: network to evaluate; it is fed one flattened vector per sample.
        criterion: loss taking ``(logits, targets)``.
            NOTE(review): the averaging below assumes a batch-mean loss, as
            ``nn.CrossEntropyLoss()`` is used elsewhere in this notebook.
        data_loader: iterable of ``(X, y)`` mini-batches exposing ``.dataset``.

    Returns:
        tuple: ``(average loss, accuracy)`` over ``data_loader.dataset``, both
        0-dim tensors so that callers can use ``.item()``.
    """
    model.eval()
    # Tensor accumulators: the calling cells call ``.item()`` on both results.
    validation_loss = torch.zeros((), device=device)
    validation_accuracy = torch.zeros((), device=device)
    with torch.no_grad():
        for X, y in data_loader:
            X, y = X.to(device), y.to(device)
            # The sanity-check cell feeds the network flattened images, so flatten here too.
            logits = model(X.view(X.size(0), -1))
            loss = criterion(logits, y)
            # Weight by batch size so a smaller final batch is averaged correctly.
            validation_loss += loss * X.size(0)
            validation_accuracy += (logits.argmax(dim=1) == y).float().sum()

    n_samples = len(data_loader.dataset)
    return validation_loss/n_samples, validation_accuracy/n_samples

 ## 1.10: Set the hyperparameters of your model
- Seed: 42
- learning rate: 1e-2
- Optimizer: SGD
- momentum: 0.9
- Number of Epochs: 30
- Batchsize: 64
- Test Batch Size (no effect on training apart from time): 1000
- Shuffle the training set every epoch: Yes

In [None]:
# Reproducibility
seed = 42

# SGD optimiser settings
lr = 1e-2
momentum = 0.9

# Training schedule
n_epochs = 30
batch_size = 64
# Batch size for validation/test passes.
test_batch_size = 1000

## 1.11: Instantiate our model, optimizer and loss function
Set the random number generator seed using ```set_seed``` to make everything reproducible.
As a criterion use a sensible loss for the multi-class classification problem.

## 1.12: Create dataloaders for the training, validation and test data

## 1.13: Perform the training of the network and validation
Here we provide you with a method to visualize both training and validation loss while training your networks.

In [None]:
# Train for ``n_epochs`` epochs (the hyperparameter set in 1.10) and redraw the
# live loss/accuracy plot after every epoch.
liveloss = PlotLosses()
for epoch in range(n_epochs):
    logs = {}

    train_loss, train_accuracy = train(model, optimizer, criterion, train_loader)
    logs['log loss'] = train_loss.item()
    logs['accuracy'] = train_accuracy.item()

    # Validation metrics use the same names with a 'val_' prefix.
    validation_loss, validation_accuracy = validate(model, criterion, validation_loader)
    logs['val_log loss'] = validation_loss.item()
    logs['val_accuracy'] = validation_accuracy.item()

    liveloss.update(logs)
    liveloss.draw()

It seems the model isn't doing very well. Maybe we can do better.

## 1.14: Running a grid-search
Run a grid search over the momentum value, use the following: momentum = [0.1, 0.5, 0.9]
and run the model on the full training set with the best value for the momentum parameter.

In [None]:
def train_model(momentum):
    """
    Train a fresh ``SimpleNet`` with the given SGD momentum and plot its progress.

    The seed, learning rate, batch sizes and number of epochs come from the
    notebook-level hyperparameters, so runs differ only in ``momentum``.

    Args:
        momentum (float): momentum passed to ``torch.optim.SGD``.

    Returns:
        bool: always ``True``. The trained model is local to this function and
        is not returned.
    """
    set_seed(seed)
    model = SimpleNet().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()

    train_loader = DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=0)
    validation_loader = DataLoader(mnist_validate, batch_size=test_batch_size, shuffle=False, num_workers=0)

    liveloss = PlotLosses()
    # Use the configured epoch count rather than a hard-coded 30.
    for epoch in range(n_epochs):
        logs = {}

        train_loss, train_accuracy = train(model, optimizer, criterion, train_loader)
        logs['log loss'] = train_loss.item()
        logs['accuracy'] = train_accuracy.item()

        # Validation metrics use the same names with a 'val_' prefix.
        validation_loss, validation_accuracy = validate(model, criterion, validation_loader)
        logs['val_log loss'] = validation_loss.item()
        logs['val_accuracy'] = validation_accuracy.item()

        liveloss.update(logs)
        liveloss.draw()

    return True

In [None]:
# Grid-search point 1 of 3: momentum = 0.1
train_model(0.1)

In [None]:
# Grid-search point 2 of 3: momentum = 0.5
train_model(0.5)

In [None]:
# Grid-search point 3 of 3: momentum = 0.9
train_model(0.9)

## 1.15: Implement an evaluate method
This method works like validate, except that it does not report losses; it simply returns all predictions, together with the true labels, for a given dataset (training, validation or test set).

In [None]:
def evaluate(model, data_loader):
    """
    Collect the model's class predictions and the true labels for a dataset.

    Args:
        model: network to evaluate; it is fed one flattened vector per sample.
        data_loader: iterable of ``(X, y)`` mini-batches.

    Returns:
        tuple: ``(predictions, ground_truth)`` as 1-D NumPy arrays in loader
        order. Both are empty when the loader yields no batches.
    """
    model.eval()
    ys, y_preds = [], []
    with torch.no_grad():
        for X, y in data_loader:
            X, y = X.to(device), y.to(device)
            # The sanity-check cell feeds the network flattened images, so flatten here too.
            logits = model(X.view(X.size(0), -1))
            # The arg-max of the logits is the predicted class.
            y_preds.append(logits.argmax(dim=1).cpu().numpy())
            ys.append(y.cpu().numpy())

    # np.concatenate raises on an empty list, so handle an empty loader explicitly.
    if not ys:
        return np.empty(0, dtype=np.int64), np.empty(0, dtype=np.int64)
    return np.concatenate(y_preds, 0),  np.concatenate(ys, 0)

# Predictions and true labels on the validation loader; both are used by the
# confusion-matrix cell below.
# NOTE(review): `model` here is the notebook-level model; the models built inside
# train_model() are local to that function - confirm this is the one intended.
y_pred, y_gt = evaluate(model, validation_loader)

## 1.17: Plotting a confusion matrix

We can use a confusion matrix to diagnose problems in our models.
We may see for example that our model confuses 9's for 4's quite often.

In [None]:
# Build the confusion matrix from the true labels (y_gt) and the model's
# predictions (y_pred) returned by evaluate(), then print it.
cm = ConfusionMatrix(actual_vector=y_gt, predict_vector=y_pred) # Create CM From Data
print(cm)

## 1.18: Given that you estimated your hyperparameters, train your model on the full dataset and evaluate on the test set

## 2.1: Storing and loading models - Pytorch State-Dicts

Pytorch stores all the parameters of models and optimizers, their weights and biases in an easy to read dictionary called a "state-dict".
When we store models and optimizers, we store the state-dict.  
Together with the model definition we can then restore the model to its state when we stored it to disk.
Let's look at the contents of the state-dict of both our optimizer and our model:

In [None]:
# Model state-dict: one line per entry, showing its name and tensor size.
print("Model's state_dict:")
model_state = model.state_dict()
for param_tensor, weights in model_state.items():
    print(param_tensor, "\t", weights.size())

# Optimizer state-dict: one line per top-level entry, showing its name and value.
print("Optimizer's state_dict:")
optimizer_state = optimizer.state_dict()
for var_name, value in optimizer_state.items():
    print(var_name, "\t", value)

## 2.2: Storing models to disk
From colab (and locally) we can store models to disk using ```torch.save```, passing both a model's state_dict() and a path where to store it.

In [None]:
import os

# Save the model's state-dict to the mounted Google Drive.
model_save_name = 'simplenet_mnist_classifier.pt'
path = f"/content/gdrive/My Drive/models/{model_save_name}"
# torch.save does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(path), exist_ok=True)
torch.save(model.state_dict(), path)

## 2.3: Loading models from checkpoints
Finally, we can restore models from the saved ```state_dict```'s and do a number of things such as:
1. Continue training (given we stored the optimizer as well)
2. Make predictions from our model
3. Perform inspections of our model
4. Use our model in ensembles 
5. ...

By default a loaded model is put into ```.train()``` mode. So be careful when using networks that behave differently at training and test time, e.g. dropout-regularized or batch-normalized networks.

In [None]:
# Rebuild the architecture, then restore the saved weights into it.
model = SimpleNet().to(device)
# map_location places the tensors on the current device, so a checkpoint saved
# on a GPU runtime still loads on a CPU-only one.
model.load_state_dict(torch.load(path, map_location=device))
model.eval()

# Score the restored model on the test loader.
test_loss, test_accuracy = validate(model, criterion, test_loader)
print("Avg. Test Loss: %1.3f" % test_loss.item(), " Avg. Test Accuracy: %1.3f" % test_accuracy.item())
print("")

Our model performs exactly the same as before storing it to disk.