In [181]:
import numpy as np
import matplotlib.pyplot as plt

from datetime import timedelta
from time import time
from os import mkdir
from os.path import isfile, isdir

import torch
from torch import nn, optim
from torch.nn import functional as F
from torchvision import datasets, transforms
from torch.utils.data import random_split, DataLoader

import importlib
import models.convnets as convnets

# Re-import the local model definitions so that edits made to them take effect
# without restarting the kernel
convnets = importlib.reload(convnets)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

# Seed PyTorch's CPU and CUDA random number generators for reproducibility
torch.manual_seed(123)
torch.cuda.manual_seed(123)

In [182]:
# Where the downloaded datasets are kept
DOWNLOAD_DIRECTORY = "./datasets"

# Where the optimal, chosen model is kept
MODEL_DIRECTORY = "./models"

# Where every trained candidate model is kept
TRAINED_DIRECTORY = "./models/trained"

# Create the directory for trained models up front, so that saving into it
# later on does not make pytorch throw an error
if isdir(TRAINED_DIRECTORY) is False:
    mkdir(TRAINED_DIRECTORY)

### Import and load the datasets.

From [data_exploration.ipynb](./data_exploration.ipynb), we have gathered that the given training data has a mean of ~0.1307 and a standard deviation of ~0.3081. We use this information to normalize the data on import. This feature normalization step is important when doing machine learning, but we have to be consistent and use the same normalization values for all datasets (training, validation and test).

In [183]:
# Preprocessing applied to every image on load: convert it to a tensor, then
# normalize it with the mean / standard deviation of the training data
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(0.1307, 0.3081)])

# Official MNIST training split (later divided into train / validation) and test split
dataset_train_val, dataset_test = (
    datasets.MNIST(root=DOWNLOAD_DIRECTORY, train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

In [184]:
# Fraction of the original training data that is kept for training; the
# remainder becomes the validation set
train_val_split = 5.0/6.0
train_size = int(len(dataset_train_val) * train_val_split)
val_size = len(dataset_train_val) - train_size
dataset_train, dataset_val = random_split(dataset_train_val, [train_size, val_size])

# Only the training data is reshuffled; validation and test data keep their order
batch_size = 512
train_loader, val_loader, test_loader = (
    DataLoader(subset, batch_size=batch_size, shuffle=reshuffle)
    for subset, reshuffle in ((dataset_train, True), (dataset_val, False), (dataset_test, False))
)

### Define a training function

In [185]:
def train(model, n_epochs, optimizer, loss_fn, train_loader, val_loader=None, device="cpu"):
    """Train `model` for `n_epochs` epochs and record the mean batch loss of every epoch.

    Args:
        model: The `nn.Module` to train. It is moved to `device` in place and is left
            in training mode when the function returns.
        n_epochs: Number of passes over `train_loader`.
        optimizer: Optimizer whose `step()` and `zero_grad()` are called after every batch.
        loss_fn: Callable invoked as `loss_fn(output, labels)`; its result is reduced
            with `.mean()` before the backward pass.
        train_loader: Iterable yielding `(imgs, labels)` training batches.
        val_loader: Optional iterable yielding `(imgs, labels)` validation batches. When
            given, the validation loss is computed (without gradients) after every epoch.
        device: Device that the model and every batch are moved to.

    Returns:
        A tuple `(train_losses, val_losses)` holding one mean batch loss per epoch.
        `val_losses` is `None` when no `val_loader` was supplied.
    """
    has_val = val_loader is not None

    # Store intermediary training and validation losses
    train_losses, val_losses = [], []
    start_time = time()

    # Move the model to whatever device the batches will be sent to. `.to()` also
    # covers device strings such as "cuda:0" and moving a model back to the CPU.
    model.to(device)
    model.train()
    optimizer.zero_grad(set_to_none=True)

    print(" ======== Training", model._get_name(), "======= ")

    for i in range(1, n_epochs + 1):
        train_loss, n_train_batches = 0.0, 0
        for imgs, labels in train_loader:
            labels = labels.to(device=device)
            output = model(imgs.to(device=device))

            # Calculate mean batch loss and perform backward pass
            loss = loss_fn(output, labels).mean()
            loss.backward()

            # Update parameters and zero out the gradients
            optimizer.step()
            optimizer.zero_grad()

            # Add loss to stored training loss
            train_loss += loss.item()
            n_train_batches += 1

        # Average over the batches actually seen; NaN flags an empty loader
        train_loss = train_loss / n_train_batches if n_train_batches else float("nan")
        train_losses.append(train_loss)

        # If we have a validation loader, store validation loss as well
        val_loss = None
        if has_val:
            val_loss, n_val_batches = 0.0, 0

            # Evaluate in eval mode and without autograd: no parameter update
            # happens here, so building the graph would only waste memory
            model.eval()
            with torch.no_grad():
                for imgs, labels in val_loader:
                    output = model(imgs.to(device=device))
                    loss = loss_fn(output, labels.to(device=device)).mean()

                    val_loss += loss.item()
                    n_val_batches += 1

            # Set back to training mode for the next epoch
            model.train()

            val_loss = val_loss / n_val_batches if n_val_batches else float("nan")
            val_losses.append(val_loss)

        # Print epoch loss first 3 epochs and every 5 afterwards
        if i <= 3 or i % 5 == 0:
            print(timedelta(seconds=round(time() - start_time)), "| Epoch", i, "| Training loss %.5f" %train_loss, ("| Validation loss %.5f" %val_loss) if has_val else "")

    return train_losses, (val_losses if has_val else None)

### Initialize and train different model architectures capable of doing image classification.

In [186]:
n_epochs = 25
loss_fn = nn.CrossEntropyLoss()

# Adam hyperparameters
optimizer_hyperparameters = [
    { "lr" : 0.001 },
    { "lr" : 0.001, "weight_decay": 0.0001 },
    { "lr" : 0.005 },
    { "lr" : 0.005, "weight_decay": 0.0001 },
    { "lr" : 0.01 },
    { "lr" : 0.01, "weight_decay": 0.0001 },
]

# Different model architectures
architechtures = [convnets.ConvNet1, convnets.ConvNet2, convnets.ConvNet3]

# Perform model selection, storing intermediate losses.
# `models` and `losses` are filled hyperparameter set by hyperparameter set, with
# every architecture inside each set, so entry k belongs to hyperparameter set
# k // len(architechtures).
losses = []
models = []
for i, hyperparameters in enumerate(optimizer_hyperparameters):
    for architecture in architechtures:
        # Define the model and optimizer
        model = architecture()
        optimizer = optim.Adam(model.parameters(), **hyperparameters)

        # Paths to store the model under. The cache key is the model name plus the
        # 1-based position of the hyperparameter set, so reordering
        # `optimizer_hyperparameters` invalidates previously cached files.
        model_path = TRAINED_DIRECTORY + "/" + model._get_name() + "_" + str(i+1)
        weights_path = model_path + ".pt"
        losses_path = model_path + ".loss"

        # Only reuse a cached model when BOTH of its files exist. A run interrupted
        # between the two saves below leaves the weights without the losses, and
        # loading the missing losses file would crash.
        if isfile(weights_path) and isfile(losses_path):
            model_statedict = torch.load(weights_path, map_location=torch.device(DEVICE))
            model_losses = torch.load(losses_path, map_location=torch.device(DEVICE))

            model.load_state_dict(model_statedict)

            losses.append((model_losses["train"], model_losses["val"]))
            models.append(model)

            print(" ======== Loaded", model._get_name(), "======= ")
            print("Final Training loss %.5f" %model_losses["train"][-1], ("| Final Validation loss %.5f" %model_losses["val"][-1]))
        # If not, train and save it
        else:
            # Train the model
            train_losses, val_losses = train(model, n_epochs, optimizer, loss_fn, train_loader, val_loader, device=DEVICE)

            # Store intermediary losses and model
            losses.append((train_losses, val_losses))
            models.append(model)

            # Also save the model and loss values to disk
            torch.save(model.state_dict(), weights_path)
            torch.save({"train" : train_losses, "val" : val_losses}, losses_path)

Final Training loss 0.00988 | Final Validation loss 0.06396
Final Training loss 0.00387 | Final Validation loss 0.03580
Final Training loss 0.00817 | Final Validation loss 0.04010
Final Training loss 0.01261 | Final Validation loss 0.05308
Final Training loss 0.00585 | Final Validation loss 0.03185
Final Training loss 0.00955 | Final Validation loss 0.03491
Final Training loss 0.01456 | Final Validation loss 0.06423
Final Training loss 0.00590 | Final Validation loss 0.06308
Final Training loss 0.00730 | Final Validation loss 0.06081
Final Training loss 0.01596 | Final Validation loss 0.08247
Final Training loss 0.00807 | Final Validation loss 0.04399
Final Training loss 0.00717 | Final Validation loss 0.04159
Final Training loss 0.02672 | Final Validation loss 0.06543
Final Training loss 0.02033 | Final Validation loss 0.08174
Final Training loss 0.01715 | Final Validation loss 0.07619
Final Training loss 0.03066 | Final Validation loss 0.07766
Final Training loss 0.01999 | Final Vali

### Define a function for computing model accuracy

In [187]:
def compute_accuracy(model, data_loader, device="cpu"):
    """Compute the classification accuracy of `model` over `data_loader`.

    The predicted class of a sample is the argmax over dimension 1 of the model output.

    Args:
        model: The `nn.Module` to evaluate. It is moved to `device` in place and is
            left in eval mode when the function returns.
        data_loader: Iterable yielding `(imgs, labels)` batches.
        device: Device that the model and every batch are moved to.

    Returns:
        The fraction of correctly classified samples, as a zero-dimensional tensor.

    Raises:
        ZeroDivisionError: If `data_loader` yields no batches.
    """
    # Store accuracy
    total, correct = 0, 0

    # Set the model to eval mode and move it to the device the batches are sent to.
    # `.to()` also covers device strings such as "cuda:0" and moving back to the CPU.
    model.eval()
    model.to(device)

    # Disable autograd while evaluating the model
    with torch.no_grad():
        for imgs, labels in data_loader:
            labels = labels.to(device=device)
            predictions = torch.argmax(model(imgs.to(device=device)), dim=1)

            total += len(labels)
            # Tensor-level sum: the builtin `sum` would iterate over the batch
            # one element at a time
            correct += (predictions == labels).sum()

    # Return the computed accuracy
    return correct / total

### Compute the accuracy of each model with regards to the validation dataset

In [188]:
# Validation accuracy of every candidate, in the same order as `models`
accuracies = []
for i, model in enumerate(models):
    accuracies.append(compute_accuracy(model, val_loader, device=DEVICE))
    accuracy = accuracies[-1]

    # Integer division by the number of architectures picks the hyperparameter
    # set that is reported next to this model
    print(
        model._get_name(),
        optimizer_hyperparameters[i // len(architechtures)],
        "had a validation accuracy of %.3f" %accuracy,
    )

ConvNet1 {'lr': 0.001} had a validation accuracy of 0.986
ConvNet2 {'lr': 0.001} had a validation accuracy of 0.991
ConvNet3 {'lr': 0.001} had a validation accuracy of 0.988
ConvNet1 {'lr': 0.001, 'weight_decay': 0.0001} had a validation accuracy of 0.985
ConvNet2 {'lr': 0.001, 'weight_decay': 0.0001} had a validation accuracy of 0.991
ConvNet3 {'lr': 0.001, 'weight_decay': 0.0001} had a validation accuracy of 0.990
ConvNet1 {'lr': 0.005} had a validation accuracy of 0.986
ConvNet2 {'lr': 0.005} had a validation accuracy of 0.988
ConvNet3 {'lr': 0.005} had a validation accuracy of 0.990
ConvNet1 {'lr': 0.005, 'weight_decay': 0.0001} had a validation accuracy of 0.979
ConvNet2 {'lr': 0.005, 'weight_decay': 0.0001} had a validation accuracy of 0.989
ConvNet3 {'lr': 0.005, 'weight_decay': 0.0001} had a validation accuracy of 0.989
ConvNet1 {'lr': 0.01} had a validation accuracy of 0.986
ConvNet2 {'lr': 0.01} had a validation accuracy of 0.986
ConvNet3 {'lr': 0.01} had a validation accurac

### Select the best performing model and estimate its performance on unseen data

In [189]:
# Convert every accuracy to a plain float first, so the comparison works whatever
# type or device the individual accuracies have. On ties, torch.max reports the
# first maximal entry.
best_accuracy, best_index = torch.max(torch.tensor([float(accuracy) for accuracy in accuracies]), dim=0)
best_index = int(best_index)
selected_model = models[best_index]

# The hyperparameter set of entry k in `models` is k // len(architechtures), the
# same lookup the per-model accuracy report uses. The modulo used here before
# gives the architecture position instead and reported the wrong hyperparameters.
print("The best performing model was", selected_model._get_name(), optimizer_hyperparameters[best_index // len(architechtures)], end=" ")
print("with a validation accuracy of %.3f" % best_accuracy)

# The test set is only touched here, once, for the selected model
test_accuracy = compute_accuracy(selected_model, test_loader, device=DEVICE)
print("\nIt has a test accuracy of %.3f" %test_accuracy)

The best performing model was ConvNet2 {'lr': 0.001, 'weight_decay': 0.0001} with a validation accuracy of 0.991

It has a test accuracy of 0.991


### Store the best performing model

In [190]:
selected_model_path = MODEL_DIRECTORY + "/model.pt"
# Save the weights of the model that was actually selected. `model` is merely the
# loop variable left over from an earlier cell, i.e. the last candidate iterated.
torch.save(selected_model.state_dict(), selected_model_path)
print("Stored selected model under '", selected_model_path, "'", sep="")

Stored selected model under './models/model.pt'
