**Implementation of ResNet-18**

_Maybe later try to implement ResNet-50 with the "bottleneck" building blocks from the He et al. (2015) paper_

# Initialisation

## Import libraries

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torchvision import datasets
from torchvision.transforms import ToTensor

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

from utils import *

print(f'PyTorch version: {torch.__version__}')


PyTorch version: 2.0.0


## Set device

In [3]:
torch.manual_seed(123)

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f'Using device: {device}')

Using device: mps


## Initialise variables

In [6]:
# Data dimensions: [batch, channels, height, width)

batch_size = 256
image_channels = 3
image_dim = (224, 224)

In [7]:
# Generate dummy data for testing

X = torch.rand(size=(batch_size, image_channels, *image_dim))
print(X.size())

torch.Size([256, 3, 224, 224])


# Build model

In [12]:
class ResBlock(nn.Module):
    def __init__(self, in_channels=64, out_channels=64, stride=1):
        super().__init__()

        self.stride = stride

        # Convolutional layers
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=stride,
                               padding=1)
        self.conv2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1)

        # Batch normalisation after each convolutional layer
        self.batch_norm1 = nn.BatchNorm2d(num_features=out_channels)
        self.batch_norm2 = nn.BatchNorm2d(num_features=out_channels)

        # 1x1 convolution for residual connection
        self.conv_skip = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=2,
                                   padding=0)

    def conv_block(self, x):
        x = self.conv1(x)
        x = self.batch_norm1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.batch_norm2(x)

        return x

    def forward(self, x):
        if self.stride == 1:
            return self.conv_block(x) + x

        # When padding in the conv block is 2 the image dimensions are halved and the number of filters doubled
        # To match the output size of the conv block with that of the skip connection, 1x1 convolution is performed
        # in the skip connection
        if self.stride == 2:
            return self.conv_block(x) + self.conv_skip(x)


In [19]:
class ResNet18(nn.Module):
    def __init__(self, image_channels=1):
        super().__init__()

        self.conv0 = nn.Conv2d(in_channels=image_channels, out_channels=64, kernel_size=7, stride=2, padding=3)
        self.pool0 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Cannot confirm if the padding values are correct, but these return correctly sized output (56x56 after the max pool)

        self.conv64_1 = ResBlock(in_channels=64, out_channels=64, stride=1)
        self.conv64_2 = ResBlock(in_channels=64, out_channels=64, stride=1)

        self.conv128_1 = ResBlock(in_channels=64, out_channels=128, stride=2)
        self.conv128_2 = ResBlock(in_channels=128, out_channels=128, stride=1)

        self.conv256_1 = ResBlock(in_channels=128, out_channels=256, stride=2)
        self.conv256_2 = ResBlock(in_channels=256, out_channels=256, stride=1)

        self.conv512_1 = ResBlock(in_channels=256, out_channels=512, stride=2)
        self.conv512_2 = ResBlock(in_channels=512, out_channels=512, stride=1)

        self.fc1 = nn.Linear(in_features=512, out_features=1000)
        self.fc2 = nn.Linear(in_features=1000, out_features=10)

    def forward(self, x):
        # Initial convolutional layer (7x7) + pooling
        x = self.conv0(x)
        x = self.pool0(x)

        # Residual/skip connection blocks
        x = self.conv64_1(x)
        x = self.conv64_2(x)

        x = self.conv128_1(x)
        x = self.conv128_2(x)

        x = self.conv256_1(x)
        x = self.conv256_2(x)

        x = self.conv512_1(x)
        x = self.conv512_2(x)

        # Global average pooling
        x = x.mean(dim=(2, 3))

        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)

        return x

In [20]:
m = ResNet18(image_channels=3)
p = m(X)
p.shape

torch.Size([256, 512])

In [45]:
summary(m, input_size=(1, 28, 28), batch_size=batch_size)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [256, 64, 14, 14]           3,200
         MaxPool2d-2            [256, 64, 7, 7]               0
            Conv2d-3            [256, 64, 7, 7]          36,928
       BatchNorm2d-4            [256, 64, 7, 7]             128
            Conv2d-5            [256, 64, 7, 7]          36,928
       BatchNorm2d-6            [256, 64, 7, 7]             128
          ResBlock-7            [256, 64, 7, 7]               0
            Conv2d-8            [256, 64, 7, 7]          36,928
       BatchNorm2d-9            [256, 64, 7, 7]             128
           Conv2d-10            [256, 64, 7, 7]          36,928
      BatchNorm2d-11            [256, 64, 7, 7]             128
         ResBlock-12            [256, 64, 7, 7]               0
           Conv2d-13           [256, 128, 4, 4]          73,856
      BatchNorm2d-14           [256, 12

# Train model on Fashion MNIST

## Load data

In [46]:
train_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)

test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)

labels = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal',
          'Shirt', 'Sneaker', 'Bag', 'Ankle Boot']

### Split training data into training and validation subset

In [47]:
from torch.utils.data import random_split

train_subset, valid_subset = random_split(train_data, lengths=[0.8, 0.2])

### Create dataloaders

In [48]:
train_loader = torch.utils.data.DataLoader(train_subset, batch_size=batch_size, shuffle=True, num_workers=2)
valid_loader = torch.utils.data.DataLoader(valid_subset, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=2)

## Create model

### Model

In [49]:
model = ResNet18(image_channels=1)
model.to(device)

ResNet18(
  (conv0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
  (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (conv64_1): ResBlock(
    (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (batch_norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (batch_norm2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv_skip): Conv2d(64, 64, kernel_size=(1, 1), stride=(2, 2))
  )
  (conv64_2): ResBlock(
    (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (batch_norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (batch_norm2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=Tru

### Set hyperparameters

In [50]:
lr = 0.01
n_epochs = 30

### Loss, optimiser, scheduler

In [51]:
loss_func = nn.CrossEntropyLoss()
optimiser = torch.optim.SGD(params=model.parameters(), lr=lr, momentum=0.9, weight_decay=0.0001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimiser, factor=0.1,
                                                       patience=2)  # patience parameter was not specified in paper

results = pd.DataFrame(columns=['epoch', 'dataset', 'metric', 'value'])

In [52]:
for e in range(n_epochs):

    train_loss, train_accuracy = train_epoch(model, train_loader, loss_func, optimiser, device)
    valid_loss, valid_accuracy = eval_epoch(model, valid_loader, loss_func, device)

    current_lr = [p['lr'] for p in optimiser.param_groups]
    print(f"Epoch {e+1:>2}/{n_epochs} - Learning rate: {current_lr}")
    # print(f"Epoch {e+1:>2}/{n_epochs}")
    print(f"{'Training loss:':>18} {train_loss:.4f} - Accuracy: {train_accuracy:.3f}")
    print(f"{'Validation loss:':>18} {valid_loss:.4f} - Accuracy: {valid_accuracy:.3f}")

    train_result = pd.DataFrame(data={'epoch': e, 'dataset': 'training', 'metric': ['loss', 'accuracy'], 'value': [train_loss, train_accuracy]})
    valid_result = pd.DataFrame(data={'epoch': e, 'dataset': 'validation', 'metric': ['loss', 'accuracy'], 'value': [valid_loss, valid_accuracy]})

    results = pd.concat([results, train_result, valid_result], axis=0)

    scheduler.step(valid_loss)

Epoch  1/30 - Learning rate: [0.01]
    Training loss: 0.4436 - Accuracy: 0.837
        Test loss: 0.3584 - Accuracy: 0.869
Epoch  2/30 - Learning rate: [0.01]
    Training loss: 0.2944 - Accuracy: 0.890
        Test loss: 0.2926 - Accuracy: 0.892
Epoch  3/30 - Learning rate: [0.01]
    Training loss: 0.2448 - Accuracy: 0.909
        Test loss: 0.2997 - Accuracy: 0.883
Epoch  4/30 - Learning rate: [0.01]
    Training loss: 0.2185 - Accuracy: 0.917
        Test loss: 0.2908 - Accuracy: 0.894
Epoch  5/30 - Learning rate: [0.01]
    Training loss: 0.1952 - Accuracy: 0.925
        Test loss: 0.2841 - Accuracy: 0.897
Epoch  6/30 - Learning rate: [0.01]
    Training loss: 0.1706 - Accuracy: 0.934
        Test loss: 0.2945 - Accuracy: 0.899
Epoch  7/30 - Learning rate: [0.01]
    Training loss: 0.1580 - Accuracy: 0.940
        Test loss: 0.2808 - Accuracy: 0.906
Epoch  8/30 - Learning rate: [0.01]
    Training loss: 0.1384 - Accuracy: 0.947
        Test loss: 0.3299 - Accuracy: 0.887
Epoch  9

------

In [21]:

"""
trained_model, results = train_model(model=model, train_loader=train_loader, valid_loader=valid_loader,
loss_func=loss_func, optimiser=optimiser,
                                     batch_size=batch_size,
                                     epochs=n_epochs,
                                     device=device, scheduler=scheduler)
"""

Epoch  1/30 - Learning rate: [0.1]
    Training loss: inf - Accuracy: 0.107
        Test loss: 2.3033 - Accuracy: 0.096
Epoch  2/30 - Learning rate: [0.1]
    Training loss: 2.3035 - Accuracy: 0.100
        Test loss: 2.3032 - Accuracy: 0.102
Epoch  3/30 - Learning rate: [0.1]
    Training loss: 2.3036 - Accuracy: 0.101
        Test loss: 2.3033 - Accuracy: 0.096
Epoch  4/30 - Learning rate: [0.1]
    Training loss: 2.3035 - Accuracy: 0.100
        Test loss: 2.3031 - Accuracy: 0.100
Epoch  5/30 - Learning rate: [0.1]
    Training loss: 2.3035 - Accuracy: 0.099
        Test loss: 2.3034 - Accuracy: 0.100
Epoch  6/30 - Learning rate: [0.1]
    Training loss: 2.3034 - Accuracy: 0.101
        Test loss: 2.3038 - Accuracy: 0.101
Epoch  7/30 - Learning rate: [0.1]
    Training loss: 2.3035 - Accuracy: 0.099
        Test loss: 2.3034 - Accuracy: 0.101


KeyboardInterrupt: 

In [11]:
model, result = train_model(model_class=ResNet18, train_loader=train_loader, valid_loader=valid_loader, loss_func_class=nn.CrossEntropyLoss, optimiser_class=torch.optim.AdamW, batch_size=batch_size, epochs=n_epochs, start_lr=lr, device=device)

Epoch  1/30
    Training loss: inf - Accuracy: 0.137
        Test loss: inf - Accuracy: 0.231
Epoch  2/30
    Training loss: inf - Accuracy: 0.269
        Test loss: inf - Accuracy: 0.257
Epoch  3/30
    Training loss: inf - Accuracy: 0.236
        Test loss: inf - Accuracy: 0.141
Epoch  4/30
    Training loss: inf - Accuracy: 0.129
        Test loss: inf - Accuracy: 0.200
Epoch  5/30
    Training loss: inf - Accuracy: 0.127
        Test loss: inf - Accuracy: 0.112


KeyboardInterrupt: 

With the train_model function that takes the model class and creates the model inside the function, it works. With the other one that takes an initialised model as input it doesn't.