In [87]:
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from PIL import Image

## I. Data Handling

In [88]:
# PREPROCESSING PIPELINE
# every image goes through the same three steps before it reaches the network

# 1. uniform size, so that all inputs share the shape the network was built for
resizeStep = transforms.Resize((224, 224))

# 2. PIL image -> tensor (this also rescales the pixel values from [0,255] to [0,1])
tensorStep = transforms.ToTensor()

# 3. per-channel normalisation, using the mean / standard deviation of the ImageNet dataset
#    each channel becomes (pixel - mean) / std, which roughly centres the values around zero
#    and allows faster and more stable convergence during optimisation
normaliseStep = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225]
)

transform = transforms.Compose([resizeStep, tensorStep, normaliseStep])

# LOAD DATASET
# a notebook has no __file__, so the first entry of IPython's directory history ('_dh')
# stands in for os.path.dirname(os.path.abspath(__file__))
rootDir = globals()['_dh'][0]
dataDir = os.path.join(rootDir, '..', 'data', 'art')

# each subdirectory of dataDir is loaded as one class
dataSet = ImageFolder(dataDir, transform=transform)

# show the dataset summary, followed by the discovered class names
for summary in (dataSet, dataSet.classes):
    print(summary)

Dataset ImageFolder
    Number of datapoints: 15
    Root location: d:\Programming\UoM\virtual-turntable\modelling\sandbox\..\data\art
    StandardTransform
Transform: Compose(
               Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=True)
               ToTensor()
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           )
['AbbeyRoad_TheBeatles', 'InRainbows_Radiohead', "JeffWayne'sMusicalVersionofTheWaroftheWorlds_JeffWayne", 'KidA_Radiohead', 'OKComputer_Radiohead', 'Revolver_TheBeatles', "Sgt.Pepper'sLonelyHeartsClubBand_TheBeatles", 'TheDarkSideOfTheMoon_PinkFloyd', 'TheRiseAndFallOfZiggyStardustAndTheSpidersFromMars_DavidBowie', 'TheVelvetUnderground&Nico_TheVelvetUnderground&Nico', 'WishYouWereHere_PinkFloyd']


Note: Data augmentation should be done in the future, to create a more robust dataset.

In [89]:
# DATA LOADERS
# NOTE: no train/validation split is performed yet - both loaders wrap the *same* dataSet,
# so anything measured through validationLoader is NOT a held-out score.
# TODO: split the data (e.g. torch.utils.data.random_split) once more images per class exist.
batchSize = 8

# training batches are reshuffled every epoch
trainLoader = DataLoader(dataSet, batch_size=batchSize, shuffle=True)

# evaluation does not benefit from shuffling; a fixed order keeps it repeatable
validationLoader = DataLoader(dataSet, batch_size=batchSize, shuffle=False)

## II. Model

In [90]:
class SimpleCNN(nn.Module):
    """
    This is a simple convolutional neural network, for POC.
    It has two convolutional layers and two fully connected layers:
    conv1 -> ReLU -> pool -> conv2 -> ReLU -> pool -> flatten -> fc1 -> ReLU -> fc2
    It effectively learns the 'ID' of the image (albumName_artistName).

    The first fully connected layer is sized for 3x224x224 inputs:
    the two 2x2 poolings reduce 224 -> 112 -> 56, leaving 64 feature maps of 56x56.
    """

    def __init__(self, numClasses):
        """Build the layers.

        Args:
            numClasses (int): number of classes, i.e. the number of logits produced by fc2.
        """
        super().__init__()

        # CONVOLUTIONAL LAYER 1
        self.conv1 = nn.Conv2d(
            3, # 3 channels (RGB)
            32, # 32 output filters (feature maps)
            kernel_size=3, # each filter will scan 3x3 patches of the image
            stride=1, # each filter will move 1 pixel at a time
            padding=1 # the input is padded, to ensure the output is the same size as the input
        )

        # MAX-POOLING LAYER
        # reduces spatial dimensions of the input feature maps
        # this reduces the number of parameters and computations in the network
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

        # CONVOLUTIONAL LAYER 2
        # the 32 feature maps are now fed into a second layer, resulting in 64 output filters
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)

        # FULLY CONNECTED LAYER 1
        self.fc1 = nn.Linear(
            64 * 56 * 56, # 64 feature maps, each 56x56 pixels
            128 # 128 output neurons
        )

        # FULLY CONNECTED LAYER 2
        # this layer outputs logits (raw scores) for each class
        self.fc2 = nn.Linear(128, numClasses)

    def forward(self, x):
        """Compute the class logits for a batch of images.

        Args:
            x (torch.Tensor): batch of images, expected shape (N, 3, 224, 224).

        Returns:
            torch.Tensor: raw class scores (logits) of shape (N, numClasses).

        Raises:
            RuntimeError: raised by fc1 when the pooled feature maps are not 56x56
                (e.g. the images were not resized to 224x224).
        """

        # conv1 -> ReLU (introduces non-linearity) -> pool (halves the feature-map size)
        x = self.pool(torch.relu(self.conv1(x)))

        # conv2 -> ReLU -> pool (halves the feature-map size again)
        x = self.pool(torch.relu(self.conv2(x)))

        # flatten the feature maps, keeping the batch dimension intact
        # (a view(-1, 64 * 56 * 56) would silently merge samples of wrongly-sized inputs
        # into fewer rows; flattening from dim 1 makes such inputs fail loudly in fc1)
        x = torch.flatten(x, start_dim=1)

        # fully-connected layers
        x = torch.relu(self.fc1(x))
        x = self.fc2(x) # no activation function, as this is handled by the loss function

        return x

# fix the RNG seed before any weights are created, so that the weight initialisation
# (and the shuffling that draws from the same global RNG) repeats on 'Restart Kernel -> Run All'
torch.manual_seed(0)

numClasses = len(dataSet.classes) # one logit per class found by ImageFolder
model = SimpleCNN(numClasses=numClasses)

## III. Training

In [91]:
# LOSS FUNCTION
# cross-entropy: measures how far the model's predictions are from the true labels
critereon = nn.CrossEntropyLoss()

# OPTIMISER
# Adam adjusts the model's weights, using the gradients of the loss
learningRate = 0.001
optimiser = optim.Adam(model.parameters(), lr=learningRate)

In [92]:
def train(model, trainLoader, critereon, optimiser, epochs=5):
    """Train `model` in place, printing the mean batch loss and the accuracy after every epoch.

    Args:
        model (nn.Module): network to optimise; its weights are updated in place.
        trainLoader (Iterable): yields `(images, labels)` batches (e.g. a DataLoader).
        critereon (Callable): loss function, called as `critereon(outputs, labels)`.
        optimiser (torch.optim.Optimizer): optimiser holding the parameters of `model`.
        epochs (int, optional): number of complete passes over `trainLoader`. Defaults to 5.

    Raises:
        ValueError: if `trainLoader` yields no samples (there is nothing to average over).
    """

    # EPOCH LOOP
    # an epoch is a complete pass through the dataset
    # we do this several times, to allow the model to learn
    for epoch in range(epochs):

        model.train() # set the model to training mode (this is necessary for dropout and batch normalisation)

        # initialise some statistic-trackers
        runningLoss = 0.0 # cumulative loss
        batches = 0 # batches seen (counted here, so the loader does not need to support len())
        correct, total = 0, 0 # correct predictions, total predictions

        # TRAINING LOOP
        for images, labels in trainLoader:
            # reset the gradients; PyTorch accumulates them across backward passes otherwise
            optimiser.zero_grad()

            # FORWARD PASS
            # pass the images into the model, producing a prediction
            outputs = model(images)

            # COMPUTE LOSS
            # compare the model's predictions to the true labels
            loss = critereon(outputs, labels)

            # BACKPROPAGATION
            # compute the gradients of the loss, with respect to the model's parameters/weights
            loss.backward()
            # and update the model's weights accordingly
            optimiser.step()

            # track stats
            runningLoss += loss.item()
            batches += 1
            _, predicted = torch.max(outputs, 1) # index of the highest logit = predicted class

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # an empty loader would otherwise surface as an opaque ZeroDivisionError below
        if batches == 0 or total == 0:
            raise ValueError('trainLoader yielded no samples; there is nothing to train on')

        print(f'(Epoch {epoch+1}) Loss: {runningLoss/batches}\t Accuracy: {correct/total}')

# run the training loop: five complete passes over the training batches
train(
    model=model,
    trainLoader=trainLoader,
    critereon=critereon,
    optimiser=optimiser,
    epochs=5,
)

(Epoch 1) Loss: 17.850367426872253	 Accuracy: 0.13333333333333333
(Epoch 2) Loss: 3.849959373474121	 Accuracy: 0.3333333333333333
(Epoch 3) Loss: 0.41283509880304337	 Accuracy: 0.7333333333333333
(Epoch 4) Loss: 0.024807718116790056	 Accuracy: 1.0
(Epoch 5) Loss: 0.02074203360825777	 Accuracy: 1.0


## IV. Validation

Seen data (hopefully, as expected).

In [93]:
# LOAD THE TEST IMAGE
testImagePath = os.path.join(rootDir, '..', 'data', 'misc', "JeffWayne'sMusicalVersionofTheWaroftheWorlds_JeffWayne(2).png")

# the context manager closes the file handle once the pixels have been read
with Image.open(testImagePath) as rawImage:
    # force 3-channel RGB: a PNG may be RGBA / palette / greyscale,
    # which neither the 3-channel normalisation nor conv1 can take
    testImage = transform(rawImage.convert('RGB'))
testImage = testImage.unsqueeze(0) # add batch dimension, as model expects it

model.eval() # switch to evaluation mode

# no gradients are needed for inference
with torch.no_grad():
    outputs = model(testImage)

    # turn the logits into probabilities (these sum to 1 across the classes)
    probabilities = torch.nn.functional.softmax(outputs, dim=1)
    predictedProb, predictedClass = torch.max(probabilities, 1)

    print(f'Predicted: {dataSet.classes[predictedClass.item()]} ({predictedProb.item()})')

Predicted: JeffWayne'sMusicalVersionofTheWaroftheWorlds_JeffWayne (0.999643087387085)


Unseen data (the model is not expected to perform well).

In [94]:
# LOAD THE TEST IMAGE
testImagePath = os.path.join(rootDir, '..', 'data', 'misc', 'ABBA_ABBA.png')

# the context manager closes the file handle once the pixels have been read
with Image.open(testImagePath) as rawImage:
    # force 3-channel RGB: a PNG may be RGBA / palette / greyscale,
    # which neither the 3-channel normalisation nor conv1 can take
    testImage = transform(rawImage.convert('RGB')).unsqueeze(0) # also add the batch dimension

model.eval() # switch to evaluation mode

# no gradients are needed for inference
with torch.no_grad():
    outputs = model(testImage)

    # turn the logits into probabilities (these sum to 1 across the classes)
    probabilities = torch.nn.functional.softmax(outputs, dim=1)
    predictedProb, predictedClass = torch.max(probabilities, 1)

    print(f'Predicted: {dataSet.classes[predictedClass.item()]} ({predictedProb.item()})')

Predicted: OKComputer_Radiohead (0.6633164286613464)


Note: the sum of the probabilities will be 1. Therefore, with a small dataset, the model may be confidently wrong (i.e. a high probability for the wrong class), as this is more a measure of 'how confident I am it is this, compared to the other options', as opposed to 'how confident I am that it is this, and not anything else'. This is important to note when interpreting the results (we should have a high standard for confidence). However, as the dataset grows, the likelihood of any one class being highly-favoured, when the true result lies outside of the trained classes, should decrease.