# MNIST Fashion Task for MLP implementation with PyTorch
### Initial preparations

In [1]:
# Import libraries
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
import pandas as pd
from collections import Counter

# PyTorch for MLP
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms

# Dataset
from torchvision.datasets import FashionMNIST
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Fix random seeds
torch.manual_seed(123)
np.random.seed(123)

In [3]:
# Transformation pipeline
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

### Load training data

In [4]:
# Load FashionMNIST training dataset
fashion = FashionMNIST(
    root='./data', train=True, download=True, transform=transform
)

### Prepare the training data

In [5]:
# Split input features and labels
# Initialise lists
X = []
y = []

# Iterate through the data and split it
for image, label in fashion:
    X.append(image.flatten().numpy())
    y.append(label)

# Convert to numpy array
X = np.array(X)
y = np.array(y)

# Convert to PyTorch tensor
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

### Prepare the neural network

In [6]:
# Configure network architecture
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()  # Secures inheritance from the MLP class
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.fc1 = nn.Linear(input_size, hidden_size)  # Fully Connected (dense) layers
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.activation = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # Forward pass through the network
        x = self.activation(self.fc1(x))
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [7]:
# Set architecture constants
input_size = X.shape[1]
hidden_size = 64
output_size = len(np.unique(y))

### Train the network

In [8]:
# Training loop
# Build the network
mlp = MLP(input_size, hidden_size, output_size)

# Define the loss function
criterion = nn.CrossEntropyLoss()

# Select the optimiser
optimizer = optim.SGD(mlp.parameters(), lr=0.01)

# Set the number of epochs
epochs = 10

# Train the network
for epoch in range(epochs):
    # Forward pass
    outputs = mlp(X_tensor)

    # Compute loss
    loss = criterion(outputs, y_tensor)

    # Zero gradients
    optimizer.zero_grad()

    # Backward pass
    loss.backward()

    # Update weights
    optimizer.step()

    # Print loss
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Epoch [1/10], Loss: 2.3046
Epoch [2/10], Loss: 2.3045
Epoch [3/10], Loss: 2.3043
Epoch [4/10], Loss: 2.3042
Epoch [5/10], Loss: 2.3040
Epoch [6/10], Loss: 2.3039
Epoch [7/10], Loss: 2.3037
Epoch [8/10], Loss: 2.3035
Epoch [9/10], Loss: 2.3034
Epoch [10/10], Loss: 2.3032


### Evaluate the network

In [9]:
# Make predictions
with torch.no_grad():
    predictions = torch.argmax(mlp(X_tensor), dim=1)
print(f"Predictions: {predictions.numpy()}")

Predictions: [5 3 3 ... 5 7 7]


In [10]:
# Load test data
fashion_test = FashionMNIST(
    root='./data', train=False, download=True, transform=transform
)

In [11]:
# Prepare the test data for evaluation
# Initialise lists
X_test = []
y_test = []

# Iterate through the data and split it
for image, label in fashion_test:
    X_test.append(image.flatten().numpy())
    y_test.append(label)

# Convert to numpy array
X_test = np.array(X_test)
y_test = np.array(y_test)

# Convert to PyTorch tensor
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [12]:
# Evaluation
with torch.no_grad():
    # Run the model without gradients (ie with the weights arrived at at the
    # end of training)
    outputs_test = mlp(X_test_tensor)

    # Assign classes to the output
    _, predicted = torch.max(outputs_test, 1)  # Predicted will be a tensor

    # Find accuracy using sklearn accuracy_score, which requires numpy array
    accuracy = accuracy_score(y_test, predicted.numpy())
print(f"Test accuracy: {accuracy:.2f}")

Test accuracy: 0.09


The initial implementation with: lr = 0.01, hidden_size = 64, epochs = 10 achieves test accuracy of 0.09 but the target is a test accuracy >= 0.80.

Functionalise the above code for convenient testing and optimisation.

In [13]:
def eval_train_net(
    hidden_size, learning_rate, epochs,
    X_tensor, y_tensor, X_test_tensor, y_test
):
    '''
    Function to train and evaluate a neural network with one fully-connected
    hidden layer. It builds and trains the network, evaluating the model every
    100 epochs. The function returns the selected parameters and the accuracy
    of the final model.
    Parameters:
    hidden_size = number of neurons in the hidden layer
    learning_rate = learning rate (0, 1)
    epochs = number of training epochs, integer
        X_tensor = training data as PyTorch tensor
        X_test_tensor = test data as PyTorch tensor
    Output:
        result = list of hidden_size, learning_rate, epochs, accuracy
    '''
    # Build the network
    mlp = MLP(input_size, hidden_size, output_size)

    # Define the loss function
    criterion = nn.CrossEntropyLoss()

    # Select the optimiser
    optimizer = optim.SGD(mlp.parameters(), lr=learning_rate)

    # Train the network
    for epoch in range(epochs):
        # Forward pass
        outputs = mlp(X_tensor)

        # Compute loss
        loss = criterion(outputs, y_tensor)

        # Zero gradients
        optimizer.zero_grad()

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        # Evaluate the network every 100 iterations
        if epoch % 100 == 0:
            # model.eval()
            with torch.no_grad():
                # Run the model without gradients (ie with the weights arrived
                # at at the end of training)
                outputs_test = mlp(X_test_tensor)

                # Assign classes to the output
                _, predicted = torch.max(outputs_test, 1)

            # Find accuracy using sklearn accuracy_score
            accuracy = accuracy_score(y_test, predicted.numpy())
            print(
                f"Epoch [{epoch+1}/{epochs}], Loss: {loss:.4f}, "
                f"Accuracy: {accuracy:.2f}"
            )

    result = [hidden_size, learning_rate, epochs, accuracy]
    return result, mlp

Baseline learning rate is slow and the number of epochs is low. Increasing both should improve the accuracy. Set learning rate 0.1 and extend the epochs to 2,001. By reporting the loss and accuracy every 100 epochs, I will be able to choose a suitable value for epochs.

In [14]:
result, _ = eval_train_net(
    hidden_size, 0.1, 2001, X_tensor, y_tensor, X_test_tensor, y_test_tensor
)
print(result)

Epoch [1/2001], Loss: 2.3023, Accuracy: 0.14
Epoch [101/2001], Loss: 1.8682, Accuracy: 0.67
Epoch [201/2001], Loss: 1.7509, Accuracy: 0.74
Epoch [301/2001], Loss: 1.7231, Accuracy: 0.76
Epoch [401/2001], Loss: 1.7078, Accuracy: 0.77
Epoch [501/2001], Loss: 1.6975, Accuracy: 0.78
Epoch [601/2001], Loss: 1.6901, Accuracy: 0.78
Epoch [701/2001], Loss: 1.6844, Accuracy: 0.78
Epoch [801/2001], Loss: 1.6800, Accuracy: 0.79
Epoch [901/2001], Loss: 1.6763, Accuracy: 0.79
Epoch [1001/2001], Loss: 1.6731, Accuracy: 0.79
Epoch [1101/2001], Loss: 1.6705, Accuracy: 0.79
Epoch [1201/2001], Loss: 1.6681, Accuracy: 0.80
Epoch [1301/2001], Loss: 1.6661, Accuracy: 0.80
Epoch [1401/2001], Loss: 1.6643, Accuracy: 0.80
Epoch [1501/2001], Loss: 1.6626, Accuracy: 0.80
Epoch [1601/2001], Loss: 1.6612, Accuracy: 0.80
Epoch [1701/2001], Loss: 1.6598, Accuracy: 0.80
Epoch [1801/2001], Loss: 1.6586, Accuracy: 0.80
Epoch [1901/2001], Loss: 1.6574, Accuracy: 0.80
Epoch [2001/2001], Loss: 1.6564, Accuracy: 0.80
[64,

These results indicate 1200 epochs is sufficient to achieve accuracy >= 0.80.

### Select and tune a hyperparameter
Hyperparameters control how a machine learning algorithm learns. They are user-defined constants in the training process, such as hidden_size, learning_rate and epochs. The baseline neural network was not particularly accurate, and by increasing learning_rate and epochs, I've improved the accuracy. I will fix the learning rate and epochs, although if a further significant improvement in performance occurs, it will be necessary to revisit both.

An important hyperparameter that is _hidden_ in this implementation is the batch size. The batch size is the number of subsamples used to update the gradients. The best measure of the gradients is to use all the samples before updating. This is computationally expensive and can lead to training scenarios where the model gets "stuck" at a local minima. But this approach takes the most direct route to convergence on the solution. Methods that use all samples in this way have a batch size equal to the number of samples, and are called (full) Batch Gradient Descent (BGD). 

The extreme opposite approach to BGD is Stochastic Gradient Descent (SGD), which calculates the gradient for a single sample before updating ie batch size of 1. This introduces noise into the gradient, which can be helpful in enabling the model to escape local minima during training, but the approach to convergence is indirect. SGD is computationally cheap in the backward pass, so for some models this can be tolerated.

Mini-Batch Gradient Descent (MBGD) lies between BGD and SGD, combining benefits of both. In this case the batch size becomes a tunable parameter capable of balancing computational cost and training stability. For classification problems batches are typically powers of 2 eg 32, 64, 128, 256 and 512. The batches are chosen without replacement. A training epoch completes once all the samples have been used. Since the largest power of 2 less than 60,000 is 2^15 =  32,768 and this is a lot less than 60,000 I will use multiples of 10 for my batches to avoid losing almost half of my training dataset. 

For this part of the task, I will tune batch size.

Tutorial reference: https://codesignal.com/learn/courses/pytorch-techniques-for-model-optimization/lessons/model-training-with-mini-batches-in-pytorch

PyTorch DataLoader creates mini-batches from the dataset. Pytorch TensorDataset(X, y) combines X and y into one tensor.

In [15]:
# Set batch size
batch_size = 10

Rework my eval_train_net function to fix hidden_size at 64, learning_rate at 0.1, and use DataLoader for the batches.

In [16]:
def eval_batch(
    X_tensor, y_tensor, X_test_tensor, y_test_tensor,
    batch_size, epochs=10, hidden_size=64, learning_rate=0.1
):
    '''
    Function to train and evaluate a neural network with one fully-connected
    hidden layer. It builds and trains the network, evaluating the model every
    100 epochs. The function returns the selected parameters and the accuracy
    of the final model.
    Parameters:
    hidden_size = number of neurons in the hidden layer
    learning_rate = learning rate (0, 1)
    epochs = number of training epochs, integer
        X_tensor = training data as PyTorch tensor
        X_test_tensor = test data as PyTorch tensor
    Output:
        result = list of hidden_size, learning_rate, epochs, accuracy
    '''
    # Combine dataset into one tensor
    dataset = TensorDataset(X_tensor, y_tensor)

    # Build the network
    mlp = MLP(input_size, hidden_size, output_size)

    # Define the loss function
    criterion = nn.CrossEntropyLoss()

    # Select the optimiser
    optimizer = optim.SGD(mlp.parameters(), lr=learning_rate)

    # Train the network in mini-batches
    for epoch in range(epochs):
        # Draw random mini-batch
        data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # Train the network on each mini-batch
        for batch_X, batch_y in data_loader:

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = mlp(batch_X)

            # Compute loss
            loss = criterion(outputs, batch_y)

            # Backward pass
            loss.backward()

            # Update weights
            optimizer.step()

        # Evaluate the network every 2 iterations
        if epoch % 2 == 0:
            with torch.no_grad():
                # Run the model without gradients (ie with the weights arrived
                # at at the end of training)
                outputs_test = mlp(X_test_tensor)

                # Assign classes to the output
                _, predicted = torch.max(outputs_test, 1)

            # Find accuracy using sklearn accuracy_score
            accuracy = accuracy_score(y_test, predicted.numpy())
            print(
                f"Epoch [{epoch+1}/{epochs}], Loss: {loss:.4f}, "
                f"Accuracy: {accuracy:.2f}"
            )

    result = [batch_size, accuracy]
    return result, mlp, accuracy

Test the eval_batch with batch_size = 10.

In [17]:
result, mlp, accuracy = eval_batch(
    X_tensor, y_tensor, X_test_tensor, y_test_tensor, batch_size
)
print(result)
print(mlp)
print(accuracy)

Epoch [1/10], Loss: 1.6277, Accuracy: 0.81
Epoch [3/10], Loss: 1.4615, Accuracy: 0.84
Epoch [5/10], Loss: 1.5591, Accuracy: 0.84
Epoch [7/10], Loss: 1.5436, Accuracy: 0.84
Epoch [9/10], Loss: 1.5615, Accuracy: 0.85
[10, 0.8527]
MLP(
  (fc1): Linear(in_features=784, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=10, bias=True)
  (activation): ReLU()
  (softmax): Softmax(dim=1)
)
0.8527


This shows a significant performance improvement both in run time (not recorded but I estimate 10-20 seconds rather than 1-2 minutes) and accuracy improves to 0.85.

This is encouraging but it isn't hyperparameter tuning, since a batch size of 10 was simply chosen as a starting point.

In [18]:
# Define list of batch_sizes to try
batch_ls = [2, 4, 6, 10, 15, 20]

In [19]:
# Initialise result list of result tuples
batch_results = []

# Loop through batch_ls sizes
for batch in batch_ls:
    result, _, _ = eval_batch(
        X_tensor, y_tensor, X_test_tensor, y_test_tensor, batch
    )
    batch_results.append(result)

Epoch [1/10], Loss: 1.9611, Accuracy: 0.59
Epoch [3/10], Loss: 1.4612, Accuracy: 0.67
Epoch [5/10], Loss: 1.4612, Accuracy: 0.65
Epoch [7/10], Loss: 1.4612, Accuracy: 0.67
Epoch [9/10], Loss: 1.9612, Accuracy: 0.61
Epoch [1/10], Loss: 1.7112, Accuracy: 0.75
Epoch [3/10], Loss: 1.7112, Accuracy: 0.76
Epoch [5/10], Loss: 1.4612, Accuracy: 0.79
Epoch [7/10], Loss: 1.4612, Accuracy: 0.78
Epoch [9/10], Loss: 1.4612, Accuracy: 0.79
Epoch [1/10], Loss: 1.6278, Accuracy: 0.80
Epoch [3/10], Loss: 1.6278, Accuracy: 0.82
Epoch [5/10], Loss: 1.4613, Accuracy: 0.83
Epoch [7/10], Loss: 1.6278, Accuracy: 0.83
Epoch [9/10], Loss: 1.6278, Accuracy: 0.84
Epoch [1/10], Loss: 1.5648, Accuracy: 0.78
Epoch [3/10], Loss: 1.5614, Accuracy: 0.80
Epoch [5/10], Loss: 1.7571, Accuracy: 0.81
Epoch [7/10], Loss: 1.5612, Accuracy: 0.82
Epoch [9/10], Loss: 1.6594, Accuracy: 0.80
Epoch [1/10], Loss: 1.5884, Accuracy: 0.80
Epoch [3/10], Loss: 1.7712, Accuracy: 0.81
Epoch [5/10], Loss: 1.5940, Accuracy: 0.80
Epoch [7/10

In [20]:
# Display the results using a pandas DataFrame
df = pd.DataFrame(batch_results, columns=['Batch_size', 'Accuracy_score'])
df

Unnamed: 0,Batch_size,Accuracy_score
0,2,0.6088
1,4,0.789
2,6,0.8362
3,10,0.7961
4,15,0.8182
5,20,0.8106


The results show I was lucky with the batch size of 10 as a test run, since it does seem to be the best option.

In [21]:
# Revisiting learning_rate=0.01.
result, mlp_0_01, accuracy = eval_batch(
    X_tensor, y_tensor, X_test_tensor, y_test_tensor,
    batch_size=10, epochs=20, hidden_size=64, learning_rate=0.01
)
print(result)

Epoch [1/20], Loss: 1.8117, Accuracy: 0.78
Epoch [3/20], Loss: 1.6694, Accuracy: 0.80
Epoch [5/20], Loss: 1.5504, Accuracy: 0.80
Epoch [7/20], Loss: 1.6221, Accuracy: 0.81
Epoch [9/20], Loss: 1.6510, Accuracy: 0.81
Epoch [11/20], Loss: 1.6749, Accuracy: 0.81
Epoch [13/20], Loss: 1.6295, Accuracy: 0.81
Epoch [15/20], Loss: 1.7970, Accuracy: 0.81
Epoch [17/20], Loss: 1.6630, Accuracy: 0.82
Epoch [19/20], Loss: 1.5642, Accuracy: 0.82
[10, 0.8179]


The learning rate of 0.01 has a similar performance to using 0.1 at 10 epochs. However, with more than 10 epochs, the accuracy continues to improve. This suggests 0.01 is a better choice, as it is more likely to avoid overfitting.

### Confusion matrix

In [22]:
def evaluate_model(model, test_loader):
    '''
    Function provided in the task to evaluate the model. The code provided in
    the task to print the confusion matrix:
    pred, labels = evaluate_model(model, test_loader)
    conf_matrix = confusion_matrix(labels, preds)
    print("Confusion Matrix: ")
    print(conf_matrix)
    '''
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())
    return all_preds, all_labels

In [23]:
# Prepare test_data
test_data = TensorDataset(X_test_tensor, y_test_tensor)

# Set batch size
test_batch = 10

# Prepare test_loader
test_loader = DataLoader(test_data, batch_size=test_batch, shuffle=True)

In [24]:
# Print the confusion matrix
preds, labels = evaluate_model(mlp_0_01, test_loader)
conf_matrix = confusion_matrix(labels, preds)
print("Confusion Matrix: ")
print(conf_matrix)

Confusion Matrix: 
[[905   0  24  45   7   1   0   1  17   0]
 [  4 961   2  25   6   0   0   0   2   0]
 [ 23   4 794  17 154   1   0   0   7   0]
 [ 40  12  18 890  35   1   0   0   4   0]
 [  3   1  92  34 865   0   0   0   5   0]
 [  1   0   0   1   0 891   0  64   4  39]
 [315   1 230  51 379   0   0   0  24   0]
 [  0   0   0   0   0  15   0 932   1  52]
 [  5   1  10   7   4   4   0   8 960   1]
 [  0   0   0   0   0   7   0  32   1 960]]


In a confusion matrix, the rows represent observed data (aka ground truth) and the columns represent predicted data. The NW-SE diagonal quantifies how good the model is at correctly predicting each class. Thus the confusion matrix for a perfect set of predictions would be a scalar multiple of the identity matrix ie all off-diagonal elements would be zero.

Before interpreting the confusion matrix, let's check the distribution of unique values in preds (predictions) and labels (ground truth). Ideally the unique values will be uniformly distributed, if they are not, the imbalance can influence the results.

In [25]:
# Find the unique counts in preds and labels
unique_pred = Counter(preds).keys()
unique_pred_count = Counter(preds).values()

unique_label = Counter(labels).keys()
unique_label_count = Counter(labels).values()

print("Predictions: ")
print(unique_pred)
print(unique_pred_count)

print("Labels: ")
print(unique_label)
print(unique_label_count)

Predictions: 
dict_keys([8, 5, 9, 4, 0, 1, 3, 2, 7])
dict_values([1025, 920, 1052, 1450, 1296, 980, 1070, 1170, 1037])
Labels: 
dict_keys([8, 5, 9, 4, 0, 6, 1, 3, 2, 7])
dict_values([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000])


The predicted and ground truth data unique values are evenly distributed so we can consider the confusion matrix. The rows with the most non-zero elements off the diagonal have indices 4 and 6. The model has the greatest difficulty correctly identifying those elements. In particular, this model never correctly identifies row 6.

We can also see this from the classification report, where precision and f1-scores for rows 4 and 6 are significantly less than for the other rows. Recall is more variable across all the classes. The overall accuracy is 0.82.

In [26]:
# Classification report
print(classification_report(labels, preds, digits=3, zero_division=0))

              precision    recall  f1-score   support

           0      0.698     0.905     0.788      1000
           1      0.981     0.961     0.971      1000
           2      0.679     0.794     0.732      1000
           3      0.832     0.890     0.860      1000
           4      0.597     0.865     0.706      1000
           5      0.968     0.891     0.928      1000
           6      0.000     0.000     0.000      1000
           7      0.899     0.932     0.915      1000
           8      0.937     0.960     0.948      1000
           9      0.913     0.960     0.936      1000

    accuracy                          0.816     10000
   macro avg      0.750     0.816     0.778     10000
weighted avg      0.750     0.816     0.778     10000



Let's try the mlp model (ie learning rate 0.1).

In [27]:
# Print the confusion matrix
preds, labels = evaluate_model(mlp, test_loader)
conf_matrix = confusion_matrix(labels, preds)
print("Confusion Matrix: ")
print(conf_matrix)

Confusion Matrix: 
[[842   1   9  32   4   0  90   1  21   0]
 [  4 960   0  25   5   0   5   0   1   0]
 [ 15   2 698  18 134   0 129   0   4   0]
 [ 39  11   8 866  38   0  30   0   8   0]
 [  1   1  64  33 793   0 101   0   7   0]
 [  0   0   0   1   0 894   1  64   6  34]
 [161   1  75  38  77   0 626   1  21   0]
 [  0   0   0   0   0  16   0 957   0  27]
 [  0   1   1   7   4   2  14   6 965   0]
 [  0   0   0   0   0   6   0  58   1 935]]


In [28]:
# Classification report
print(classification_report(labels, preds, digits=3, zero_division=0))

              precision    recall  f1-score   support

           0      0.793     0.842     0.817      1000
           1      0.983     0.960     0.971      1000
           2      0.816     0.698     0.753      1000
           3      0.849     0.866     0.857      1000
           4      0.752     0.793     0.772      1000
           5      0.974     0.894     0.932      1000
           6      0.629     0.626     0.627      1000
           7      0.880     0.957     0.917      1000
           8      0.933     0.965     0.949      1000
           9      0.939     0.935     0.937      1000

    accuracy                          0.854     10000
   macro avg      0.855     0.854     0.853     10000
weighted avg      0.855     0.854     0.853     10000



This is a much better result as the model now predicts all possible outcomes. The classes 4 and 6 are still the most challenging for the model: these classes are coat and shirt, which are clearly more similar in shape than, say ankle boot and coat.