# MNIST Classifier

Uses the hyperparameters in the following cell to adjust the model:

In [47]:
BATCH_SIZE = 128
NUM_EPOCHS = 10
LEARNING_RATE = 0.01
HIDDEN_LAYERS = 10
HIDDEN_LAYER_UNITS = 256
DATA_SIZE = None # set to None, or if you want to nerf the model then set it to the number of training points to use
DROPOUT = 0
USE_BATCH_NORM = False
L1 = 0
L2 = 0

In [48]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

# Check if a GPU is available
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    print('Running on GPU: ', end = "")
    print(torch.cuda.get_device_name(0), end = "")
    print(" 😎")
else:
    device = torch.device('cpu')
    print('Running on CPU')
    print("rip ☠️")

# Load the MNIST dataset
mnist_train = datasets.MNIST(root='.', train=True, transform=transforms.ToTensor(), download=True)
mnist_train, mnist_val = torch.utils.data.random_split(mnist_train, [len(mnist_train) - 10000, 10000])
if DATA_SIZE:
    mnist_train, _ = torch.utils.data.random_split(mnist_train, [DATA_SIZE, len(mnist_train) - DATA_SIZE])

mnist_test = datasets.MNIST(root='.', train=False, transform=transforms.ToTensor(), download=True)

# Create data loaders for the training and test sets
train_loader = torch.utils.data.DataLoader(mnist_train, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(mnist_test, batch_size=BATCH_SIZE, shuffle=False)
val_loader = torch.utils.data.DataLoader(mnist_val, batch_size=BATCH_SIZE, shuffle=False)

# Define the neural network
class MNISTClassifier(nn.Module):
    def __init__(self, hidden_layers, hidden_units, num_classes, dropout= DROPOUT, bn = USE_BATCH_NORM):
        super().__init__()
        self.hidden_layers = hidden_layers
        self.hidden_units = hidden_units
        self.layers = nn.ModuleList([nn.Linear(28 * 28, hidden_units)])
        self.layers.extend([nn.Linear(hidden_units, hidden_units) for _ in range(hidden_layers - 1)])
        self.layers.append(nn.Linear(hidden_units, num_classes))
        self.dropout = dropout
        self.batch_norm = nn.BatchNorm1d(hidden_units)
        self.bn = bn

    def forward(self, x):
        x = x.reshape(-1, 28 * 28)
        for i in range(self.hidden_layers):
            x = torch.relu(self.layers[i](x))
            if self.bn:
                x = self.batch_norm(x)
            x = F.dropout(x, self.dropout)
        x = self.layers[-1](x)
        return x

# Initialize the model
model = MNISTClassifier(hidden_layers=HIDDEN_LAYERS, hidden_units=HIDDEN_LAYER_UNITS, num_classes=10).to(device)

# Define the loss function and the optimizer
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

# Train the model
def train(model, train_loader, val_loader, criterion, optimizer, epochs, L1, L2):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        for images, labels in tqdm(train_loader, total=len(train_loader), desc=f'Epoch {epoch + 1}/{epochs}'):
            images = images.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)

            # Get training statistics
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            loss = criterion(outputs, labels)

            # Add L1 regularization loss
            reg_loss = 0
            for param in model.parameters():
                reg_loss += torch.sum(torch.abs(param))
            loss += L1 * reg_loss

            # Add L2 regularization loss
            reg_loss = 0
            for param in model.parameters():
                reg_loss += torch.sum(torch.sum(param**2))
            loss += 0.5 * L2 * reg_loss
            
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        accuracy = 100 * correct / total
        val_accuracy, val_loss = validate(model, val_loader, criterion)
        print(f'Training   | Loss: {running_loss / len(train_loader):.4f} | Accuracy: {accuracy:.2f}% ')
        print(f'Validation | Loss: {val_loss / len(train_loader):.4f} | Accuracy: {val_accuracy:.2f}% ')


# Test the model
def validate(model, val_loader, criterion):
    with torch.no_grad():
        correct = 0
        total = 0
        val_loss = 0
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        accuracy = 100 * correct / total
        return accuracy, val_loss
        # print()
        # print(f'Accuracy of the network on the 10000 test images: {accuracy}')

def test(model, test_loader):
    print()
    print("Results on test data:")
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        all_labels = []
        all_preds = []
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            all_labels.extend(labels.tolist())
            all_preds.extend(predicted.tolist())
        accuracy = 100 * correct / total
        print(f'Accuracy: {accuracy:.2f}%')
        
        # Calculate confusion matrix
        confusion_matrix = torch.zeros(10, 10, dtype=torch.int64)
        for actual, pred in zip(all_labels, all_preds):
            confusion_matrix[actual, pred] += 1
        print()
        print(f'Confusion matrix: \n{confusion_matrix}')
        
        # Calculate precision, recall and F1 score
        precision = recall = f1 = 0
        for i in range(10):
            true_positive = confusion_matrix[i][i]
            false_positive = confusion_matrix[:, i].sum() - true_positive
            false_negative = confusion_matrix[i].sum() - true_positive
            if true_positive > 0:
                precision += true_positive / (true_positive + false_positive)
                recall += true_positive / (true_positive + false_negative)
                f1 += 2 * true_positive / (2 * true_positive + false_positive + false_negative)
        precision /= 10
        recall /= 10
        f1 /= 10
        print()
        print(f'Precision: {precision:.2f}')
        print(f'Recall: {recall:.2f}')
        print(f'F1 Score: {f1:.2f}')


Running on GPU: NVIDIA GeForce RTX 3060 Laptop GPU 😎


In [49]:
train(model, train_loader, val_loader, criterion, optimizer, NUM_EPOCHS, L1, L2)
test(model, test_loader)

Epoch 1/10:   0%|          | 0/391 [00:00<?, ?it/s]

Training   | Loss: 2.2889 | Accuracy: 27.03% 
Validation | Loss: 0.4587 | Accuracy: 37.97% 


Epoch 2/10:   0%|          | 0/391 [00:00<?, ?it/s]

Training   | Loss: 2.2090 | Accuracy: 48.46% 
Validation | Loss: 0.4208 | Accuracy: 52.31% 


Epoch 3/10:   0%|          | 0/391 [00:00<?, ?it/s]

Training   | Loss: 1.6435 | Accuracy: 60.91% 
Validation | Loss: 0.2213 | Accuracy: 69.13% 


Epoch 4/10:   0%|          | 0/391 [00:00<?, ?it/s]

Training   | Loss: 0.8469 | Accuracy: 75.44% 
Validation | Loss: 0.1427 | Accuracy: 79.09% 


Epoch 5/10:   0%|          | 0/391 [00:00<?, ?it/s]

Training   | Loss: 0.6259 | Accuracy: 81.54% 
Validation | Loss: 0.1157 | Accuracy: 82.92% 


Epoch 6/10:   0%|          | 0/391 [00:00<?, ?it/s]

Training   | Loss: 0.5224 | Accuracy: 84.86% 
Validation | Loss: 0.1003 | Accuracy: 85.44% 


Epoch 7/10:   0%|          | 0/391 [00:00<?, ?it/s]

Training   | Loss: 0.4590 | Accuracy: 86.75% 
Validation | Loss: 0.0899 | Accuracy: 87.17% 


Epoch 8/10:   0%|          | 0/391 [00:00<?, ?it/s]

Training   | Loss: 0.4159 | Accuracy: 88.06% 
Validation | Loss: 0.0833 | Accuracy: 88.12% 


Epoch 9/10:   0%|          | 0/391 [00:00<?, ?it/s]

Training   | Loss: 0.3841 | Accuracy: 89.04% 
Validation | Loss: 0.0784 | Accuracy: 88.70% 


Epoch 10/10:   0%|          | 0/391 [00:00<?, ?it/s]

Training   | Loss: 0.3595 | Accuracy: 89.69% 
Validation | Loss: 0.0737 | Accuracy: 89.48% 

Results on test data:
Accuracy: 89.99%

Confusion matrix: 
tensor([[ 955,    0,    5,    1,    0,   10,    5,    2,    2,    0],
        [   0, 1107,    1,    7,    0,    2,    3,    2,   13,    0],
        [  15,   19,  898,   18,   10,    2,   19,   13,   34,    4],
        [   3,    2,   24,  913,    0,   32,    0,   19,   13,    4],
        [   1,    3,    5,    1,  888,    1,   15,    1,   10,   57],
        [  17,    3,    8,   64,   10,  729,   13,    7,   33,    8],
        [  21,    3,   10,    0,   14,   20,  885,    0,    5,    0],
        [   4,   19,   23,    3,    4,    0,    0,  935,    3,   37],
        [   5,   12,    8,   39,   10,   45,   17,    7,  808,   23],
        [  12,    4,    1,   10,   39,   11,    0,   44,    7,  881]])

Precision: 0.90
Recall: 0.90
F1 Score: 0.90
