• Implement a simple CNN with a convolutional layer having 32 filters of size 3x3, a maxpool layer, a fully connected layer with 128 neurons and an output layer with 10 neurons (for the 10 classes) and ReLU activation. Train on MNIST dataset.
• Additionally, use any two pretrained CNNs of your choice (e.g. AlexNet, MobileNet, or EfficientNet) for inference.
• Compare all three models:
	- Accuracy, F1-score, confusion matrix
	- Model size (number of parameters)
	- Inference time on test set


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import time


# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    Device = torch.device("cuda")
else:
    Device = torch.device("cpu")

# MNIST preprocessing pipeline

transform = transforms.Compose(
    [
        # Convert each image to a tensor
        transforms.ToTensor(),
        # Subtract 0.5 and divide by 0.5 on the single grayscale channel
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# Training and test splits, stored under ./datasets (downloaded if missing)
train_dataset = datasets.MNIST(root='./datasets', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./datasets', train=False, download=True, transform=transform)

# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True, num_workers=0)
test_loader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False, num_workers=0)


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./datasets\MNIST\raw\train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:14<00:00, 665kB/s] 


Extracting ./datasets\MNIST\raw\train-images-idx3-ubyte.gz to ./datasets\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./datasets\MNIST\raw\train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 107kB/s]


Extracting ./datasets\MNIST\raw\train-labels-idx1-ubyte.gz to ./datasets\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./datasets\MNIST\raw\t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:03<00:00, 492kB/s]


Extracting ./datasets\MNIST\raw\t10k-images-idx3-ubyte.gz to ./datasets\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./datasets\MNIST\raw\t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 765kB/s]

Extracting ./datasets\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./datasets\MNIST\raw






In [2]:
class SimpleCNN(nn.Module):
    """Small CNN classifier: conv(3x3, 32 filters) -> ReLU -> 2x2 max-pool -> FC(128) -> ReLU -> FC.

    With the default arguments this is the MNIST model: one grayscale input
    channel, 28x28 images and 10 output classes.

    Args:
        in_channels: Number of channels in the input images.
        num_classes: Number of output logits (one per class).
        image_size: Side length of the (square) input images. The convolution
            keeps the spatial size (3x3 kernel, stride 1, padding 1) and the
            2x2 pooling halves it, so the flattened feature size is
            ``32 * (image_size // 2) ** 2``.
    """

    def __init__(self, in_channels=1, num_classes=10, image_size=28):
        super().__init__()
        # 32 filters of size 3x3; padding=1 preserves height and width
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(2, 2)  # 2x2 max pooling halves height and width
        pooled_size = image_size // 2
        # Fully connected layers
        self.fc1 = nn.Linear(32 * pooled_size * pooled_size, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return raw class logits of shape (batch, num_classes) for a batch of images."""
        x = self.pool(self.relu(self.conv1(x)))
        # flatten(1) keeps the batch dimension and also works on
        # non-contiguous inputs, where view() would raise
        x = x.flatten(1)
        x = self.relu(self.fc1(x))
        return self.fc2(x)  # logits; no softmax applied here


In [4]:
# Training function
def train_model(model, train_loader, epochs=150, lr=0.0005, device=None):
    """Train ``model`` with Adam and cross-entropy loss, printing the mean loss per epoch.

    Args:
        model: Network to train; it is moved to ``device`` in place.
        train_loader: Iterable yielding ``(images, labels)`` batches.
        epochs: Number of full passes over ``train_loader``.
        lr: Learning rate passed to Adam.
        device: Device to train on. Defaults to the module-level ``Device``.

    Returns:
        The same ``model`` object after training.
    """
    if device is None:
        device = Device
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        samples_seen = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # The criterion returns the batch mean, so weight it by the batch
            # size to get a correct per-sample average over the epoch.
            batch_size = images.size(0)
            running_loss += loss.item() * batch_size
            samples_seen += batch_size
        # Divide by the samples actually processed: len(loader.dataset) is
        # wrong when a sampler or drop_last skips part of the dataset.
        epoch_loss = running_loss / samples_seen if samples_seen else float("nan")
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}")
    print("Training finished")
    return model


In [5]:
# Evaluation function

def evaluate_model(model, loader):
    """Run ``model`` over ``loader`` without gradients and compute classification metrics.

    Args:
        model: Network to evaluate; it must already be on ``Device``.
        loader: Iterable yielding ``(images, labels)`` batches.

    Returns:
        Tuple ``(acc, f1, cm, inference_time)``: accuracy, weighted F1-score,
        confusion matrix, and elapsed seconds for the whole prediction loop.
        The elapsed time includes batch loading and any dataset transforms,
        not only the forward passes.
    """
    model.eval()
    all_preds, all_labels = [], []
    # perf_counter() is monotonic and high-resolution; time.time() is
    # wall-clock time and can jump or be coarse on some platforms.
    start_time = time.perf_counter()

    with torch.no_grad():
        for images, labels in loader:
            images = images.to(Device)
            outputs = model(images)
            # Predicted class = index of the largest logit
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            # .cpu() is a no-op for CPU labels and keeps this safe otherwise
            all_labels.extend(labels.cpu().numpy())

    inference_time = time.perf_counter() - start_time
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    cm = confusion_matrix(all_labels, all_preds)

    return acc, f1, cm, inference_time


In [6]:
# Build the Simple CNN, then train it on the MNIST training loader
cnn_model = SimpleCNN()
cnn_model = train_model(
    model=cnn_model,
    train_loader=train_loader,
    epochs=150,
    lr=0.0005,
)


Epoch [1/150], Loss: 0.3529
Epoch [2/150], Loss: 0.1184
Epoch [3/150], Loss: 0.0782
Epoch [4/150], Loss: 0.0603
Epoch [5/150], Loss: 0.0487
Epoch [6/150], Loss: 0.0405
Epoch [7/150], Loss: 0.0331
Epoch [8/150], Loss: 0.0302
Epoch [9/150], Loss: 0.0248
Epoch [10/150], Loss: 0.0208
Epoch [11/150], Loss: 0.0172
Epoch [12/150], Loss: 0.0158
Epoch [13/150], Loss: 0.0124
Epoch [14/150], Loss: 0.0109
Epoch [15/150], Loss: 0.0088
Epoch [16/150], Loss: 0.0082
Epoch [17/150], Loss: 0.0062
Epoch [18/150], Loss: 0.0077
Epoch [19/150], Loss: 0.0044
Epoch [20/150], Loss: 0.0044
Epoch [21/150], Loss: 0.0046
Epoch [22/150], Loss: 0.0030
Epoch [23/150], Loss: 0.0054
Epoch [24/150], Loss: 0.0052
Epoch [25/150], Loss: 0.0026
Epoch [26/150], Loss: 0.0014
Epoch [27/150], Loss: 0.0027
Epoch [28/150], Loss: 0.0010
Epoch [29/150], Loss: 0.0011
Epoch [30/150], Loss: 0.0076
Epoch [31/150], Loss: 0.0013
Epoch [32/150], Loss: 0.0003
Epoch [33/150], Loss: 0.0003
Epoch [34/150], Loss: 0.0047
Epoch [35/150], Loss: 0

In [13]:
# Input pipeline for the pretrained CNNs
preprocess = transforms.Compose(
    [
        transforms.Resize(224),
        # Replicate the single grayscale channel into 3 channels
        transforms.Grayscale(num_output_channels=3),
        transforms.ToTensor(),
        # Per-channel normalization used with the pretrained models
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# MNIST test split passed through the pretrained-model pipeline
pretrained_test_dataset = datasets.MNIST(root='./datasets', train=False, download=True, transform=preprocess)
pretrained_test_loader = DataLoader(dataset=pretrained_test_dataset, batch_size=128, shuffle=False)


In [14]:
# Load pretrained MobileNetV2 and EfficientNet-B0
mobilenet = models.mobilenet_v2(pretrained=True)
efficientnet = models.efficientnet_b0(pretrained=True)

# Replace the final classifier layer of each model with a new 10-output layer (MNIST).
# NOTE(review): these new nn.Linear layers are never trained anywhere in the
# visible notebook, so their weights stay at their random initial values and
# the models' predictions are not expected to be meaningful without fine-tuning.
mobilenet.classifier[1] = nn.Linear(in_features=mobilenet.last_channel, out_features=10)
efficientnet.classifier[1] = nn.Linear(
    in_features=efficientnet.classifier[1].in_features,
    out_features=10,
)

# Move both models to the compute device; MobileNetV2 goes to eval mode here.
mobilenet = mobilenet.to(Device).eval()
efficientnet = efficientnet.to(Device)

# Eval mode (inference only). Kept as the cell's final expression so the
# notebook still displays the EfficientNet module summary.
efficientnet.eval()




EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat

In [15]:
def evaluate_pretrained(model, loader):
    """Evaluate a pretrained model on ``loader``.

    The evaluation procedure is the same as for the Simple CNN, so this
    delegates to ``evaluate_model`` instead of duplicating its body; the
    separate name is kept for existing callers.

    Args:
        model: Network to evaluate; it must already be on ``Device``.
        loader: Iterable yielding ``(images, labels)`` batches.

    Returns:
        Tuple ``(acc, f1, cm, inference_time)`` exactly as returned by
        ``evaluate_model``.
    """
    return evaluate_model(model, loader)


In [16]:
# Simple CNN, evaluated on the MNIST test loader
acc_cnn, f1_cnn, cm_cnn, time_cnn = evaluate_model(model=cnn_model, loader=test_loader)

# MobileNetV2, evaluated on the test loader built with the pretrained-model preprocessing
(
    acc_mobilenet,
    f1_mobilenet,
    cm_mobilenet,
    time_mobilenet,
) = evaluate_pretrained(model=mobilenet, loader=pretrained_test_loader)

# EfficientNet-B0, same loader as MobileNetV2
(
    acc_efficientnet,
    f1_efficientnet,
    cm_efficientnet,
    time_efficientnet,
) = evaluate_pretrained(model=efficientnet, loader=pretrained_test_loader)


In [17]:
def count_parameters(model):
    """Return the total number of scalar elements across all parameters of ``model``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

# Parameter counts for the three models
size_cnn = count_parameters(cnn_model)
size_mobilenet = count_parameters(mobilenet)
size_efficientnet = count_parameters(efficientnet)

# One print call; sep="\n" puts each entry on its own line
print(
    "Model Sizes (number of parameters):",
    f"Simple CNN: {size_cnn}",
    f"MobileNetV2: {size_mobilenet}",
    f"EfficientNet-B0: {size_efficientnet}",
    sep="\n",
)


Model Sizes (number of parameters):
Simple CNN: 804554
MobileNetV2: 2236682
EfficientNet-B0: 4020358


In [18]:
import pandas as pd

# One row per model, in the same column order as the summary table
results = pd.DataFrame(
    [
        ("Simple CNN", acc_cnn, f1_cnn, time_cnn, size_cnn),
        ("MobileNetV2", acc_mobilenet, f1_mobilenet, time_mobilenet, size_mobilenet),
        ("EfficientNet-B0", acc_efficientnet, f1_efficientnet, time_efficientnet, size_efficientnet),
    ],
    columns=["Model", "Accuracy", "F1-score", "Inference Time (s)", "Parameters"],
)

print(results)


             Model  Accuracy  F1-score  Inference Time (s)  Parameters
0       Simple CNN    0.9888  0.988793            2.052964      804554
1      MobileNetV2    0.1058  0.053781           21.095159     2236682
2  EfficientNet-B0    0.0795  0.041837           22.646481     4020358


| Model           | Accuracy | F1-Score | Inference Time (s) | Parameters |
| --------------- | -------- | -------- | ------------------ | ---------- |
| Simple CNN      | 0.9888   | 0.988793 | 2.052964           | 804K       |
| MobileNetV2     | 0.1058   | 0.053781 | 21.095159          | 2.23M      |
| EfficientNet-B0 | 0.0795   | 0.041837 | 22.646481          | 4.02M      |

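Conclusion:
The Simple CNN performs best in both accuracy and speed because it is the only model that was actually trained on MNIST. MobileNetV2 and EfficientNet-B0 score at chance level (about 10% for 10 classes): their final classifier layers were replaced with new, randomly initialized 10-class layers that were never trained, so the pretrained features are never mapped to digit labels. The pretrained backbones were also trained on ImageNet (RGB natural images) rather than grayscale handwritten digits, so at least the new output layer must be fine-tuned on MNIST before these models can be compared fairly.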