__Importing__

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import random_split, DataLoader

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using device:", device)


Using device: cuda


__Data Preparation__

In [None]:
# Convert images to tensors, then normalize the single grayscale channel
# with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))  # single channel
])

train_dataset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Hold out 10,000 of the training images for validation. The split is drawn
# from a fixed-seed generator so it is identical on every run; otherwise the
# validation set changes between runs and results from different model
# configurations are not directly comparable. The alternative dataset blocks
# below can pass the same `generator=split_generator` argument.
split_generator = torch.Generator().manual_seed(42)
train_data, val_data = random_split(train_dataset, [50000, 10000], generator=split_generator)

# ------------------------------------------------------------------------------------------------------------------------

# transform = transforms.Compose([
#     transforms.ToTensor(),
#     transforms.Normalize((0.5,), (0.5,))
# ])

# train_dataset = torchvision.datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
# test_dataset = torchvision.datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

# train_data, val_data = random_split(train_dataset, [50000, 10000])


# ------------------------------------------------------------------------------------------------------------------------


# transform = transforms.Compose([
#     transforms.Resize((32,32)),
#     transforms.RandomHorizontalFlip(),
#     transforms.RandomCrop(32, padding=4),
#     transforms.ToTensor(),
#     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
# ])

# train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
# test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

# train_data, val_data = random_split(train_dataset, [40000, 10000])


# ------------------------------------------------------------------------------------------------------------------------


# transform = transforms.Compose([
#     transforms.ToTensor(),
#     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
# ])

# train_dataset = torchvision.datasets.SVHN(root='./data', split='train', download=True, transform=transform)
# test_dataset = torchvision.datasets.SVHN(root='./data', split='test', download=True, transform=transform)

# train_data, val_data = random_split(train_dataset, [60000, 13257])  # SVHN has ~73k train samples


# ------------------------------------------------------------------------------------------------------------------------


# transform = transforms.Compose([
#     transforms.Resize((96, 96)),
#     transforms.ToTensor(),
#     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
# ])

# train_dataset = torchvision.datasets.STL10(root='./data', split='train', download=True, transform=transform)
# test_dataset = torchvision.datasets.STL10(root='./data', split='test', download=True, transform=transform)

# train_data, val_data = random_split(train_dataset, [4000, 1000])


# ------------------------------------------------------------------------------------------------------------------------

# DataLoaders
# All three splits share the same batch size and worker count; only the
# training loader is shuffled.
loader_kwargs = {"batch_size": 128, "num_workers": 2}
train_loader = DataLoader(train_data, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_data, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# Report the size of each split.
print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_dataset)}")




Train: 50000, Val: 10000, Test: 10000


__Model__

In [None]:
class SimpleCNN(nn.Module):
    """Four-stage convolutional image classifier.

    Every stage is Conv2d(3x3, padding=1) -> BatchNorm2d -> LeakyReLU ->
    MaxPool2d(2, 2), so each stage keeps the spatial size through the
    convolution and halves it in the pooling step. The classifier head
    first averages the remaining feature map down to 1x1, which makes the
    network independent of the input resolution as long as the input is at
    least 16x16 (four halvings must leave at least one pixel).

    Args:
        in_channels: Number of channels in the input images.
        num_classes: Number of output logits.
        conv_channels: Number of filters in the first three convolution
            stages; the first three entries are used.
        dropout_p: Dropout probability applied before the final linear layer.
    """

    def __init__(self, in_channels=3, num_classes=10, conv_channels=(64, 128, 256), dropout_p=0.5):
        super().__init__()

        # The fourth ("extra") convolution stage and the hidden linear layer
        # have a fixed width that does not depend on conv_channels.
        extra_channels = 256
        hidden_units = 256

        # --- Convolutional feature extractor ---
        # Spatial size per stage for a 32x32 input: 32 -> 16 -> 8 -> 4 -> 2
        # (28x28 input: 28 -> 14 -> 7 -> 3 -> 1).
        self.features = nn.Sequential(
            *self._conv_stage(in_channels, conv_channels[0]),
            *self._conv_stage(conv_channels[0], conv_channels[1]),
            *self._conv_stage(conv_channels[1], conv_channels[2]),
            *self._conv_stage(conv_channels[2], extra_channels),
        )

        # --- Fully connected classifier ---
        # The first linear layer must match the channel count produced by the
        # LAST convolution stage (extra_channels), not conv_channels[-1];
        # otherwise any configuration whose last entry is not 256
        # (e.g. [32, 64, 128]) fails with a shape mismatch.
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(extra_channels, hidden_units),
            nn.LeakyReLU(inplace=True),
            nn.Dropout(dropout_p),
            nn.Linear(hidden_units, num_classes)
        )

    @staticmethod
    def _conv_stage(in_ch, out_ch):
        """Return the layers of one stage: conv, batch norm, activation, pool."""
        return [
            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.LeakyReLU(inplace=True),
            nn.MaxPool2d(2, 2),  # kernel size 2, stride 2
        ]

    def forward(self, x):
        """Map a batch of images to a batch of class logits."""
        x = self.features(x)
        x = self.classifier(x)
        return x


In [None]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    """Train `model` for `epochs` passes over `train_loader`.

    After every epoch, prints the mean per-batch training loss, the training
    accuracy accumulated over the epoch, and the validation accuracy returned
    by `evaluate_model` on `val_loader`. Batches are moved to the
    module-level `device` before the forward pass. Returns nothing.
    """
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        num_correct = 0
        num_seen = 0

        for batch_images, batch_labels in train_loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            # Clear gradients left over from the previous batch, then run one
            # forward/backward pass and update the weights.
            optimizer.zero_grad()
            logits = model(batch_images)
            batch_loss = criterion(logits, batch_labels)
            batch_loss.backward()
            optimizer.step()

            # Accumulate the epoch statistics.
            running_loss += batch_loss.item()
            predicted_classes = logits.max(1).indices
            num_seen += batch_labels.size(0)
            num_correct += (predicted_classes == batch_labels).sum().item()

        train_acc = 100 * num_correct / num_seen
        val_acc = evaluate_model(model, val_loader)
        print(f"Epoch [{epoch+1}/{epochs}] - Loss: {running_loss/len(train_loader):.4f} | Train Acc: {train_acc:.2f}% | Val Acc: {val_acc:.2f}%")

def evaluate_model(model, loader, device=None):
    """Return the top-1 accuracy of `model` on `loader` as a percentage.

    Args:
        model: The network to evaluate. It is switched to evaluation mode
            and left in that mode when the function returns.
        loader: Iterable yielding `(images, labels)` batches.
        device: Device the batches are moved to before the forward pass.
            When omitted, the device of the model's first parameter is used
            (CPU if the model has no parameters), so the function does not
            depend on a module-level `device` variable.

    Returns:
        Accuracy in the range [0, 100]. Returns 0.0 when `loader` yields no
        samples instead of raising ZeroDivisionError.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    correct, total = 0, 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            # The predicted class is the index of the largest logit.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    if total == 0:
        return 0.0
    return 100 * correct / total


In [None]:
# Build the network and move it to the selected device.
model = SimpleCNN(in_channels=1)  # For MNIST/FashionMNIST
model = model.to(device)
# model = SimpleCNN(in_channels=3).to(device)  # For CIFAR-10/STL10/SVHN

print(model)

# Cross-entropy loss on the raw logits; SGD with momentum and weight decay.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
    weight_decay=1e-4,
)

# Train for 10 epochs, reporting validation accuracy after each one.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=10,
)
print("training completed \n")

# Final evaluation on the held-out test set.
test_acc = evaluate_model(model, test_loader)
print(f"Final Test Accuracy: {test_acc:.2f}%")

SimpleCNN(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.01, inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): LeakyReLU(negative_slope=0.01, inplace=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): LeakyReLU(negative_slope=0.01, inplace=True)
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stri

MNIST / Fashion-MNIST → 28×28

CIFAR-10 / SVHN → 32×32

STL-10 → 96×96

**Optimiser** SGD

---

**mnist**  Train Acc: 99.93% Test Accuracy: 99.39%

---

**fashion mnist**  Train Acc: 97.01% Test Accuracy: 89.94%

---

**cifar10**: Train Acc: 77.59% Test Accuracy: 75.22%

---

**svhn**  Train Acc: 96.32% Test Accuracy: 91.29%


---

**stl**  Train Acc: 56.38% Test Accuracy: 46.46%


----

**activation**: LeakyReLU

---

**mnist**  Train Acc: 99.70% Test Accuracy: 99.05%

---

**fashion mnist**  Train Acc: 97.64% Test Accuracy: 92.29%

---

**cifar10**: Train Acc: 81.25% Test Accuracy: 75.43%

---

**svhn**   Train Acc: 97.10%   Test Accuracy: 92.46%

---

**stl**  Train Acc: 61.65% Test Accuracy: 50.00%


----

**Extra Convolution Layer**

---

**mnist**  Train Acc: 99.77% Test Accuracy: 98.81%

---

**fashion mnist**  Train Acc: 97.52%   Test Accuracy: 91.93%

---

**cifar10**: Train Acc: 81.26% Test Accuracy: 78.78%

---

**svhn**  Train Acc: 96.71% Test Accuracy: 93.17%

---

**stl** Train Acc: 61.02% Test Accuracy: 47.94%


----

**Conv channels [64, 128, 256]**

---

**mnist** Train Acc: 99.44% Test Accuracy: 99.20%


---

**fashion mnist** Train Acc: 94.42% Test Accuracy: 91.91%

---

**cifar10**: Train Acc: 70.70%  Test Accuracy: 64.46%

---

**svhn**  Train Acc: 89.96% Test Accuracy: 89.14%

---

**stl**  Train Acc: 52.23% Test Accuracy: 48.16%

----

**Conv channels [32, 64, 128]**

---

**mnist**  Train Acc: 99.49% Test Accuracy: 98.58%

---

**fashion mnist**  Train Acc: 93.30% Test Accuracy: 89.43%

---

**cifar10**: Train Acc: 68.72%  Test Accuracy: 66.11%

---

**svhn**  Train Acc: 88.61% Test Accuracy: 91.84%

---

**stl** Train Acc: 50.33% Test Accuracy: 47.21%


| **Configuration**                | **MNIST (Train / Test)** | **Fashion MNIST (Train / Test)** | **CIFAR-10 (Train / Test)** | **SVHN (Train / Test)** | **STL (Train / Test)** |
| -------------------------------- | ------------------------ | -------------------------------- | --------------------------- | ----------------------- | ---------------------- |
| **Optimizer: SGD**               | 99.93% / 99.39%          | 97.01% / 89.94%                  | 77.59% / 75.22%             | 96.32% / 91.29%         | 56.38% / 46.46%        |
| **Activation: LeakyReLU**        | 99.70% / 99.05%          | 97.64% / 92.29%                  | 81.25% / 75.43%             | 97.10% / 92.46%         | 61.65% / 50.00%        |
| **Extra Convolution Layer**      | 99.77% / 98.81%          | 97.52% / 91.93%                  | 81.26% / 78.78%             | 96.71% / 93.17%         | 61.02% / 47.94%        |
| **Conv Channels [64, 128, 256]** | 99.44% / 99.20%          | 94.42% / 91.91%                  | 70.70% / 64.46%             | 89.96% / 89.14%         | 52.23% / 48.16%        |
| **Conv Channels [32, 64, 128]**  | 99.49% / 98.58%          | 93.30% / 89.43%                  | 68.72% / 66.11%             | 88.61% / 91.84%         | 50.33% / 47.21%        |
