# Import Libraries

In [1]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

# Model 1

**Target:**

Stable model test accuracy to be > 99.4% over multiple epochs. 

**Results:**
1. Parameters: **8246**
2. Best Train Accuracy: **99.42%**
3. Best Test Accuracy: **99.38%**

**Analysis:**
1. The model architecture uses fewer filters per layer to reduce the total parameter count (it could not be brought below **8K**); a 28*28 image can be handled well with fewer filters.
2. The learning rate is scheduled with StepLR to achieve better, more consistent accuracy.
3. Augmentations used: Color Jitter, Random Rotation of **+/-7%**, and Normalization.
4. The best part of Model 1: the smallest gap, and good consistency, between train and test accuracy.

## Data Transformations

We start by defining our data transformations. We need to think about what our data is and how we can augment it to correctly represent images the model might not otherwise see.


In [1]:
# Training pipeline: resize, light photometric jitter and a small random
# rotation are applied to the PIL image first; only then is it converted to a
# tensor and normalised.
train_transforms = transforms.Compose(
    [
        transforms.Resize((28, 28)),
        transforms.ColorJitter(brightness=0.10, contrast=0.1, saturation=0.10, hue=0.1),
        transforms.RandomRotation((-5.0, 5.0), fill=(1,)),
        transforms.ToTensor(),
        # mean/std must be sequences, hence the trailing comma:
        # (0.1307,) is a 1-tuple whereas (0.1307) is just a float.
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

# Evaluation pipeline: no augmentation, only tensor conversion and the same
# normalisation constants as the training pipeline.
test_transforms = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)


NameError: name 'transforms' is not defined

# Dataset and Creating Train/Test Split

In [3]:
# MNIST is downloaded into ./data on first use; each split gets its own
# transform pipeline (augmented for training, plain for evaluation).
train = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=train_transforms,
)
test = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=test_transforms,
)

# Dataloader Arguments & Test/Train Dataloaders


In [4]:
SEED = 1

# Is a GPU available?
cuda = torch.cuda.is_available()
print("CUDA Available?", cuda)

# Seed the RNGs for reproducibility (CPU always, CUDA only when present).
torch.manual_seed(SEED)
if cuda:
    torch.cuda.manual_seed(SEED)

# DataLoader settings: larger batches, worker processes and pinned memory on
# GPU; a smaller batch with default loading on CPU.
# (In a script these would typically come from the command line.)
if cuda:
    dataloader_args = {
        "shuffle": True,
        "batch_size": 128,
        "num_workers": 4,
        "pin_memory": True,
    }
else:
    dataloader_args = {"shuffle": True, "batch_size": 64}

# Both loaders share the same settings.
train_loader = torch.utils.data.DataLoader(train, **dataloader_args)
test_loader = torch.utils.data.DataLoader(test, **dataloader_args)

CUDA Available? True


# The model
Let's start with the model we first saw

In [7]:
import torch.nn.functional as F
# Dropout probability shared by every block of the network (0 disables it).
dropout_value = 0.


class Net(nn.Module):
    """Small all-convolutional classifier producing 10 log-probabilities.

    Layout: two 3x3 conv stages, a 1x1 transition plus 2x2 max-pool, four
    more 3x3 conv stages, a 6x6 average pool and a final 1x1 conv that maps
    the 14 pooled channels to 10 class scores. All convolutions are
    bias-free. The spatial sizes noted in the comments assume a 28x28 input.
    """

    @staticmethod
    def _conv3x3_stage(in_ch, out_ch, padding=0):
        """Build one Conv(3x3) -> ReLU -> BatchNorm -> Dropout stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_ch, out_channels=out_ch, kernel_size=(3, 3), padding=padding, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(out_ch),
            nn.Dropout(dropout_value),
        )

    def __init__(self):
        super().__init__()

        # Input block and first convolution block
        self.convblock1 = self._conv3x3_stage(1, 10)    # 28 -> 26
        self.convblock2 = self._conv3x3_stage(10, 12)   # 26 -> 24

        # Transition: 1x1 channel reduction, then 2x2 max-pool
        self.convblock3 = nn.Sequential(
            nn.Conv2d(in_channels=12, out_channels=10, kernel_size=(1, 1), padding=0, bias=False),
        )                                               # 24 -> 24
        self.pool1 = nn.MaxPool2d(2, 2)                 # 24 -> 12

        # Second convolution block
        self.convblock4 = self._conv3x3_stage(10, 10)   # 12 -> 10
        self.convblock5 = self._conv3x3_stage(10, 16)   # 10 -> 8
        self.convblock6 = self._conv3x3_stage(16, 16)   # 8 -> 6
        self.convblock7 = self._conv3x3_stage(16, 14, padding=1)  # 6 -> 6 (padded)

        # Output block: average-pool the 6x6 map to 1x1, then a 1x1 conv to
        # the 10 class scores (no activation or normalisation afterwards).
        self.gap = nn.Sequential(
            nn.AvgPool2d(kernel_size=6),
        )
        self.convblock8 = nn.Sequential(
            nn.Conv2d(in_channels=14, out_channels=10, kernel_size=(1, 1), padding=0, bias=False),
        )

        # Registered on the module but not applied in forward().
        self.dropout = nn.Dropout(dropout_value)

    def forward(self, x):
        """Apply every stage in order and return log-softmax class scores."""
        pipeline = (
            self.convblock1,
            self.convblock2,
            self.convblock3,
            self.pool1,
            self.convblock4,
            self.convblock5,
            self.convblock6,
            self.convblock7,
            self.gap,
            self.convblock8,
        )
        for stage in pipeline:
            x = stage(x)

        # Collapse the remaining dimensions into rows of 10 scores.
        logits = x.view(-1, 10)
        return F.log_softmax(logits, dim=-1)

# Model Params
We can't emphasize enough how important it is to view the model summary.
Unfortunately, there is no built-in model visualizer, so we have to rely on an external package.

In [8]:
!pip install torchsummary
from torchsummary import summary
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)
model = Net().to(device)
summary(model, input_size=(1, 28, 28))

cuda
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 10, 26, 26]              90
              ReLU-2           [-1, 10, 26, 26]               0
       BatchNorm2d-3           [-1, 10, 26, 26]              20
           Dropout-4           [-1, 10, 26, 26]               0
            Conv2d-5           [-1, 12, 24, 24]           1,080
              ReLU-6           [-1, 12, 24, 24]               0
       BatchNorm2d-7           [-1, 12, 24, 24]              24
           Dropout-8           [-1, 12, 24, 24]               0
            Conv2d-9           [-1, 10, 24, 24]             120
        MaxPool2d-10           [-1, 10, 12, 12]               0
           Conv2d-11           [-1, 10, 10, 10]             900
             ReLU-12           [-1, 10, 10, 10]               0
      BatchNorm2d-13           [-1, 10, 10, 10]              20
          Dropout-14           [-1

# Training and Testing

Looking at logs can be boring, so we'll introduce **tqdm** progressbar to get cooler logs.

Let's write train and test functions

In [9]:
from tqdm import tqdm

# Running histories filled in by train() and test(): one loss/accuracy entry
# per training batch, one loss/accuracy entry per evaluation pass.
train_losses, test_losses = [], []
train_acc, test_acc = [], []

def train(model, device, train_loader, optimizer, epoch):
    """Train `model` for one pass over `train_loader`.

    For every batch this runs a forward pass, computes the NLL loss (the
    model is expected to return log-probabilities), back-propagates and
    applies one optimizer step. Progress is shown on a tqdm bar.

    Side effects: appends one Python float per batch to the module-level
    `train_losses` list and one running-accuracy percentage per batch to the
    module-level `train_acc` list.

    Args:
        model: network to optimise; switched to training mode.
        device: device the batches are moved to before the forward pass.
        train_loader: iterable yielding `(data, target)` batches.
        optimizer: optimizer holding the model's parameters.
        epoch: epoch index supplied by the caller; not used in the body.
    """
    model.train()
    pbar = tqdm(train_loader)
    correct = 0
    processed = 0
    for batch_idx, (data, target) in enumerate(pbar):
        # Move the batch to the target device.
        data, target = data.to(device), target.to(device)

        # PyTorch accumulates gradients across backward passes, so they must
        # be cleared before each new batch.
        optimizer.zero_grad()

        # Forward pass and loss.
        y_pred = model(data)
        loss = F.nll_loss(y_pred, target)

        # Record a plain float rather than the tensor itself: storing the
        # tensor would keep a device-resident object (with autograd history)
        # alive for every batch and cannot be plotted directly from CUDA.
        loss_value = loss.item()
        train_losses.append(loss_value)

        # Backpropagation and parameter update.
        loss.backward()
        optimizer.step()

        # Running accuracy: index of the max log-probability vs. the label.
        pred = y_pred.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        processed += len(data)

        pbar.set_description(desc= f'Loss={loss_value} Batch_id={batch_idx} Accuracy={100*correct/processed:0.2f}')
        train_acc.append(100*correct/processed)

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    test_losses.append(test_loss)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

    test_acc.append(100. * correct / len(test_loader.dataset))

In [10]:
from torch.optim.lr_scheduler import StepLR
import math
# Start from a freshly initialised network for this run.
model = Net().to(device)

# SGD with Nesterov momentum and a small L2 penalty.
optimizer = optim.SGD(
    model.parameters(),
    lr=0.02,
    momentum=0.9,
    nesterov=True,
    weight_decay=1e-4,
)

# Multiply the learning rate by 0.1 every 4 epochs.
scheduler = StepLR(optimizer, step_size=4, gamma=0.1)
# Alternative schedule that was tried:
# scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.02, epochs=15, steps_per_epoch=len(train_loader))

EPOCHS = 15
for epoch in range(EPOCHS):
    print("EPOCH:", epoch+1)
    # One optimisation pass over the training set ...
    train(model, device, train_loader, optimizer, epoch)
    # ... then advance the schedule once per epoch, before evaluating.
    scheduler.step()
    test(model, device, test_loader)

EPOCH: 1


Loss=0.07321923971176147 Batch_id=468 Accuracy=92.80: 100%|██████████| 469/469 [00:08<00:00, 55.83it/s]  



Test set: Average loss: 0.0540, Accuracy: 9854/10000 (98.54%)

EPOCH: 2


Loss=0.06714335829019547 Batch_id=468 Accuracy=98.35: 100%|██████████| 469/469 [00:08<00:00, 58.01it/s]  



Test set: Average loss: 0.0392, Accuracy: 9896/10000 (98.96%)

EPOCH: 3


Loss=0.030664781108498573 Batch_id=468 Accuracy=98.71: 100%|██████████| 469/469 [00:07<00:00, 59.87it/s] 



Test set: Average loss: 0.0377, Accuracy: 9890/10000 (98.90%)

EPOCH: 4


Loss=0.08309302479028702 Batch_id=468 Accuracy=98.83: 100%|██████████| 469/469 [00:07<00:00, 58.70it/s]  



Test set: Average loss: 0.0298, Accuracy: 9909/10000 (99.09%)

EPOCH: 5


Loss=0.016698695719242096 Batch_id=468 Accuracy=99.22: 100%|██████████| 469/469 [00:08<00:00, 56.40it/s]  



Test set: Average loss: 0.0228, Accuracy: 9935/10000 (99.35%)

EPOCH: 6


Loss=0.00508400984108448 Batch_id=468 Accuracy=99.32: 100%|██████████| 469/469 [00:07<00:00, 63.91it/s]   



Test set: Average loss: 0.0217, Accuracy: 9935/10000 (99.35%)

EPOCH: 7


Loss=0.02624017745256424 Batch_id=468 Accuracy=99.35: 100%|██████████| 469/469 [00:07<00:00, 63.73it/s]   



Test set: Average loss: 0.0218, Accuracy: 9934/10000 (99.34%)

EPOCH: 8


Loss=0.042860981076955795 Batch_id=468 Accuracy=99.39: 100%|██████████| 469/469 [00:07<00:00, 62.94it/s]  



Test set: Average loss: 0.0211, Accuracy: 9932/10000 (99.32%)

EPOCH: 9


Loss=0.016876691952347755 Batch_id=468 Accuracy=99.42: 100%|██████████| 469/469 [00:07<00:00, 63.28it/s]  



Test set: Average loss: 0.0208, Accuracy: 9937/10000 (99.37%)

EPOCH: 10


Loss=0.021587839350104332 Batch_id=468 Accuracy=99.42: 100%|██████████| 469/469 [00:07<00:00, 62.39it/s]  



Test set: Average loss: 0.0208, Accuracy: 9936/10000 (99.36%)

EPOCH: 11


Loss=0.012777184136211872 Batch_id=468 Accuracy=99.45: 100%|██████████| 469/469 [00:07<00:00, 63.39it/s]  



Test set: Average loss: 0.0208, Accuracy: 9936/10000 (99.36%)

EPOCH: 12


Loss=0.07280503213405609 Batch_id=468 Accuracy=99.42: 100%|██████████| 469/469 [00:07<00:00, 65.56it/s]   



Test set: Average loss: 0.0205, Accuracy: 9938/10000 (99.38%)

EPOCH: 13


Loss=0.01909305341541767 Batch_id=468 Accuracy=99.44: 100%|██████████| 469/469 [00:07<00:00, 65.87it/s]   



Test set: Average loss: 0.0208, Accuracy: 9935/10000 (99.35%)

EPOCH: 14


Loss=0.04563193395733833 Batch_id=468 Accuracy=99.42: 100%|██████████| 469/469 [00:06<00:00, 67.52it/s]   



Test set: Average loss: 0.0211, Accuracy: 9936/10000 (99.36%)

EPOCH: 15


Loss=0.012202541343867779 Batch_id=468 Accuracy=99.47: 100%|██████████| 469/469 [00:07<00:00, 64.29it/s]  



Test set: Average loss: 0.0207, Accuracy: 9937/10000 (99.37%)

