## Imports

In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import sys
import numpy as np
import os

## Utilising GPU using Pytorch

In [2]:
# cpu-gpu
a = torch.randn((3, 4))
print(a.device)

device = torch.device("cuda")
a = a.to(device)
print(a.device)

# a more generic code
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

cpu
cuda:0


In [3]:
!nvidia-smi

Sun Sep 18 10:58:05 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    28W /  70W |    612MiB / 15109MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Dataset and Transforms

In [4]:
train_transform = transforms.Compose([
  transforms.RandomCrop(32, padding=4),
  transforms.RandomHorizontalFlip(),
  transforms.ToTensor(),
  transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
test_transform = transforms.Compose([
  transforms.ToTensor(),
  transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

train_dset = torchvision.datasets.CIFAR10(root="data/", train=True, transform=train_transform, download=True)
test_dset = torchvision.datasets.CIFAR10(root="data/", train=False, transform=test_transform, download=True)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting data/cifar-10-python.tar.gz to data/
Files already downloaded and verified


In [5]:
print(f"# of train samples: {len(train_dset)}")
print(f"# of test samples: {len(test_dset)}")

# of train samples: 50000
# of test samples: 10000


In [6]:
train_loader = DataLoader(train_dset, batch_size=100, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dset, batch_size=100, shuffle=False, num_workers=2)

In [7]:
print(f"# of train batches: {len(train_loader)}")
print(f"# of test batches: {len(test_loader)}")

# of train batches: 500
# of test batches: 100


In [8]:
print("sample i/o sizes")
data = next(iter(train_loader))
img, target = data
print(f"input size: {img.shape}")
print(f"output size: {target.shape}")

sample i/o sizes
input size: torch.Size([100, 3, 32, 32])
output size: torch.Size([100])


## LeNet

In [9]:
class LeNet(nn.Module):
  def __init__(self):
    super(LeNet, self).__init__()
    self.flat = nn.Flatten()
    self.conv1 = nn.Conv2d(3, 6, kernel_size=5)
    self.conv2 = nn.Conv2d(6, 16, kernel_size=5)
    # TODO: missing input feature size
    self.fc1   = nn.Linear(400, 120)
    self.fc2   = nn.Linear(120, 84)
    # TODO: missing output feature size
    self.fc3   = nn.Linear(84, 10)
    self.activ = nn.ReLU()

  # TODO: add maxpool operation of given kernel size
  # https://pytorch.org/docs/stable/nn.functional.html
  def pool(self, x, kernel_size=2):
    out = F.max_pool2d(x, kernel_size = kernel_size)
    return out

  def forward(self, x):
    out = self.activ(self.conv1(x))
    out = self.pool(out)
    out = self.activ(self.conv2(out))
    out = self.pool(out)

    # TODO: flatten
    out = self.flat(out)
    out = self.activ(self.fc1(out))
    out = self.activ(self.fc2(out))
    out = self.fc3(out)
    return out

## VGG

In [10]:
from torch.nn.modules import padding
class VGG(nn.Module):
  CONFIGS = {
      "vgg11": [64, "pool", 128, "pool", 256, 256, "pool", 512, 512, "pool", 512, 512, "pool"],
      "vgg13": [64, 64, "pool", 128, 128, "pool", 256, 256, "pool", 512, 512, "pool", 512, 512, "pool"],
      "vgg16": [64, 64, "pool", 128, 128, "pool", 256, 256, 256, "pool", 512, 512, 512, "pool", 512, 512, 512, "pool"],
      "vgg19": [64, 64, "pool", 128, 128, "pool", 256, 256, 256, 256, "pool", 512, 512, 512, 512, "pool", 512, 512, 512, 512, "pool"],
  }
  def __init__(self, cfg):
    super(VGG, self).__init__()
    # TODO: missing input dimension
    self.flat = nn.Flatten()
    in_dim = 3
    layers = []
    for layer in self.CONFIGS[cfg]:
        if layer == "pool":
            # TODO: add maxpool module of given kernel size, stride (here 2 each)
            # https://pytorch.org/docs/stable/nn.html
            maxpool = nn.MaxPool2d(kernel_size = 2, stride = 2)
            layers.append(maxpool)
        else:
            # TODO: add sequential module consisting of convolution (kernel size = 3, padding = 1), batchnorm, relu
            # https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html?highlight=sequential#torch.nn.Sequential
            block = nn.Sequential(nn.Conv2d(in_dim, layer, kernel_size = 3, padding = 1),
                                  nn.BatchNorm2d(layer),
                                  nn.ReLU())
            layers.append(block)
            in_dim = layer
    # TODO: add average pool to collapse spatial dimensions
    avgpool = nn.AvgPool2d(kernel_size = 1)
    layers.append(avgpool)
    self.layers = nn.Sequential(*layers)
    # TODO: missing output features
    self.fc = nn.Linear(512, 10)

  def forward(self, x):
    out = self.layers(x)
    # TODO: flatten
    out = self.flat(out)
    out = self.fc(out)
    return out

## ResNet

In [11]:
class BasicBlock(nn.Module):
  expansion = 1

  def __init__(self, in_dim, dim, stride=1):
    super(BasicBlock, self).__init__()
    self.conv1 = nn.Conv2d(in_dim, dim, kernel_size=3, stride=stride, padding=1, bias=False)
    self.bn1 = nn.BatchNorm2d(dim)
    self.conv2 = nn.Conv2d(dim, dim, kernel_size=3, stride=1, padding=1, bias=False)
    self.bn2 = nn.BatchNorm2d(dim)
    self.activ = nn.ReLU()

    self.shortcut = nn.Identity()
    # TODO: missing condition for parameterized shortcut connection (hint: when input and output dimensions don't match - both spatial, feature)
    if (in_dim != dim or stride != 1):
        # TODO: add sequential module consisting of 1x1 convolution (given stride, bias=False), batchnorm
        self.shortcut = nn.Sequential(nn.Conv2d(in_dim, dim, kernel_size = 1, stride = stride, bias = False),
                                      nn.BatchNorm2d(dim))
      
  def forward(self, x):
    out = self.activ(self.bn1(self.conv1(x)))
    out = self.bn2(self.conv2(out))
    # TODO: missing residual connection
    out = out + self.shortcut(x)
    out = self.activ(out)
    return out


class Bottleneck(nn.Module):
  expansion = 4

  def __init__(self, in_dim, dim, stride=1):
    super(Bottleneck, self).__init__()
    self.conv1 = nn.Conv2d(in_dim, dim, kernel_size=1, bias=False)
    self.bn1 = nn.BatchNorm2d(dim)
    self.conv2 = nn.Conv2d(dim, dim, kernel_size=3, stride=stride, padding=1, bias=False)
    self.bn2 = nn.BatchNorm2d(dim)
    self.conv3 = nn.Conv2d(dim, self.expansion * dim, kernel_size=1, bias=False)
    self.bn3 = nn.BatchNorm2d(self.expansion*dim)
    self.activ = nn.ReLU()

    self.shortcut = nn.Identity()
    # TODO: missing condition for parameterized shortcut connection (hint: when input and output dimensions don't match - both spatial, feature)
    if (in_dim != self.expansion*dim or stride != 1):
        # TODO: add sequential module consisting of 1x1 convolution (given stride, bias=False), batchnorm
        self.shortcut = nn.Sequential(nn.Conv2d(in_dim, self.expansion*dim, kernel_size = 1, stride = stride, bias = False),
                                      nn.BatchNorm2d(self.expansion*dim))

  def forward(self, x):
    out = self.activ(self.bn1(self.conv1(x)))
    out = self.activ(self.bn2(self.conv2(out)))
    out = self.bn3(self.conv3(out))
    # TODO: missing residual connection
    out = out + self.shortcut(x)
    out = self.activ(out)
    return out


class ResNet(nn.Module):
  CONFIGS = {
      "resnet18": (BasicBlock, [2, 2, 2, 2]),
      "resnet34": (BasicBlock, [3, 4, 6, 3]),
      "resnet50": (Bottleneck, [3, 4, 6, 3]),
      "resnet101": (Bottleneck, [3, 4, 23, 3]),
      "resnet152": (Bottleneck, [3, 8, 36, 3]),
  }
  def __init__(self, cfg):
    super(ResNet, self).__init__()
    block, num_blocks = self.CONFIGS[cfg]
    self.in_dim = 64
    self.flat = nn.Flatten()
    self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
    self.bn1 = nn.BatchNorm2d(64)
    self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
    self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
    self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
    self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
    self.avgpool = nn.AvgPool2d(kernel_size = 4)
    self.activ = nn.ReLU()
    # TODO: missing output features
    self.linear = nn.Linear(512*block.expansion, 10)

  def _make_layer(self, block, dim, num_blocks, stride):
    strides = [stride] + [1]*(num_blocks-1)    
    layers = []
    for stride in strides: 
        # TODO: create layers within block
        layer = block(self.in_dim, dim, stride)
        layers.append(layer)
        # TODO: update in_dim based on block output size
        self.in_dim = block.expansion * dim
    return nn.Sequential(*layers)

  def forward(self, x):
    out = self.activ(self.bn1(self.conv1(x)))
    out = self.layer1(out)
    out = self.layer2(out)
    out = self.layer3(out)
    out = self.layer4(out)
    # TODO: average pool and flatten
    out = self.flat(self.avgpool(out))
    out = self.linear(out)
    return out

## Utility functions (can ignore)

In [12]:
def pbar(p=0, msg="", bar_len=20):
    sys.stdout.write("\033[K")
    sys.stdout.write("\x1b[2K" + "\r")
    block = int(round(bar_len * p))
    text = "Progress: [{}] {}% {}".format(
        "\x1b[32m" + "=" * (block - 1) + ">" + "\033[0m" + "-" * (bar_len - block),
        round(p * 100, 2),
        msg,
    )
    print(text, end="\r")
    if p == 1:
        print()


class AvgMeter:
    def __init__(self):
        self.reset()

    def reset(self):
        self.metrics = {}

    def add(self, batch_metrics):
        if self.metrics == {}:
            for key, value in batch_metrics.items():
                self.metrics[key] = [value]
        else:
            for key, value in batch_metrics.items():
                self.metrics[key].append(value)

    def get(self):
        return {key: np.mean(value) for key, value in self.metrics.items()}

    def msg(self):
        avg_metrics = {key: np.mean(value) for key, value in self.metrics.items()}
        return "".join(["[{}] {:.5f} ".format(key, value) for key, value in avg_metrics.items()])

## Training

In [13]:
def train(model, optim, lr_sched=None, epochs=200, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"), criterion=None, metric_meter=None, out_dir="out/"):
  model.to(device)
  best_acc = 0
  for epoch in range(epochs):
    model.train()
    metric_meter.reset()
    for indx, (img, target) in enumerate(train_loader):
      # TODO: send to device (cpu or gpu)
      img = img.to(device)
      target = target.to(device)

      # TODO: missing forward pass
      out = model(img)
      loss = criterion(out, target)
      # TODO: missing backward, parameter update
      loss.backward()
      optim.step()

      metric_meter.add({"train loss": loss.item()})
      pbar(indx / len(train_loader), msg=metric_meter.msg())
    pbar(1, msg=metric_meter.msg())

    model.eval()
    metric_meter.reset()
    correct = 0
    total = 0 
    for indx, (img, target) in enumerate(test_loader):
      # TODO: send to device (cpu or gpu)
      img = img.to(device)
      target = target.to(device)

      # TODO: missing forward pass
      out = model(img)
      loss = criterion(out, target)
      # TODO: compute accuracy
      _, predicted = torch.max(out.data, 1)
      total += target.size(0)
      correct += (predicted == target).sum().item()
      acc = correct / total

      metric_meter.add({"test loss": loss.item(), "test acc": acc})
      pbar(indx / len(test_loader), msg=metric_meter.msg())
    pbar(1, msg=metric_meter.msg())
    
    test_metrics = metric_meter.get()
    if test_metrics["test acc"] > best_acc:
      print(
          "\x1b[33m"
          + f"test acc improved from {round(best_acc, 5)} to {round(test_metrics['test acc'], 5)}"
          + "\033[0m"
      )
      best_acc = test_metrics['test acc']
      torch.save(model.state_dict(), os.path.join(out_dir, "best.ckpt"))
    lr_sched.step()

## Run Experiments

In [15]:
def run_experiment(model_name="lenet", model_cfg=None, epochs=200):
  if model_name == "lenet":
    model = LeNet()
  elif model_name == "vgg":
    model = VGG(model_cfg)
  elif model_name == "resnet":
    model = ResNet(model_cfg)
  else:
    raise NotImplementedError()
  optim = torch.optim.Adam(model.parameters()) # torch.optim.SGD(model.parameters(), lr=1e-1, momentum=0.9, weight_decay=5e-4)
  lr_sched = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=epochs)
  criterion = nn.CrossEntropyLoss()
  metric_meter = AvgMeter()
  out_dir = f"{model_name}_{model_cfg}"
  os.makedirs(out_dir, exist_ok=True)
  train(model, optim, lr_sched, epochs=epochs, criterion=criterion, metric_meter=metric_meter, out_dir=out_dir)

In [None]:
run_experiment(model_name="lenet", model_cfg = None)

[33mtest acc improved from 0 to 0.10266[0m
[33mtest acc improved from 0.10266 to 0.10335[0m


In [15]:
run_experiment(model_name="vgg", model_cfg = 'vgg11')

[33mtest acc improved from 0 to 0.06144[0m
[33mtest acc improved from 0.06144 to 0.09879[0m
[33mtest acc improved from 0.09879 to 0.10104[0m
[33mtest acc improved from 0.10104 to 0.10184[0m
[33mtest acc improved from 0.10184 to 0.10266[0m
[33mtest acc improved from 0.10266 to 0.10335[0m


In [None]:
run_experiment(model_name="resnet", model_cfg = 'resnet18')

[33mtest acc improved from 0 to 0.10687[0m
[33mtest acc improved from 0.10687 to 0.12732[0m
[33mtest acc improved from 0.12732 to 0.1551[0m
[33mtest acc improved from 0.1551 to 0.189[0m
[33mtest acc improved from 0.189 to 0.19026[0m
[33mtest acc improved from 0.19026 to 0.20319[0m
[33mtest acc improved from 0.20319 to 0.21355[0m
[33mtest acc improved from 0.21355 to 0.21633[0m
[33mtest acc improved from 0.21633 to 0.21734[0m
[33mtest acc improved from 0.21734 to 0.22127[0m
[33mtest acc improved from 0.22127 to 0.2268[0m
[33mtest acc improved from 0.2268 to 0.22933[0m
[33mtest acc improved from 0.22933 to 0.23147[0m
[33mtest acc improved from 0.23147 to 0.23271[0m
[33mtest acc improved from 0.23271 to 0.2374[0m
[33mtest acc improved from 0.2374 to 0.23946[0m
[33mtest acc improved from 0.23946 to 0.23952[0m
[33mtest acc improved from 0.23952 to 0.23968[0m
[33mtest acc improved from 0.23968 to 0.24164[0m
[33mtest acc improved from 0.24164 to 0.24393

## Questions
- Train and report test set metrics on three model types - LeNet, VGG, ResNet. 
- Which model performs the best and why?
- Which model performs the worst and why?
- BONUS (extra marks): Modify the LeNet model's convolution layers and compare performance against number of layers (depth), number of nodes per layer (width). (Require atleast 3 data points each for width and depth). Feel free to reduce the number of epochs to obtain results quickly. 

The ResNet models seems to perform the best because of the presence of residual connections that allow the gradients to flow through it. This allows us to train more deeper networks. The LeNet model performs the worst. This might be because of its relatively simpler architecture.  