In [4]:
# imports:
import numpy as np
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
from PIL import ImageFont


# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Data Downloading + Preparation
print('==> Preparing data..')
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])


transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])


# Downloading train dataset and test dataset
trainset = datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform_train)
train_loader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=2)

testset = datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform_test)
test_loader = torch.utils.data.DataLoader(
    testset, batch_size=100, shuffle=False, num_workers=2)

==> Preparing data..
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:05<00:00, 30524593.36it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [6]:
# Residual building block that we will use later on in the larger network
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride = 1, downsample = None):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Sequential(
                        nn.Conv2d(in_channels, out_channels, kernel_size = 3, stride = stride, padding = 1),
                        nn.BatchNorm2d(out_channels),
                        nn.ReLU())
        self.conv2 = nn.Sequential(
                        nn.Conv2d(out_channels, out_channels, kernel_size = 3, stride = 1, padding = 1),
                        nn.BatchNorm2d(out_channels))
        self.downsample = downsample
        self.relu = nn.ReLU()
        self.out_channels = out_channels
        
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.conv2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual # skip connection
        out = self.relu(out)
        return out

In [7]:
## defining our model ##
class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes = 10):
        super(ResNet, self).__init__()
        self.inplanes = 32
        self.conv1 = nn.Sequential(
                        nn.Conv2d(3, 32, kernel_size = 3, stride = 1, padding = 1),
                        nn.BatchNorm2d(32),
                        nn.ReLU())
        
        ### 1st attempt ###

        #self.maxpool = nn.MaxPool2d(kernel_size = 3, stride = 1, padding = 1)
        # self.layer0 = self._make_layer(block, 32, layers[0], stride = 1)
        # self.layer1 = self._make_layer(block, 64, layers[1], stride = 2)
        # self.layer2 = self._make_layer(block, 128, layers[2], stride = 2)
        # self.layer3 = self._make_layer(block, 256, layers[3], stride = 2)

        ### 1st attempt ###

        ### 2nd attempt ###
        # starting wide and narrowing down
        self.layer0 = self._make_layer(block, 256, layers[0], stride = 1)
        self.layer1 = self._make_layer(block, 128, layers[1], stride = 2)
        self.layer2 = self._make_layer(block, 64, layers[2], stride = 2)
        self.layer3 = self._make_layer(block, 256, layers[3], stride = 2)

        ### 2nd attempt ###

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256, num_classes)
        
    def _make_layer(self, block, planes, blocks, stride=1):
        """
          this function will help us construct residual blocks
        """
        downsample = None
        if stride != 1 or self.inplanes != planes:
            
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes, kernel_size=1, stride=stride),
                nn.BatchNorm2d(planes),
            )
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)
    
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.layer0(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

In [8]:
## set hyperparameters
num_classes = 10
num_epochs = 100
batch_size = 16
learning_rate = 0.1

# start an instance of the model
model = ResNet(ResidualBlock, [2, 2, 2, 2]).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 5e-4, momentum = 0.9)  
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

total_step = len(train_loader)

# taking a look at the model
print(model)

ResNet(
  (conv1): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (layer0): Sequential(
    (0): ResidualBlock(
      (conv1): Sequential(
        (0): Conv2d(32, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
      (conv2): Sequential(
        (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (downsample): Sequential(
        (0): Conv2d(32, 256, kernel_size=(1, 1), stride=(1, 1))
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU()
    )
    (1): ResidualBlock(
      (conv1): Sequential(
        (0): Conv2d(256

In [9]:
# check if the number of parameters is below 5M
def count_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)
  
print("Number of trainable parameters",count_parameters(model))

Number of trainable parameters 4761354


In [10]:
## Actual training process:
import gc
total_step = len(train_loader)
best_loss = float('inf')

for epoch in range(num_epochs):
    epoch_loss = 0
    for i, (images, labels) in enumerate(train_loader):  
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    
    
    scheduler.step()
    # storing the best model so far
    if epoch_loss < best_loss:
      best_loss = epoch_loss
      torch.save(model.state_dict(), 'model.pt')
      print("Found a better model")

    print('Epoch [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, epoch_loss/len(train_loader)))
    
    

Found a better model
Epoch [1/100], Loss: 1.7029
Found a better model
Epoch [2/100], Loss: 1.1979
Found a better model
Epoch [3/100], Loss: 0.9357
Found a better model
Epoch [4/100], Loss: 0.7810
Found a better model
Epoch [5/100], Loss: 0.6712
Found a better model
Epoch [6/100], Loss: 0.6100
Found a better model
Epoch [7/100], Loss: 0.5650
Found a better model
Epoch [8/100], Loss: 0.5389
Found a better model
Epoch [9/100], Loss: 0.5137
Found a better model
Epoch [10/100], Loss: 0.4901
Found a better model
Epoch [11/100], Loss: 0.4813
Found a better model
Epoch [12/100], Loss: 0.4680
Found a better model
Epoch [13/100], Loss: 0.4485
Found a better model
Epoch [14/100], Loss: 0.4446
Found a better model
Epoch [15/100], Loss: 0.4314
Found a better model
Epoch [16/100], Loss: 0.4239
Found a better model
Epoch [17/100], Loss: 0.4138
Found a better model
Epoch [18/100], Loss: 0.4092
Found a better model
Epoch [19/100], Loss: 0.4006
Found a better model
Epoch [20/100], Loss: 0.3926
Found a b

In [12]:
## Evaluation

model.load_state_dict(torch.load('model_test.pt')) # load the best performing model
model = model.to(device)

with torch.no_grad():
      correct = 0
      total = 0
      for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        #del images, labels, outputs

      print('Accuracy of the network on the {} test images: {} %'.format(10000, 100 * correct / total)) 

Accuracy of the network on the 10000 test images: 93.04 %
