# Searching for Best Number of Blocks Per Layer

After seeing the results of our modified ResNet with residual blocks [1,1,1,1], we wanted to investigate whether increasing the number of blocks per layer would give a noticeable increase in performance. So we trained an additional five architectures with the following blocks per layer: [2,1,1,1], [1,2,1,1], [3,1,1,1], [2,2,1,1], and [1,1,2,1]. Some of these have fewer than 5M parameters and some have more, but we would have tried other techniques to reduce the number of parameters if the accuracy had significantly improved with these models.

Ultimately we chose to run further experimentation with [2,1,1,1] since it had fewer than 5M parameters, had test accuracy comparable to the other highest-performing models in this notebook, and its loss values suggested that we could continue to train and fine-tune beyond these initial 30 epochs to keep improving the accuracy.

In [1]:
!pip install torchsummary

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl.metadata (296 bytes)
Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1


In [2]:
import torch
import torchvision.transforms as transforms
from torchvision import datasets
import pickle
from PIL import Image
import torchsummary
import warnings
import os
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim



## Load the data

In [3]:
class TestData(torch.utils.data.Dataset):
    """Dataset over a pickled batch of unlabeled test images.

    The pickle file must contain a mapping with the byte-string keys
    ``b"data"`` (the images) and ``b"ids"`` (one identifier per image).
    Each item is returned as an ``(id, image)`` pair.
    """

    def __init__(self, file_path, transform=None):
        """Load the whole pickle at *file_path* and remember *transform*."""
        # NOTE: pickle.load executes arbitrary code from the file; only point
        # this at trusted data.
        with open(file_path, "rb") as handle:
            payload = pickle.load(handle)

        self.data = payload
        self.images = payload[b"data"]
        self.ids = payload[b"ids"]
        self.transform = transform

    def __len__(self):
        """Return the number of identifiers (one per sample)."""
        return len(self.ids)

    def __getitem__(self, index):
        """Return ``(id, image)`` for *index*, applying the transform if set."""
        flat_pixels = self.images[index]
        sample_id = self.ids[index]

        # Stored layout is channel-first (3, 32, 32); PIL wants
        # height x width x channel.
        hwc_pixels = flat_pixels.reshape(3, 32, 32).transpose(1, 2, 0)
        picture = Image.fromarray(hwc_pixels)

        if self.transform is None:
            return sample_id, picture
        return sample_id, self.transform(picture)


In [4]:
def augment_data(input_dim=(3, 32, 32)):
    """Build the image pipelines for training and for validation/test.

    Args:
        input_dim: Image dimensions; only ``input_dim[1]`` is read, as the
            output size of the random crop.

    Returns:
        ``(transform_train, transform_val_test)``. The training pipeline adds
        random horizontal flips, mild colour jitter and a padded random crop
        before tensor conversion and normalization; the validation/test
        pipeline only converts and normalizes.
    """
    # Per-channel mean / std passed to Normalize (presumably the CIFAR-10
    # training-set statistics -- confirm against the data source).
    channel_mean = (0.4914, 0.4822, 0.4465)
    channel_std = (0.2023, 0.1994, 0.2010)

    # Vertical flipping is not part of the augmentation.
    train_steps = [
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
        transforms.RandomCrop(input_dim[1], padding=4),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
    eval_steps = [
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]

    return transforms.Compose(train_steps), transforms.Compose(eval_steps)

In [5]:
# Build the CIFAR-10 training/validation loaders and the loader for the
# unlabeled test pickle. The commented-out lines are what is left of a
# `load_data()` wrapper; the statements now run at cell level and leave
# `trainloader`, `val_loader`, `testloader` and `classes` as globals.
# def load_data(input_dim=(3, 32, 32)):
#     import os

#     data_directory = os.path.dirname(__file__) + "/../data"
#     test_path = os.path.join(data_directory, "testdata", "cifar_test_nolabels.pkl")
# CIFAR-10 is downloaded into (or reused from) this directory.
data_directory = "./data"
input_dim = (3,32,32)
# Pickle of unlabeled test images, read by TestData.
test_path = "/kaggle/input/test-set/cifar_test_nolabels.pkl"

transform_train, transform_val_test = augment_data(input_dim)

# Training split with augmentation, reshuffled every epoch.
trainset = datasets.CIFAR10(
    root=data_directory, train=True, download=True, transform=transform_train
)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=2
)

# The `train=False` split of CIFAR-10 serves as the validation set; it gets
# the non-augmenting transform and a fixed order.
val_set = datasets.CIFAR10(
    root=data_directory, train=False, download=True, transform=transform_val_test
)
val_loader = torch.utils.data.DataLoader(
    val_set, batch_size=100, shuffle=False, num_workers=2
)

# Unlabeled test images, served one sample per batch in file order.
test_set = TestData(file_path=test_path, transform=transform_val_test)
testloader = torch.utils.data.DataLoader(
    test_set, batch_size=1, shuffle=False, num_workers=2
)

# Human-readable class names (presumably indexed by CIFAR-10 label id --
# verify before using them to decode predictions).
classes = (
    "plane",
    "car",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
)

#     return trainloader, val_loader, testloader, classes


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:02<00:00, 84125003.14it/s] 


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


# define model

In [8]:
class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers wrapped by a skip connection.

    Args:
        in_planes: Number of input channels.
        planes: Number of channels produced by both convolutions.
        stride: Stride of the first convolution (and of the projection
            shortcut, when one is needed).
    """

    # Channel multiplier applied to `planes` on the shortcut path.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        # Main path: only the first convolution applies `stride`.
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=1, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(planes)

        # Skip path: identity when shapes already agree, otherwise a 1x1
        # convolution + batch norm that matches channels and resolution.
        shapes_agree = stride == 1 and in_planes == out_planes
        if shapes_agree:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(
                    in_planes, out_planes, kernel_size=1, stride=stride, bias=False
                ),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = self.bn2(self.conv2(hidden))
        hidden += self.shortcut(x)
        return F.relu(hidden)

In [9]:
class MiniResNet(nn.Module):
    """ResNet-style 10-class classifier with a configurable depth per stage.

    Args:
        num_blocks: Four entries giving the number of residual blocks in
            stages 1-4 (64, 128, 256 and 512 channels respectively).
    """

    def __init__(self, num_blocks=(2, 2, 2, 2)):
        super().__init__()
        # Running channel count; _make_layer reads it and updates it.
        self.in_planes = 64

        # Stem: 3-channel input to 64 feature maps, resolution unchanged.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        # (channels, stride of the first block) for layer1..layer4.
        stage_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
        for stage_no, (width, first_stride) in enumerate(stage_specs, start=1):
            stage = self._make_layer(
                width, num_blocks[stage_no - 1], stride=first_stride
            )
            setattr(self, f"layer{stage_no}", stage)

        # Expects 512 features, i.e. a 1x1 map after pooling in forward().
        self.linear = nn.Linear(512, 10)

    def _make_layer(self, planes, num_blocks, stride):
        """Stack residual blocks; only the first one uses *stride*."""
        block_strides = [stride] + [1] * (num_blocks - 1)
        blocks = []
        for block_stride in block_strides:
            blocks.append(ResidualBlock(self.in_planes, planes, block_stride))
            self.in_planes = planes
        return nn.Sequential(*blocks)

    def forward(self, x):
        """Return the class logits for a batch of images."""
        features = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            features = stage(features)

        # 4x4 average pooling collapses the 4x4 map that a 32x32 input
        # produces after the three stride-2 stages.
        pooled = F.avg_pool2d(features, 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)

In [10]:
class EarlyStopper:
    """Signal when the validation loss has stopped improving.

    Args:
        patience: Number of "bad" checks tolerated before signalling a stop.
        min_delta: Margin above the best loss that a value must exceed to
            count as a bad check.
    """

    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        # Bad checks seen since the last new best loss.
        self.counter = 0
        # Best (lowest) validation loss seen so far.
        self.min_validation_loss = float("inf")

    def early_stop(self, validation_loss):
        """Record *validation_loss* and return True once patience runs out.

        A new minimum resets the counter. A loss above
        ``min_validation_loss + min_delta`` increments it. Anything in
        between leaves the state untouched.
        """
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        tolerated = self.min_validation_loss + self.min_delta
        if validation_loss > tolerated:
            self.counter += 1
            return self.counter >= self.patience

        return False


def get_optimizers(
    model,
    *,
    lr=0.01,
    momentum=0.9,
    weight_decay=5e-4,
    t_max=200,
    patience=10,
    min_delta=10,
):
    """Create the loss, optimizer, LR scheduler and early stopper for *model*.

    All hyperparameters are keyword-only and default to the values used
    throughout this notebook, so ``get_optimizers(model)`` is unchanged.

    Args:
        model: Module whose parameters are optimized.
        lr: Initial SGD learning rate.
        momentum: SGD momentum.
        weight_decay: L2 penalty passed to SGD.
        t_max: ``T_max`` of the cosine schedule. The training loops step the
            scheduler once per epoch, so a run shorter than ``t_max`` epochs
            only covers the beginning of the cosine curve.
        patience: Bad checks the early stopper tolerates.
        min_delta: Margin above the best validation loss before a check
            counts as bad. NOTE: the default of 10 is far larger than the
            losses logged in this notebook, so early stopping effectively
            never triggers unless a smaller value is passed.

    Returns:
        ``(criterion, optimizer, scheduler, early_stopper)``.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(
        model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max)
    early_stopper = EarlyStopper(patience=patience, min_delta=min_delta)

    return criterion, optimizer, scheduler, early_stopper

In [86]:
def count_parameters(model):
    """Return the total element count of *model*'s trainable parameters.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Define Train, Test, Infer functions

In [18]:
import torch.backends.cudnn as cudnn

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [25]:
device

'cuda'

In [20]:
# Number of training epochs (the sweep cell further down sets the same value again).
epochs = 30

In [17]:
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimization pass over *train_loader*.

    Args:
        model: Module to train; switched to training mode.
        train_loader: Iterable of ``(inputs, targets)`` batches.
        criterion: Loss taking ``(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        device: Device that each batch is moved to.

    Returns:
        ``(loss_sum, correct, total)``: the sum of the per-batch loss values
        (not yet averaged), the number of correct top-1 predictions and the
        number of samples seen.
    """
    model.train()
    loss_sum = 0
    hits = 0
    seen = 0

    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        # Index of the largest logit along dim 1 is the predicted class.
        predictions = logits.max(1).indices
        seen += targets.size(0)
        hits += predictions.eq(targets).sum().item()

    return loss_sum, hits, seen


In [22]:
def train(
    model,
    train_loader,
    test_loader,
    epochs,
    criterion,
    optimizer,
    scheduler,
    early_stopper,
    device,
):
    """Train *model* for up to *epochs* epochs, evaluating after each one.

    After every epoch the averaged losses and accuracies are printed and the
    scheduler is stepped. A checkpoint is written to
    ``./checkpoint/ckpt.pth`` every 10th epoch (0, 10, 20, ...) and when the
    early stopper fires; training ends as soon as the early stopper fires.

    Args:
        model: Module to train.
        train_loader: Batches used for optimization.
        test_loader: Batches used for the per-epoch evaluation.
        epochs: Maximum number of epochs.
        criterion: Loss function.
        optimizer: Optimizer stepped by ``train_epoch``.
        scheduler: LR scheduler, stepped once per epoch.
        early_stopper: Object whose ``early_stop(loss)`` returns True when
            training should stop.
        device: Device the batches are moved to.

    Returns:
        ``(train_loss_history, train_acc_history, test_loss_history,
        test_acc_history)``, one entry per completed epoch.
    """
    train_loss_history = []
    train_acc_history = []
    test_loss_history = []
    test_acc_history = []

    for epoch in range(epochs):
        train_loss, train_correct, train_total = train_epoch(
            model, train_loader, criterion, optimizer, device
        )
        test_loss, test_correct, test_total = test(
            model, test_loader, criterion, device
        )

        # The helpers return summed per-batch losses; average over batches.
        train_loss = train_loss / len(train_loader)
        test_loss = test_loss / len(test_loader)

        train_acc = train_correct / train_total
        test_acc = test_correct / test_total

        train_loss_history.append(train_loss)
        test_loss_history.append(test_loss)
        train_acc_history.append(train_acc)
        test_acc_history.append(test_acc)

        print(
            f"Epoch {epoch + 1}, Train loss {train_loss:.3f}, Test loss {test_loss:.3f}, Train Accuracy: {train_acc:.3f}, Test Accuracy: {test_acc:.3f}"
        )
        scheduler.step()

        # Query the stopper on every epoch. Folding this call into an `or`
        # after the epoch check would skip it on checkpoint epochs and leave
        # its best-loss bookkeeping stale.
        should_stop = early_stopper.early_stop(test_loss)

        if epoch % 10 == 0 or should_stop:
            state = {
                "epoch": epoch,
                "state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "loss": test_loss,
            }
            # Create the directory if needed, then always save. Saving only
            # when the directory was missing would write a single checkpoint.
            os.makedirs("checkpoint", exist_ok=True)
            torch.save(state, "./checkpoint/ckpt.pth")

        if should_stop:
            print(f"Early stopping after epoch {epoch + 1}")
            break

    return train_loss_history, train_acc_history, test_loss_history, test_acc_history

In [18]:
def test(model, test_loader, criterion, device):
    """Evaluate *model* on *test_loader* without tracking gradients.

    Args:
        model: Module to evaluate; switched to eval mode.
        test_loader: Iterable of ``(inputs, targets)`` batches.
        criterion: Loss taking ``(outputs, targets)``.
        device: Device that each batch is moved to.

    Returns:
        ``(loss_sum, correct, total)``: the sum of the per-batch loss values
        (not yet averaged), the number of correct top-1 predictions and the
        number of samples seen.
    """
    model.eval()
    loss_sum, hits, seen = 0, 0, 0

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            logits = model(inputs)
            loss_sum += criterion(logits, targets).item()

            # Index of the largest logit along dim 1 is the predicted class.
            predictions = logits.max(1).indices
            seen += targets.size(0)
            hits += predictions.eq(targets).sum().item()

    return loss_sum, hits, seen

In [36]:
def infer(model, test_loader, criterion, device):
    """Predict a label for every sample in *test_loader*.

    Inference runs under ``torch.no_grad()`` so no autograd graph is built,
    and batches of any size are supported (one output row per sample).

    Args:
        model: Module used for prediction; switched to eval mode.
        test_loader: Sized iterable of ``(ids, images)`` batches.
        criterion: Unused; kept so existing call sites keep working.
        device: Device that each image batch is moved to.

    Returns:
        A ``pandas.DataFrame`` with the columns ``"ID"`` and ``"Labels"``,
        in loader order. An empty loader gives an empty frame that still
        has both columns.
    """
    print(len(test_loader))
    model.eval()

    results = []

    with torch.no_grad():
        for ids, images in test_loader:
            images = images.to(device)
            predicted = model(images).max(1).indices

            # Flatten both to plain Python scalars so every sample in the
            # batch gets its own row.
            id_values = torch.as_tensor(ids).reshape(-1).tolist()
            label_values = predicted.reshape(-1).tolist()
            for sample_id, label in zip(id_values, label_values):
                results.append({"ID": sample_id, "Labels": label})

    return pd.DataFrame(results, columns=["ID", "Labels"])

# Sweep to test different numbers of Residual Blocks per layer

In [47]:
import numpy as np

In [90]:
# Residual-block counts per stage for each architecture in the sweep; one row per model.
num_blocks_list = np.array([[2,1,1,1],[1,2,1,1],[3,1,1,1],[2,2,1,1],[1,1,2,1]])
# Checkpoint name prefixes, listed in the same order as the rows above.
model_names = ['MiniResNet2111','MiniResNet1211','MiniResNet3111','MiniResNet2211','MiniResNet1121']
# Bare expression: shows the number of configurations as the cell output.
len(num_blocks_list)

5

In [92]:
# Preview the checkpoint file path associated with each sweep configuration.
for idx, num_blocks in enumerate(num_blocks_list):
    print(f"/kaggle/working/checkpoint/{model_names[idx]}ckpt.pth")

/kaggle/working/checkpoint/MiniResNet2111ckpt.pth
/kaggle/working/checkpoint/MiniResNet1211ckpt.pth
/kaggle/working/checkpoint/MiniResNet3111ckpt.pth
/kaggle/working/checkpoint/MiniResNet2211ckpt.pth
/kaggle/working/checkpoint/MiniResNet1121ckpt.pth


In [93]:
def train_logging(
    model,
    train_loader,
    test_loader,
    epochs,
    criterion,
    optimizer,
    scheduler,
    early_stopper,
    device,
    model_names,
    model_idx=None,
    checkpoint_dir="/kaggle/working/checkpoint",
):
    """Train *model*, checkpoint it under its own name, return the curves.

    After every epoch the averaged losses and accuracies are printed and the
    scheduler is stepped. A checkpoint is written to
    ``<checkpoint_dir>/<name>ckpt.pth`` every 10th epoch (0, 10, 20, ...) and
    when the early stopper fires; training ends as soon as it fires.

    Args:
        model: Module to train.
        train_loader: Batches used for optimization.
        test_loader: Batches used for the per-epoch evaluation.
        epochs: Maximum number of epochs.
        criterion: Loss function.
        optimizer: Optimizer stepped by ``train_epoch``.
        scheduler: LR scheduler, stepped once per epoch.
        early_stopper: Object whose ``early_stop(loss)`` returns True when
            training should stop.
        device: Device the batches are moved to.
        model_names: Either the checkpoint name prefix itself (a ``str``) or
            a sequence of prefixes to pick from.
        model_idx: Index into *model_names* when it is a sequence. When left
            as None, the module-level loop variable ``idx`` is used, which is
            what existing call sites rely on.
        checkpoint_dir: Directory receiving the checkpoint; created on demand.

    Returns:
        ``(train_loss_history, train_acc_history, test_loss_history,
        test_acc_history)``, one entry per completed epoch.
    """
    # Resolve the checkpoint name up front so a bad name or index fails
    # before any training time is spent.
    if isinstance(model_names, str):
        model_name = model_names
    else:
        if model_idx is None:
            # Legacy behaviour: fall back to the global sweep index.
            model_idx = idx
        model_name = model_names[model_idx]
    checkpoint_path = os.path.join(checkpoint_dir, model_name + "ckpt.pth")

    train_loss_history = []
    train_acc_history = []
    test_loss_history = []
    test_acc_history = []

    for epoch in range(epochs):
        train_loss, train_correct, train_total = train_epoch(
            model, train_loader, criterion, optimizer, device
        )
        test_loss, test_correct, test_total = test(
            model, test_loader, criterion, device
        )

        # The helpers return summed per-batch losses; average over batches.
        train_loss = train_loss / len(train_loader)
        test_loss = test_loss / len(test_loader)

        train_acc = train_correct / train_total
        test_acc = test_correct / test_total

        train_loss_history.append(train_loss)
        test_loss_history.append(test_loss)
        train_acc_history.append(train_acc)
        test_acc_history.append(test_acc)

        print(
            f"Epoch {epoch + 1}, Train loss {train_loss:.3f}, Test loss {test_loss:.3f}, Train Accuracy: {train_acc:.3f}, Test Accuracy: {test_acc:.3f}"
        )
        scheduler.step()

        # Query the stopper on every epoch. Folding this call into an `or`
        # after the epoch check would skip it on checkpoint epochs and leave
        # its best-loss bookkeeping stale.
        should_stop = early_stopper.early_stop(test_loss)

        if epoch % 10 == 0 or should_stop:
            state = {
                "epoch": epoch,
                "state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "loss": test_loss,
            }
            # Create the target directory itself (not a relative
            # "checkpoint") and always save. Saving only when the directory
            # was missing would write a single checkpoint for the whole sweep.
            os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save(state, checkpoint_path)

        if should_stop:
            print(f"Early stopping after epoch {epoch + 1}")
            break

    return train_loss_history, train_acc_history, test_loss_history, test_acc_history

In [94]:
# Sweep driver: train one MiniResNet per block configuration and collect
# its per-epoch curves. Entry i of every list belongs to num_blocks_list[i].
val_accuracy_hist_list = []
train_accuracy_hist_list = []
val_loss_hist_list = []
train_loss_hist_list = []

device = 'cuda' if torch.cuda.is_available() else 'cpu'
epochs = 30

# NOTE: `idx` must stay the loop variable name; train_logging picks the
# checkpoint name for the current model through it.
for idx, num_blocks in enumerate(num_blocks_list):
    # Fresh model and optimizer state per configuration. `.to(device)`
    # (rather than `.cuda()`) keeps the CPU fallback above usable.
    model = MiniResNet(num_blocks=num_blocks).to(device)
    criterion, optimizer, scheduler, early_stopper = get_optimizers(model)
    num_blocks_string = np.array2string(num_blocks)

    print(
        f"Training model with num_blocks=: {num_blocks_string} and total trainable parameters: {count_parameters(model):d} "
    )

    # The CIFAR-10 validation loader doubles as the per-epoch "test" set.
    train_loss_history, train_acc_history, test_loss_history, test_acc_history = train_logging(
        model,
        trainloader,
        val_loader,
        epochs,
        criterion,
        optimizer,
        scheduler,
        early_stopper,
        device,
        model_names,
    )

    val_accuracy_hist_list.append(test_acc_history)
    train_accuracy_hist_list.append(train_acc_history)
    val_loss_hist_list.append(test_loss_history)
    train_loss_hist_list.append(train_loss_history)

    # One more evaluation pass of the final weights on the validation set.
    valid_loss, valid_correct, valid_total = test(model, val_loader, criterion, device)

    final_val_acc = valid_correct / valid_total

    print(
        f"Evaluating final validation accuracy: {final_val_acc:.3f}"
    )
    

Training model with num_blocks=: [2 1 1 1] and total trainable parameters: 4977226 
Epoch 1, Train loss 1.475, Test loss 1.410, Train Accuracy: 0.457, Test Accuracy: 0.534
Epoch 2, Train loss 1.015, Test loss 1.056, Train Accuracy: 0.638, Test Accuracy: 0.653
Epoch 3, Train loss 0.816, Test loss 0.865, Train Accuracy: 0.713, Test Accuracy: 0.708
Epoch 4, Train loss 0.692, Test loss 0.752, Train Accuracy: 0.756, Test Accuracy: 0.744
Epoch 5, Train loss 0.609, Test loss 0.633, Train Accuracy: 0.788, Test Accuracy: 0.791
Epoch 6, Train loss 0.554, Test loss 0.612, Train Accuracy: 0.807, Test Accuracy: 0.803
Epoch 7, Train loss 0.508, Test loss 0.632, Train Accuracy: 0.824, Test Accuracy: 0.783
Epoch 8, Train loss 0.472, Test loss 0.643, Train Accuracy: 0.835, Test Accuracy: 0.794
Epoch 9, Train loss 0.440, Test loss 0.521, Train Accuracy: 0.848, Test Accuracy: 0.825
Epoch 10, Train loss 0.417, Test loss 0.498, Train Accuracy: 0.856, Test Accuracy: 0.826
Epoch 11, Train loss 0.390, Test lo