## Setup

In [1]:
# Mount Google Drive so that later cells can write their log files under
# /content/drive (Colab-only: this cell needs the google.colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os, torch, torchvision, tarfile, itertools, matplotlib, json
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import torchvision.transforms as tt
from torchvision.datasets.utils import download_url
from torchvision.datasets import ImageFolder, CIFAR100
from torch.utils.data import DataLoader, random_split, SubsetRandomSampler
from torchvision.utils import make_grid

In [None]:
# Directory on the mounted Drive where the tuning and final-training logs are written.
# NOTE(review): absolute, machine-specific path — adjust it before running elsewhere.
LOG_PATH = '/content/drive/Othercomputers/My Mac/School/Robotics'

## Dataset Preparation

In [4]:
# Download the dataset archive into the current working directory
# NOTE(review): the cells visible below load CIFAR-100 through torchvision's
# CIFAR100 class (which fetches its own archive into data/), so this
# download/extract step looks unused — confirm before keeping it.
dataset_url = "https://s3.amazonaws.com/fast-ai-imageclas/cifar100.tgz"
download_url(dataset_url, '.')

# Extract dataset into ./data
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only use it on archives from a trusted source.
with tarfile.open('./cifar100.tgz', 'r:gz') as tar:
   tar.extractall(path='./data')

Downloading https://s3.amazonaws.com/fast-ai-imageclas/cifar100.tgz to ./cifar100.tgz


100%|██████████| 169168619/169168619 [00:04<00:00, 38460062.45it/s]


In [5]:
# Data transforms (normalization & data augmentation)
# `stats` holds the per-channel (mean, std) pair handed to tt.Normalize.
# NOTE(review): these look like the standard ImageNet statistics rather than
# values computed from CIFAR-100 — confirm this is intended.
stats = ((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))

# Training pipeline: random crop with reflect padding and random horizontal
# flip as augmentation, then tensor conversion and in-place normalization.
train_tfms = tt.Compose([
    tt.RandomCrop(32, padding=4, padding_mode='reflect'),
    tt.RandomHorizontalFlip(),
    tt.ToTensor(),
    tt.Normalize(mean=stats[0], std=stats[1], inplace=True),
])

# Validation and test pipelines: no augmentation, only tensor conversion
# and normalization with the same statistics.
valid_tfms = tt.Compose([
    tt.ToTensor(),
    tt.Normalize(mean=stats[0], std=stats[1]),
])
test_tfms = tt.Compose([
    tt.ToTensor(),
    tt.Normalize(mean=stats[0], std=stats[1]),
])

In [6]:
# PyTorch datasets
# train_ds and valid_ds both wrap the official training split and differ only
# in their transform (augmented vs. plain); test_ds wraps the official test split.
train_ds = CIFAR100('data/', train=True, transform=train_tfms, download=True)
valid_ds = CIFAR100('data/', train=True, transform=valid_tfms, download=True)
test_ds = CIFAR100('data/', train=False, transform=test_tfms)

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to data/cifar-100-python.tar.gz


100%|██████████| 169001437/169001437 [00:04<00:00, 40649739.14it/s]


Extracting data/cifar-100-python.tar.gz to data/
Files already downloaded and verified


In [7]:
# Train and validation splits (samplers)
# train_ds and valid_ds wrap the same training images (they differ only in
# their transforms), so the split is expressed as two disjoint index lists.
num_train = len(train_ds)
indices = list(range(num_train))
split = int(np.floor(0.2 * num_train))  # number of samples held out (20%)

# Fixed seed so the shuffled order, and therefore the split, is repeatable
np.random.seed(42)
np.random.shuffle(indices)

# The first `split` shuffled indices become validation, the rest training
train_idx, valid_idx = indices[split:], indices[:split]
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

In [8]:
# PyTorch data loaders
def prepare_dataloader(train_ds, valid_ds, batch_size, device):
  """Build device-aware training and validation loaders.

  The module-level `train_sampler` and `valid_sampler` decide which samples
  each loader draws. The validation loader uses twice the training batch
  size. Both loaders are wrapped in DeviceDataLoader so every batch is moved
  to `device` as it is yielded.

  Returns a `(train_dl, valid_dl)` tuple.
  """
  shared_opts = dict(num_workers=2, pin_memory=True)

  train_loader = DataLoader(train_ds, batch_size, sampler=train_sampler, **shared_opts)
  valid_loader = DataLoader(valid_ds, batch_size*2, sampler=valid_sampler, **shared_opts)

  return DeviceDataLoader(train_loader, device), DeviceDataLoader(valid_loader, device)

## Training functions

In [9]:
def get_default_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

def to_device(data, device):
    """Move a tensor, or a (possibly nested) list/tuple of tensors, to `device`.

    Lists and tuples are processed recursively and always come back as lists.
    Anything else is moved with `.to(device, non_blocking=True)`.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

class DeviceDataLoader():
    """Thin wrapper that yields every batch of a dataloader on a target device."""

    def __init__(self, dl, device):
        # Keep references only; nothing is moved until iteration starts.
        self.dl = dl
        self.device = device

    def __len__(self):
        """Number of batches in the wrapped dataloader."""
        return len(self.dl)

    def __iter__(self):
        """Iterate over the wrapped dataloader, moving each batch to the device."""
        yield from (to_device(batch, self.device) for batch in self.dl)

In [10]:
def accuracy(outputs, labels):
    """Return the fraction of rows whose highest-scoring column equals the label.

    `outputs` is reduced along dim 1 to pick the predicted class per row.
    The result is a 0-dim tensor built from a Python float.
    """
    predicted = outputs.max(dim=1).indices
    n_correct = (predicted == labels).sum().item()
    return torch.tensor(n_correct / len(predicted))

class ImageClassificationBase(nn.Module):
    """Base class adding training/validation step helpers to a classifier.

    Subclasses provide `forward`; batches are `(images, labels)` pairs.
    """

    def training_step(self, batch):
        """Return the cross-entropy loss of the model on one training batch."""
        inputs, targets = batch
        return F.cross_entropy(self(inputs), targets)

    def validation_step(self, batch):
        """Return the detached loss and the accuracy for one validation batch."""
        inputs, targets = batch
        logits = self(inputs)
        return {
            'val_loss': F.cross_entropy(logits, targets).detach(),
            'val_acc': accuracy(logits, targets),
        }

    def validation_epoch_end(self, outputs):
        """Average the per-batch losses and accuracies into plain floats."""
        def _batch_mean(key):
            # Unweighted mean over batches, as in the per-batch dicts above.
            return torch.stack([step[key] for step in outputs]).mean().item()

        return {'val_loss': _batch_mean('val_loss'), 'val_acc': _batch_mean('val_acc')}

    def epoch_end(self, epoch, result):
        """Print a one-line summary of the epoch's metrics."""
        summary = "Epoch [{}], last_lr: {:.5f}, train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
            epoch, result['lrs'][-1], result['train_loss'], result['val_loss'], result['val_acc'])
        print(summary)

In [11]:
# Switch the model to eval mode before evaluating; gradients are disabled
# for the whole function by the decorator.
@torch.no_grad()
def evaluate(model, val_loader):
    """Run `model.validation_step` on every batch and aggregate the results.

    Returns whatever `model.validation_epoch_end` returns for the collected
    per-batch outputs.
    """
    model.eval()
    step_results = []
    for batch in val_loader:
        step_results.append(model.validation_step(batch))
    return model.validation_epoch_end(step_results)

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns None when the optimizer has no parameter groups.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)

def fit_one_cycle(epochs, max_lr, model, train_loader, val_loader,
                  weight_decay=0, grad_clip=None, opt_func=torch.optim.SGD):
    """Train `model` under a one-cycle learning-rate schedule, validating each epoch.

    Args:
        epochs: number of passes over `train_loader`.
        max_lr: peak learning rate given to OneCycleLR (also used as the
            optimizer's initial `lr` argument).
        model: module exposing `training_step(batch)` and `epoch_end(epoch, result)`,
            and usable with `evaluate`.
        train_loader: iterable of training batches; its `len()` is the number
            of scheduler steps per epoch.
        val_loader: iterable of batches passed to `evaluate` after every epoch.
        weight_decay: forwarded to the optimizer constructor.
        grad_clip: if truthy, gradients are clipped element-wise with
            `clip_grad_value_` to this value before each optimizer step.
        opt_func: optimizer class, called as
            `opt_func(params, max_lr, weight_decay=weight_decay)`.

    Returns:
        A list with one entry per epoch: the dict returned by `evaluate`,
        extended with 'train_loss' (mean batch loss) and 'lrs' (the learning
        rate recorded after every optimizer step).
    """
    torch.cuda.empty_cache()
    history = []

    # Set up custom optimizer with weight decay
    optimizer = opt_func(model.parameters(), max_lr, weight_decay=weight_decay)
    # Set up one-cycle learning rate scheduler (stepped once per batch)
    sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr, epochs=epochs,
                                                steps_per_epoch=len(train_loader))

    for epoch in range(epochs):
        # Training Phase
        model.train()
        train_losses = []
        lrs = []
        for batch in train_loader:
            loss = model.training_step(batch)
            # Keep only a detached copy for the epoch statistics; storing the
            # graph-attached tensor would keep every batch's autograd history
            # referenced until the end of the epoch.
            train_losses.append(loss.detach())
            loss.backward()

            # Gradient clipping
            if grad_clip:
                nn.utils.clip_grad_value_(model.parameters(), grad_clip)

            optimizer.step()
            optimizer.zero_grad()

            # Record the rate used for this step, then advance the schedule
            lrs.append(get_lr(optimizer))
            sched.step()

        # Validation phase
        result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        result['lrs'] = lrs
        model.epoch_end(epoch, result)
        history.append(result)
    return history

## Model (ResNet9) preparation

In [12]:
def conv_block(in_channels, out_channels, pool=False):
    """Build a 3x3 convolution -> batch norm -> ReLU block.

    The convolution uses padding 1, so it keeps the spatial size. When `pool`
    is true a 2x2 max-pool is appended, halving height and width.
    """
    stages = (
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
    )
    if pool:
        stages = stages + (nn.MaxPool2d(2),)
    return nn.Sequential(*stages)

class ResNet9(ImageClassificationBase):
    """Small residual CNN: four conv blocks, two residual stages and a linear head.

    The shape comments below assume a 3 x 32 x 32 input.
    """

    def __init__(self, in_channels, num_classes):
        super().__init__()
        # Stage 1: widen to 128 channels and halve the resolution
        self.conv1 = conv_block(in_channels, 64)            # 64 x 32 x 32
        self.conv2 = conv_block(64, 128, pool=True)         # 128 x 16 x 16
        # First residual branch (shape-preserving)
        self.res1 = nn.Sequential(
            conv_block(128, 128),
            conv_block(128, 128),
        )                                                   # 128 x 16 x 16

        # Stage 2: widen to 512 channels, halving the resolution twice
        self.conv3 = conv_block(128, 256, pool=True)        # 256 x 8 x 8
        self.conv4 = conv_block(256, 512, pool=True)        # 512 x 4 x 4
        # Second residual branch (shape-preserving)
        self.res2 = nn.Sequential(
            conv_block(512, 512),
            conv_block(512, 512),
        )                                                   # 512 x 4 x 4

        # Head: pool to 1x1, flatten, dropout, then one logit per class
        head = [
            nn.MaxPool2d(4),                                # 512 x 1 x 1
            nn.Flatten(),                                   # 512
            nn.Dropout(0.2),
            nn.Linear(512, num_classes),                    # num_classes
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, xb):
        """Return class logits for the image batch `xb`."""
        feats = self.conv2(self.conv1(xb))
        feats = self.res1(feats) + feats    # skip connection around res1
        feats = self.conv4(self.conv3(feats))
        feats = self.res2(feats) + feats    # skip connection around res2
        return self.classifier(feats)

## Hyperparameter selection

In [13]:
# Use CUDA (GPU) if available, otherwise fall back to the CPU.
# The bare `device` expression on the last line displays the selected device
# as the cell output.
device = get_default_device()
device

device(type='cuda')

In [14]:
# Search space for the grid search (described in this notebook as the
# hyperparameters recommended in the original paper). Each key maps to the
# list of candidate values to try.
HYPERPARAMETERS = dict(
    epochs=[50, 100, 150],
    weight_decay=[1e-4],
    max_learning_rate=[0.005, 0.01, 0.02],
    gradient_clip=[0.1],
    batch_size=[256, 384, 512],
)

# Expand the search space into one dict per combination (Cartesian product)
keys, values = zip(*HYPERPARAMETERS.items())
HYPERPARAMETERS_COMB = []
for combo in itertools.product(*values):
    HYPERPARAMETERS_COMB.append(dict(zip(keys, combo)))
print(len(HYPERPARAMETERS_COMB)) # 3 * 1 * 3 * 1 * 3 = 27

27


In [16]:
# Collects the training history of every tuning run, keyed by a string built
# from that run's hyperparameters. Kept in its own cell so that re-running
# the tuning cell does not reset results gathered so far.
hyperparams_scores = {}

In [17]:
# Do hyperparameter tuning and save logs to file.
# The log is opened in append mode, so re-running this cell adds to earlier logs.
with open(f'{LOG_PATH}/hyperparameter_selection.txt', 'a+') as f_log:

  # Go through all possible sets of hyperparameters
  for hyperparams in HYPERPARAMETERS_COMB:

    # Set hyperparameters
    epochs       = hyperparams['epochs']
    weight_decay = hyperparams['weight_decay']
    max_lr       = hyperparams['max_learning_rate']
    grad_clip    = hyperparams['gradient_clip']
    batch_size   = hyperparams['batch_size']

    # Get training and validation data splits
    train_dl, valid_dl = prepare_dataloader(train_ds, valid_ds, batch_size=batch_size, device=device)

    # Train model and evaluate.
    # The header includes the batch size: it is part of the grid, so without
    # it three different runs would be logged under identical headers.
    f_log.write(f'Epochs: {epochs}, Max LR: {max_lr}, Grad Clip: {grad_clip}, Weight Decay: {weight_decay}, Batch Size: {batch_size}, Optimizer: Adam.\n')
    model   = to_device(ResNet9(3, 100), device)
    history = fit_one_cycle(epochs, max_lr, model, train_dl, valid_dl,
                            grad_clip=grad_clip,
                            weight_decay=weight_decay,
                            opt_func=torch.optim.Adam)

    # Reduce history size: keep every 25th recorded learning rate per epoch
    for epoch in range(len(history)):
      history[epoch]['lrs'] = history[epoch]['lrs'][::25]

    # Save history to log file
    f_log.write(json.dumps(history, indent=2, separators=(',', ': ')).replace('],', '],\n'))
    f_log.write('\n\n\n')
    # Flush after every run so finished results survive an interrupted session
    f_log.flush()

    # Save score in dict. The key must contain every varied hyperparameter;
    # without the batch size, runs differing only in batch size would
    # overwrite each other.
    hyperparams_scores[f'{epochs}_{max_lr}_{grad_clip}_{weight_decay}_{batch_size}'] = history

## Final Training & Evaluation

In [21]:
# Best hyperparameters from above
epochs, weight_decay, max_lr, grad_clip, batch_size = 50, 1e-4, 0.005, 0.1, 256

# Prepare entire training dataset, i.e., combined train and validation data.
# Uses the same worker count as prepare_dataloader so that loading and
# augmentation run in background workers rather than in the main process.
train_dl = DataLoader(train_ds, batch_size, shuffle=True, num_workers=2, pin_memory=True)
train_dl = DeviceDataLoader(train_dl, device)

# Get official testing data
test_dl  = DataLoader(test_ds, batch_size*2, num_workers=2, pin_memory=True)
test_dl = DeviceDataLoader(test_dl, device)

# Train the model on optimal hyperparameters.
# The test loader is passed as the validation loader, so the val_loss/val_acc
# values printed and logged by this cell are test-set metrics.
model   = to_device(ResNet9(3, 100), device)
history = fit_one_cycle(epochs, max_lr, model, train_dl, test_dl,
                        grad_clip=grad_clip,
                        weight_decay=weight_decay,
                        opt_func=torch.optim.Adam)

# Save final training and testing logs (overwrites any previous final log)
with open(f'{LOG_PATH}/hyperparameter_selection_final.txt', 'w') as f_log:
  f_log.write(json.dumps(history, indent=2, separators=(',', ': ')).replace('],', '],\n'))

Epoch [0], last_lr: 0.00025, train_loss: 3.8838, val_loss: 3.2375, val_acc: 0.2187
Epoch [1], last_lr: 0.00041, train_loss: 3.1186, val_loss: 2.7381, val_acc: 0.3070
Epoch [2], last_lr: 0.00066, train_loss: 2.6670, val_loss: 2.5488, val_acc: 0.3564
Epoch [3], last_lr: 0.00099, train_loss: 2.3721, val_loss: 2.3741, val_acc: 0.3867
Epoch [4], last_lr: 0.00140, train_loss: 2.1548, val_loss: 2.0940, val_acc: 0.4450
Epoch [5], last_lr: 0.00186, train_loss: 1.9884, val_loss: 2.0853, val_acc: 0.4555
Epoch [6], last_lr: 0.00235, train_loss: 1.8283, val_loss: 2.0739, val_acc: 0.4729
Epoch [7], last_lr: 0.00285, train_loss: 1.6965, val_loss: 1.8550, val_acc: 0.5026
Epoch [8], last_lr: 0.00334, train_loss: 1.5720, val_loss: 1.8497, val_acc: 0.4990
Epoch [9], last_lr: 0.00380, train_loss: 1.4751, val_loss: 1.6644, val_acc: 0.5393
Epoch [10], last_lr: 0.00421, train_loss: 1.3964, val_loss: 1.7697, val_acc: 0.5295
Epoch [11], last_lr: 0.00454, train_loss: 1.3438, val_loss: 1.6846, val_acc: 0.5417
Ep