In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import torch
import numpy as np
import time
import tqdm
from typing import Tuple
import torch.nn as nn
import torchvision.models as  models
from torch.utils.data import DataLoader
import torchvision.datasets.voc as voc
import torch.optim as optim
from torchvision import transforms
from torchvision.models import resnet18
import torch.utils.model_zoo as model_zoo

In [3]:
data_dir = '/content/drive/MyDrive/CS 444: DL for CV/Project/data/'
ckpt_dir = '/content/drive/MyDrive/CS 444: DL for CV/Project/checkpoints/'
object_categories = ['aeroplane', 'bicycle', 'bird', 'boat',
                     'bottle', 'bus', 'car', 'cat', 'chair',
                     'cow', 'diningtable', 'dog', 'horse',
                     'motorbike', 'person', 'pottedplant',
                     'sheep', 'sofa', 'train', 'tvmonitor']
num_classes = len(object_categories)
batch_size = 32
resnet_lr = 1e-5
fc_lr = 5e-3
num_epochs = 25

mean = [0.457342265910642, 0.4387686270106377, 0.4073427106250871]
std = [0.26753769276329037, 0.2638145880487105, 0.2776826934044154]

device = 'cuda' if torch.cuda.is_available() else 'cpu'
np.random.seed(1902)
torch.manual_seed(1902)

<torch._C.Generator at 0x7fbd304903b0>

## Data Pipeline

Download the PASCAL VOC dataset and create train and val data loaders.

In [4]:
class PascalVOC_Dataset(voc.VOCDetection):
    """Pascal VOC Detection Dataset"""
    def __init__(self, root, image_set='train', download=False, transform=None, target_transform=None):
        super().__init__(root, image_set=image_set, download=download, transform=transform, target_transform=target_transform)
    
    def __getitem__(self, index):
        return super().__getitem__(index)
    
    def __len__(self):
        return len(self.images)

In [5]:
def encode_labels(target):
    """Encode multiple labels using 1/0 encoding"""
    ls = target['annotation']['object']
    j = []
    if type(ls) == dict:
        if int(ls['difficult']) == 0:
            j.append(object_categories.index(ls['name']))
    else:
        for i in range(len(ls)):
            if int(ls[i]['difficult']) == 0:
                j.append(object_categories.index(ls[i]['name']))
    k = np.zeros(len(object_categories))
    k[j] = 1
    return torch.from_numpy(k)

In [6]:
transformations = transforms.Compose([transforms.Resize((300, 300)),
                                      transforms.ToTensor()])
transformations_valid = transforms.Compose([transforms.Resize(330), 
                                            transforms.CenterCrop(300), 
                                            transforms.ToTensor()])

In [7]:
dataset_train = PascalVOC_Dataset(data_dir,
                                  image_set='train', 
                                  download=False, 
                                  transform=transformations, 
                                  target_transform=encode_labels)
train_loader = DataLoader(dataset_train, batch_size=batch_size, num_workers=2, shuffle=True)

dataset_valid = PascalVOC_Dataset(data_dir, 
                                  image_set='val', 
                                  download=False, 
                                  transform=transformations_valid, 
                                  target_transform=encode_labels)
valid_loader = DataLoader(dataset_valid, batch_size=batch_size, num_workers=2)

## Define Model

In [8]:
net = resnet18(pretrained=True)
net.avgpool = torch.nn.AdaptiveAvgPool2d(1)
num_ftrs = net.fc.in_features
net.fc = torch.nn.Linear(num_ftrs, num_classes)
net = net.to(device)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:01<00:00, 42.0MB/s]


## Define Training Parameters

In [9]:
optimizer = optim.SGD([{'params': list(net.parameters())[:-1], 'lr': resnet_lr, 'momentum': 0.9},
                       {'params': list(net.parameters())[-1], 'lr': fc_lr, 'momentum': 0.9}])
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 12, eta_min=0, last_epoch=-1)
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')

In [10]:
class RandomMixup(torch.nn.Module):
    """Randomly apply Mixup to the provided batch and targets.
    The class implements the data augmentations as described in the paper
    `"mixup: Beyond Empirical Risk Minimization" <https://arxiv.org/abs/1710.09412>`_.
    Args:
        num_classes (int): number of classes used for one-hot encoding.
        p (float): probability of the batch being transformed. Default value is 0.5.
        alpha (float): hyperparameter of the Beta distribution used for mixup.
            Default value is 1.0.
        inplace (bool): boolean to make this transform inplace. Default set to False.
    """

    def __init__(self, num_classes: int, p: float = 0.5, alpha: float = 1.0, inplace: bool = False) -> None:
        super().__init__()

        if num_classes < 1:
            raise ValueError(
                f"Please provide a valid positive value for the num_classes. Got num_classes={num_classes}"
            )

        if alpha <= 0:
            raise ValueError("Alpha param can't be zero.")

        self.num_classes = num_classes
        self.p = p
        self.alpha = alpha
        self.inplace = inplace

    def forward(self, batch: torch.Tensor, target: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            batch (Tensor): Float tensor of size (B, C, H, W)
            target (Tensor): Integer tensor of size (B, )
        Returns:
            Tensor: Randomly transformed batch.
        """
        if batch.ndim != 4:
            raise ValueError(f"Batch ndim should be 4. Got {batch.ndim}")
        if target.ndim != 1:
            raise ValueError(f"Target ndim should be 1. Got {target.ndim}")
        if not batch.is_floating_point():
            raise TypeError(f"Batch dtype should be a float tensor. Got {batch.dtype}.")
        if target.dtype != torch.int64:
            raise TypeError(f"Target dtype should be torch.int64. Got {target.dtype}")

        if not self.inplace:
            batch = batch.clone()
            target = target.clone()

        if target.ndim == 1:
            target = torch.nn.functional.one_hot(target, num_classes=self.num_classes).to(dtype=batch.dtype)

        if torch.rand(1).item() >= self.p:
            return batch, target

        batch_rolled = batch.roll(1, 0)
        target_rolled = target.roll(1, 0)

        lambda_param = float(torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0])
        batch_rolled.mul_(1.0 - lambda_param)
        batch.mul_(lambda_param).add_(batch_rolled)

        target_rolled.mul_(1.0 - lambda_param)
        target.mul_(lambda_param).add_(target_rolled)

        return batch, target

    def __repr__(self) -> str:
        s = (
            f"{self.__class__.__name__}("
            f"num_classes={self.num_classes}"
            f", p={self.p}"
            f", alpha={self.alpha}"
            f", inplace={self.inplace}"
            f")"
        )
        return s

In [11]:
def run_test(net, test_loader, criterion):
    correct = 0
    total = 0
    avg_test_loss = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            
            outputs = net(images)
            predictions = torch.argmax(outputs, dim=1)
            labels = torch.argmax(labels, dim=1)
            correct += torch.sum(predictions == labels)
            total += labels.size(0)

    print(f'Accuracy of the network on the test images: {100 * correct / total:.2f} %')

In [12]:
def train(net, criterion, optimizer, num_epochs, print_freq = 100):
    for epoch in range(num_epochs):
        running_loss = 0.0
        running_correct = 0.0
        running_total = 0.0
        start_time = time.time()

        net.train()

        for i, (images, labels) in enumerate(train_loader, 0):
            images = images.to(device)
            labels = labels.to(device)
            images, labels = RandomMixup(num_classes, p=1, alpha=0.5)(images, torch.argmax(labels, dim=1))

            # Zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Get predicted results
            predicted = torch.argmax(outputs, dim=1)
            labels = torch.argmax(labels, dim=1)

            # print statistics
            running_loss += loss.item()

            # calculate accuracy
            running_total += labels.size(0)
            running_correct += (predicted == labels).sum().item()

            # print every 2000 mini-batches
            if i % print_freq == (print_freq - 1):
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / print_freq:.3f} acc: {100*running_correct / running_total:.2f} time: {time.time() - start_time:.2f}')
                running_loss, running_correct, running_total = 0.0, 0.0, 0.0
                start_time = time.time()

        # Run the run_test() function after each epoch
        net.eval()
        run_test(net, valid_loader, criterion)

In [13]:
train(net, criterion, optimizer, num_epochs=num_epochs)

save_dir = os.path.join(ckpt_dir, 'mixup.pt')
torch.save(net.state_dict(), save_dir)

[1,   100] loss: 137.396 acc: 14.81 time: 1275.59
Accuracy of the network on the test images: 32.20 %
[2,   100] loss: 105.691 acc: 35.44 time: 38.04
Accuracy of the network on the test images: 52.84 %
[3,   100] loss: 95.932 acc: 49.22 time: 38.28
Accuracy of the network on the test images: 59.90 %
[4,   100] loss: 87.598 acc: 55.44 time: 36.65
Accuracy of the network on the test images: 65.64 %
[5,   100] loss: 80.450 acc: 62.53 time: 39.13
Accuracy of the network on the test images: 68.49 %
[6,   100] loss: 79.759 acc: 61.81 time: 38.14
Accuracy of the network on the test images: 69.60 %
[7,   100] loss: 79.202 acc: 64.12 time: 38.28
Accuracy of the network on the test images: 71.63 %
[8,   100] loss: 75.742 acc: 66.16 time: 38.32
Accuracy of the network on the test images: 72.27 %
[9,   100] loss: 73.438 acc: 67.50 time: 36.51
Accuracy of the network on the test images: 72.92 %
[10,   100] loss: 72.040 acc: 67.81 time: 39.84
Accuracy of the network on the test images: 71.66 %
[11, 