In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import torch
import numpy as np
import math
import time
import tqdm
from typing import Tuple
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as  models
from torch.utils.data import DataLoader
import torchvision.datasets.voc as voc
import torch.optim as optim
from torchvision import transforms
from torchvision.models import resnet18
import torch.utils.model_zoo as model_zoo

In [3]:
data_dir = '/content/drive/MyDrive/CS 444: DL for CV/Project/data/'
ckpt_dir = '/content/drive/MyDrive/CS 444: DL for CV/Project/checkpoints/'
object_categories = ['aeroplane', 'bicycle', 'bird', 'boat',
                     'bottle', 'bus', 'car', 'cat', 'chair',
                     'cow', 'diningtable', 'dog', 'horse',
                     'motorbike', 'person', 'pottedplant',
                     'sheep', 'sofa', 'train', 'tvmonitor']
num_classes = len(object_categories)
batch_size = 32
resnet_lr = 1e-5
fc_lr = 5e-3
num_epochs = 25

mean = [0.457342265910642, 0.4387686270106377, 0.4073427106250871]
std = [0.26753769276329037, 0.2638145880487105, 0.2776826934044154]

device = 'cuda' if torch.cuda.is_available() else 'cpu'
np.random.seed(1902)
torch.manual_seed(1902)

<torch._C.Generator at 0x7f774803a310>

## Data Pipeline

Download the PASCAL VOC dataset and create train and val data loaders.

In [4]:
class PascalVOC_Dataset(voc.VOCDetection):
    """Pascal VOC Detection Dataset"""
    def __init__(self, root, image_set='train', download=False, transform=None, target_transform=None):
        super().__init__(root, image_set=image_set, download=download, transform=transform, target_transform=target_transform)
    
    def __getitem__(self, index):
        return super().__getitem__(index)
    
    def __len__(self):
        return len(self.images)

In [5]:
def encode_labels(target):
    """Encode multiple labels using 1/0 encoding"""
    ls = target['annotation']['object']
    j = []
    if type(ls) == dict:
        if int(ls['difficult']) == 0:
            j.append(object_categories.index(ls['name']))
    else:
        for i in range(len(ls)):
            if int(ls[i]['difficult']) == 0:
                j.append(object_categories.index(ls[i]['name']))
    k = np.zeros(len(object_categories))
    k[j] = 1
    return torch.from_numpy(k)

In [6]:
transformations = transforms.Compose([transforms.Resize((300, 300)),
                                      transforms.ToTensor()])
transformations_valid = transforms.Compose([transforms.Resize(330), 
                                            transforms.CenterCrop(300), 
                                            transforms.ToTensor()])

In [7]:
dataset_train = PascalVOC_Dataset(data_dir,
                                  image_set='train', 
                                  download=False, 
                                  transform=transformations, 
                                  target_transform=encode_labels)
train_loader = DataLoader(dataset_train, batch_size=batch_size, num_workers=2, shuffle=True)

dataset_valid = PascalVOC_Dataset(data_dir, 
                                  image_set='val', 
                                  download=False, 
                                  transform=transformations_valid, 
                                  target_transform=encode_labels)
valid_loader = DataLoader(dataset_valid, batch_size=batch_size, num_workers=2)

## Define Model

In [8]:
net = resnet18(pretrained=True)
net.avgpool = torch.nn.AdaptiveAvgPool2d(1)
num_ftrs = net.fc.in_features
net.fc = torch.nn.Linear(num_ftrs, num_classes)
net = net.to(device)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 196MB/s]


## Define Training Parameters

In [9]:
optimizer = optim.SGD([{'params': list(net.parameters())[:-1], 'lr': resnet_lr, 'momentum': 0.9},
                       {'params': list(net.parameters())[-1], 'lr': fc_lr, 'momentum': 0.9}])
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 12, eta_min=0, last_epoch=-1)
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')

In [10]:
class Cutout(object):
    """Randomly mask out one or more patches from an image.
    Args:
        n_holes (int): Number of patches to cut out of each image.
        length (int): The length (in pixels) of each square patch.
    """
    def __init__(self, n_holes, length):
        self.n_holes = n_holes
        self.length = length

    def __call__(self, images):
        """
        Args:
            images (Tensor): Batch of images of size (B, C, H, W).
        Returns:
            Tensor: Images with n_holes of dimension length x length cut out of it.
        """
        for i in range(images.shape[0]):
            img = images[i]
            h = img.size(1)
            w = img.size(2)
            mask = np.ones((h, w), np.float32)

            for n in range(self.n_holes):
                y = np.random.randint(h)
                x = np.random.randint(w)

                len = np.random.randint(self.length[0], self.length[1])
                y1 = np.clip(y - len // 2, 0, h)
                y2 = np.clip(y + len // 2, 0, h)
                x1 = np.clip(x - len // 2, 0, w)
                x2 = np.clip(x + len // 2, 0, w)

                mask[y1: y2, x1: x2] = 0.

            mask = torch.from_numpy(mask).to(device)
            mask = mask.expand_as(img)
            img = img * mask
            images[i] = img

        return images

In [11]:
def run_test(net, test_loader, criterion):
    correct = 0
    total = 0
    avg_test_loss = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            
            outputs = net(images)
            predictions = torch.argmax(outputs, dim=1)
            labels = torch.argmax(labels, dim=1)
            correct += torch.sum(predictions == labels)
            total += labels.size(0)

    print(f'Accuracy of the network on the test images: {100 * correct / total:.2f} %')

In [12]:
def train(net, criterion, optimizer, num_epochs, print_freq = 100):
    for epoch in range(num_epochs):
        running_loss = 0.0
        running_correct = 0.0
        running_total = 0.0
        start_time = time.time()

        net.train()

        for i, (images, labels) in enumerate(train_loader, 0):
            images = images.to(device)
            labels = labels.to(device)
            images = Cutout(n_holes=1, length=[50, 150])(images)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Get predicted results
            predicted = torch.argmax(outputs, dim=1)
            labels = torch.argmax(labels, dim=1)

            # print statistics
            running_loss += loss.item()

            # calculate accuracy
            running_total += labels.size(0)
            running_correct += (predicted == labels).sum().item()

            # print every 2000 mini-batches
            if i % print_freq == (print_freq - 1):
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / print_freq:.3f} acc: {100*running_correct / running_total:.2f} time: {time.time() - start_time:.2f}')
                running_loss, running_correct, running_total = 0.0, 0.0, 0.0
                start_time = time.time()

        # Run the run_test() function after each epoch
        net.eval()
        run_test(net, valid_loader, criterion)

In [13]:
train(net, criterion, optimizer, num_epochs=num_epochs)

save_dir = os.path.join(ckpt_dir, 'cutout.pt')
torch.save(net.state_dict(), save_dir)

[1,   100] loss: 164.190 acc: 13.25 time: 1704.62
Accuracy of the network on the test images: 29.68 %
[2,   100] loss: 117.138 acc: 34.09 time: 38.20
Accuracy of the network on the test images: 51.61 %
[3,   100] loss: 96.738 acc: 49.25 time: 39.73
Accuracy of the network on the test images: 60.19 %
[4,   100] loss: 85.796 acc: 55.91 time: 43.21
Accuracy of the network on the test images: 63.11 %
[5,   100] loss: 77.894 acc: 60.56 time: 43.68
Accuracy of the network on the test images: 66.27 %
[6,   100] loss: 72.532 acc: 63.12 time: 43.75
Accuracy of the network on the test images: 66.65 %
[7,   100] loss: 69.144 acc: 64.47 time: 43.30
Accuracy of the network on the test images: 67.66 %
[8,   100] loss: 64.174 acc: 65.66 time: 42.44
Accuracy of the network on the test images: 68.18 %
[9,   100] loss: 61.402 acc: 67.34 time: 42.10
Accuracy of the network on the test images: 69.36 %
[10,   100] loss: 60.511 acc: 66.81 time: 42.87
Accuracy of the network on the test images: 69.00 %
[11, 