In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import torch
import numpy as np
import time
import tqdm

import torch.nn as nn
import torchvision.models as  models
from torch.utils.data import DataLoader
import torchvision.datasets.voc as voc
import torch.optim as optim
from torchvision import transforms
from torchvision.models import resnet18
import torch.utils.model_zoo as model_zoo

In [3]:
data_dir = '/content/drive/MyDrive/CS 444: DL for CV/Project/data'
ckpt_dir = '/content/drive/MyDrive/CS 444: DL for CV/Project/checkpoints'
object_categories = ['aeroplane', 'bicycle', 'bird', 'boat',
                     'bottle', 'bus', 'car', 'cat', 'chair',
                     'cow', 'diningtable', 'dog', 'horse',
                     'motorbike', 'person', 'pottedplant',
                     'sheep', 'sofa', 'train', 'tvmonitor']
num_classes = len(object_categories)
batch_size = 32
resnet_lr = 1e-4
fc_lr = 5e-3
num_epochs = 35

mean = [0.457342265910642, 0.4387686270106377, 0.4073427106250871]
std = [0.26753769276329037, 0.2638145880487105, 0.2776826934044154]

device = 'cuda' if torch.cuda.is_available() else 'cpu'
np.random.seed(1902)
torch.manual_seed(1902)

<torch._C.Generator at 0x7f7490302410>

## Data Pipeline

Download the PASCAL VOC dataset and create train and val data loaders.

In [4]:
class PascalVOC_Dataset(voc.VOCDetection):
    """Pascal VOC Detection Dataset"""
    def __init__(self, root, image_set='train', download=False, transform=None, target_transform=None):
        super().__init__(root, image_set=image_set, download=download, transform=transform, target_transform=target_transform)
    
    def __getitem__(self, index):
        return super().__getitem__(index)
    
    def __len__(self):
        return len(self.images)

In [5]:
def encode_labels(target):
    """Encode multiple labels using 1/0 encoding"""
    ls = target['annotation']['object']
    j = []
    if type(ls) == dict:
        if int(ls['difficult']) == 0:
            j.append(object_categories.index(ls['name']))
    else:
        for i in range(len(ls)):
            if int(ls[i]['difficult']) == 0:
                j.append(object_categories.index(ls[i]['name']))
    k = np.zeros(len(object_categories))
    k[j] = 1
    return torch.from_numpy(k)

In [6]:
transformations = transforms.Compose([transforms.Resize((300, 300)),
                                      transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.IMAGENET),
                                      transforms.ToTensor(), 
                                      transforms.Normalize(mean=mean, std=std)])
transformations_valid = transforms.Compose([transforms.Resize(330), 
                                            transforms.CenterCrop(300), 
                                            transforms.ToTensor(), 
                                            transforms.Normalize(mean=mean, std=std)])

In [7]:
dataset_train = PascalVOC_Dataset(data_dir,
                                  image_set='train', 
                                  download=False, 
                                  transform=transformations, 
                                  target_transform=encode_labels)
train_loader = DataLoader(dataset_train, batch_size=batch_size, num_workers=2, shuffle=True)

dataset_valid = PascalVOC_Dataset(data_dir, 
                                  image_set='val', 
                                  download=False, 
                                  transform=transformations_valid, 
                                  target_transform=encode_labels)
valid_loader = DataLoader(dataset_valid, batch_size=batch_size, num_workers=2)

## Define Model

In [8]:
net = resnet18(pretrained=True)
net.avgpool = torch.nn.AdaptiveAvgPool2d(1)
num_ftrs = net.fc.in_features
net.fc = torch.nn.Linear(num_ftrs, num_classes)
net = net.to(device)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 353MB/s]


## Define Training Parameters

In [9]:
optimizer = optim.SGD([{'params': list(net.parameters())[:-1], 'lr': resnet_lr, 'momentum': 0.9},
                       {'params': list(net.parameters())[-1], 'lr': fc_lr, 'momentum': 0.9}])
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 12, eta_min=0, last_epoch=-1)
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')

In [10]:
def run_test(net, test_loader, criterion):
    correct = 0
    total = 0
    avg_test_loss = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            
            outputs = net(images)
            predictions = torch.argmax(outputs, dim=1)
            labels = torch.argmax(labels, dim=1)
            correct += torch.sum(predictions == labels)
            total += labels.size(0)

    print(f'Accuracy of the network on the test images: {100 * correct / total:.2f} %')

In [11]:
def train(net, criterion, optimizer, num_epochs, print_freq=100):
    for epoch in range(num_epochs):
        running_loss = 0.0
        running_correct = 0.0
        running_total = 0.0
        start_time = time.time()

        net.train()

        for i, (images, labels) in enumerate(train_loader, 0):
            images = images.to(device)
            labels = labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Get predicted results
            predicted = torch.argmax(outputs, dim=1)
            labels = torch.argmax(labels, dim=1)

            # print statistics
            running_loss += loss.item()

            # calculate accuracy
            running_total += labels.size(0)
            running_correct += (predicted == labels).sum().item()

            # print every 2000 mini-batches
            if i % print_freq == (print_freq - 1):
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / print_freq:.3f} acc: {100*running_correct / running_total:.2f} time: {time.time() - start_time:.2f}')
                running_loss, running_correct, running_total = 0.0, 0.0, 0.0
                start_time = time.time()

        # Run the run_test() function after each epoch
        net.eval()
        run_test(net, valid_loader, criterion)

In [12]:
train(net, criterion, optimizer, num_epochs=num_epochs)

save_dir = os.path.join(ckpt_dir, 'autoaugment.pt')
torch.save(net.state_dict(), save_dir)

[1,   100] loss: 127.022 acc: 35.84 time: 2753.53
Accuracy of the network on the test images: 68.83 %
[2,   100] loss: 66.279 acc: 64.31 time: 42.16
Accuracy of the network on the test images: 71.10 %
[3,   100] loss: 55.791 acc: 67.28 time: 43.15
Accuracy of the network on the test images: 69.88 %
[4,   100] loss: 49.486 acc: 69.25 time: 42.90
Accuracy of the network on the test images: 71.60 %
[5,   100] loss: 45.251 acc: 71.22 time: 43.66
Accuracy of the network on the test images: 70.69 %
[6,   100] loss: 38.967 acc: 72.81 time: 43.46
Accuracy of the network on the test images: 72.14 %
[7,   100] loss: 34.889 acc: 73.09 time: 43.37
Accuracy of the network on the test images: 71.06 %
[8,   100] loss: 29.837 acc: 75.56 time: 42.95
Accuracy of the network on the test images: 71.53 %
[9,   100] loss: 26.737 acc: 76.16 time: 43.04
Accuracy of the network on the test images: 69.48 %
[10,   100] loss: 25.455 acc: 75.06 time: 42.97
Accuracy of the network on the test images: 71.75 %
[11,  