
# ADRL Assignment 3 (Question 2)
## Team:
#### Anmol Anil Dhanuka (SR No: 21989)
#### Sharath C R (SR No: 21744)
#### Unity Chachei (SR No: 21916)

- Kaggle dataset for animal images used (Dataset has 16130 images of cats, dogs and wild animals)
- Objective:
    - Train MoCo Model on the dataset
    - Use the pretrained model and train a linear classifier

### Mount drive to load the dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Below class define the MoCo Model.

In [None]:
import torch
import torch.nn as nn

class MoCo(nn.Module):
    """
    MoCo model
    """
    def __init__(self, encoder, dim=128, K=128, m=0.999, T=0.07):
        super(MoCo, self).__init__()

        self.K = K
        self.m = m
        self.T = T

        # Create both query encoder and key encoder using resent18 model
        self.encoder_q = encoder(num_classes=dim)
        self.encoder_k = encoder(num_classes=dim)

        # Initialize the parameters of the model
        for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
            param_k.data.copy_(param_q.data)
            param_k.requires_grad = False

        # Create the queue
        self.register_buffer("queue", torch.randn(dim, K))
        self.queue = nn.functional.normalize(self.queue, dim=0)

        self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))

    @torch.no_grad()
    def _momentum_update_key_encoder(self):
        # Update the key encoder using momentum
        for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
            param_k.data = param_k.data * self.m + param_q.data * (1. - self.m)

    @torch.no_grad()
    def _dequeue_and_enqueue(self, keys):
        batch_size = keys.shape[0]

        ptr = int(self.queue_ptr)
        # To make to the address updates happen in interger locations
        assert self.K % batch_size == 0

        # To replace the keys
        self.queue[:, ptr:ptr + batch_size] = keys.T

        # To update the pointer
        ptr = (ptr + batch_size) % self.K

        self.queue_ptr[0] = ptr

    def forward(self, im_q, im_k):

        # compute query features
        q = self.encoder_q(im_q)  # queries: NxC
        q = nn.functional.normalize(q, dim=1)

        # compute key features
        with torch.no_grad():  # no gradient to keys
            self._momentum_update_key_encoder()  # update the key encoder
            k = self.encoder_k(im_k)  # keys: NxC
            k = nn.functional.normalize(k, dim=1)

        # compute logits
        # Einstein sum is more intuitive
        # positive logits: Nx1
        l_pos = torch.einsum('nc,nc->n', [q, k]).unsqueeze(-1)
        # negative logits: NxK
        l_neg = torch.einsum('nc,ck->nk', [q, self.queue.clone().detach()])

        # logits: Nx(1+K)
        logits = torch.cat([l_pos, l_neg], dim=1)

        # apply temperature
        logits /= self.T

        # labels: positive key indicators
        labels = torch.zeros(logits.shape[0], dtype=torch.long).cuda()

        # dequeue and enqueue
        self._dequeue_and_enqueue(k)

        return logits, labels

## Below 2 classes are added to create the data augmentation that is needed

In [None]:
from PIL import ImageFilter
import random


class TwoCropsTransform:
    """Take two random crops of one image as the query and key."""

    def __init__(self, base_transform):
        self.base_transform = base_transform

    def __call__(self, x):
        q = self.base_transform(x)
        k = self.base_transform(x)
        return [q, k]


class GaussianBlur(object):
    """Gaussian blur augmentation in SimCLR"""

    def __init__(self, sigma=[.1, 2.]):
        self.sigma = sigma

    def __call__(self, x):
        sigma = random.uniform(self.sigma[0], self.sigma[1])
        x = x.filter(ImageFilter.GaussianBlur(radius=sigma))
        return x

# Training of the MoCo Model is done below

In [None]:
#import argparse
import builtins
import math
import os
import random
import shutil
import time
import warnings

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models


model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))


data = '/content/drive/MyDrive/ADRL Assignment/data/'

arch = 'resnet18'
workers = 2
epochs = 100
start_epoch = 0
batch_size = 256
lr = 0.03
schedule = [120, 160]
momentum = 0.9
weight_decay = 1e-4
print_freq = 5
aug = False
seed = 123
gpu = 0

# moco specific configs:
moco_dim = 128
moco_k = 65536
moco_m = 0.999
moco_t = 0.07

def main():
    if seed is not None:
        random.seed(seed)
        torch.manual_seed(seed)
        cudnn.deterministic = True

    if gpu is not None:
        print("Using since GPU")
    ngpus_per_node = torch.cuda.device_count()
    main_worker(gpu, ngpus_per_node)


def main_worker(gpu, ngpus_per_node):
    # create model
    print("=> creating model '{}'".format(arch))
    model = MoCo(models.__dict__[arch],
                 moco_dim, moco_k, moco_m, moco_t, mlp)

    print(model)

    if gpu is not None:
        torch.cuda.set_device(gpu)
        model = model.cuda(gpu)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)

    optimizer = torch.optim.SGD(model.parameters(), lr,
                                momentum=momentum,
                                weight_decay=weight_decay)

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(data, 'train')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    if aug:
        augmentation = [
            transforms.RandomResizedCrop(224, scale=(0.2, 1.)),
            transforms.RandomApply([
                transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)
            ], p=0.8),
            transforms.RandomGrayscale(p=0.2),
            transforms.RandomApply([GaussianBlur([.1, 2.])], p=0.5),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize
        ]
    else:
        augmentation = [
            transforms.RandomResizedCrop(224, scale=(0.2, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize
        ]

    train_dataset = datasets.ImageFolder(
        traindir,
        TwoCropsTransform(transforms.Compose(augmentation)))

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    for epoch in range(epochs):
        adjust_learning_rate(optimizer, epoch, lr)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        if (epoch+1)%20 == 0:
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': arch,
                'state_dict': model.state_dict(),
                'optimizer' : optimizer.state_dict(),
            }, is_best=False, filename='checkpoint_{:04d}.pth.tar'.format(epoch))


def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    # Turn on train mode
    model.train()

    end = time.time()
    for i, (images, _) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if gpu is not None:
            images[0] = images[0].cuda(gpu, non_blocking=True)
            images[1] = images[1].cuda(gpu, non_blocking=True)

        output, target = model(im_q=images[0], im_k=images[1])
        loss = criterion(output, target)

        # acc1 and acc5 are (K+1)-way contrast classifier accuracy
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images[0].size(0))
        top1.update(acc1[0], images[0].size(0))
        top5.update(acc5[0], images[0].size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            progress.display(i)


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'


def adjust_learning_rate(optimizer, epoch, lr):
    if cos:  # cosine lr schedule
        lr *= 0.5 * (1. + math.cos(math.pi * epoch / epochs))
    else:  # stepwise lr schedule
        for milestone in schedule:
            lr *= 0.1 if epoch >= milestone else 1.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            #correct_k = correct[:k].view(-1).float().sum(0, keepdim=True
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


if __name__ == '__main__':
    main()



=> creating model 'resnet18'
MoCo(
  (encoder_q): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum

## Fine-Tune the MoCo Resnet to adapt it for a 3 class classification problem

In [None]:
import builtins
import os
import random
import shutil
import time
import warnings

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))



data = '/content/drive/MyDrive/ADRL Assignment/data/'
#data = '.\\archive\\afhq\\'
arch = 'resnet18'
workers = 1
epochs=100
start_epoch=0
batch_size=256
lr=30
schedule=[60, 80]
momentum =0.9

weight_decay=0
print_freq=20
evaluate = False

seed = 123
gpu = 0

pretrained = 'checkpoint_0099.pth.tar'
best_acc1 = 0

def main():

    if seed is not None:
        random.seed(seed)
        torch.manual_seed(seed)
        cudnn.deterministic = True

    main_worker(gpu)


def main_worker(gpu):
    global best_acc1
    # create model
    print("=> creating model '{}'".format(arch))
    model = models.__dict__[arch]()

    # freeze all layers except the last fc layer
    for name, param in model.named_parameters():
        if name not in ['fc.weight', 'fc.bias']:
            param.requires_grad = False
    # init the fc layer
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()

    # load from pre-trained model
    if pretrained:
        if os.path.isfile(pretrained):
            print("=> loading checkpoint '{}'".format(pretrained))
            checkpoint = torch.load(pretrained, map_location="cpu")
            state_dict = checkpoint['state_dict']
            for k in list(state_dict.keys()):
                # retain only encoder_q up to before the embedding layer
                if k.startswith('module.encoder_q') and not k.startswith('module.encoder_q.fc'):
                    state_dict[k[len("module.encoder_q."):]] = state_dict[k]
                del state_dict[k]

            start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)
            print("=> loaded pre-trained model '{}'".format(pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(pretrained))

    torch.cuda.set_device(gpu)
    model = model.cuda(gpu)

    criterion = nn.CrossEntropyLoss().cuda(gpu)

    # Linear Classifier Optimization
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = torch.optim.SGD(parameters, lr,
                                momentum=momentum,
                                weight_decay=weight_decay)

    cudnn.benchmark = True

    traindir = os.path.join(data, 'train')
    valdir = os.path.join(data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=batch_size, shuffle=False, drop_last=True)

    if evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(epochs):
        adjust_learning_rate(optimizer, epoch, lr)
        train(train_loader, model, criterion, optimizer, epoch)
        acc1 = validate(val_loader, model, criterion)
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if epoch%20 == 0:
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)


def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    model.eval()
    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        data_time.update(time.time() - end)

        if gpu is not None:
            images = images.cuda(gpu, non_blocking=True)
        target = target.cuda(gpu, non_blocking=True)

        output = model(images)
        loss = criterion(output, target)

        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            progress.display(i)


def validate(val_loader, model, criterion):
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            images = images.cuda(gpu, non_blocking=True)
            target = target.cuda(gpu, non_blocking=True)

            output = model(images)
            loss = criterion(output, target)

            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0:
                progress.display(i)

        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'


def adjust_learning_rate(optimizer, epoch, lr):
    for milestone in schedule:
        lr *= 0.1 if epoch >= milestone else 1.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


if __name__ == '__main__':
    main()



Use GPU: 0 for training
=> creating model 'resnet18'
=> loading checkpoint 'checkpoint_0099.pth.tar'
=> loaded pre-trained model 'checkpoint_0099.pth.tar'
Epoch: [0][ 0/57]	Time  2.373 ( 2.373)	Data  0.653 ( 0.653)	Loss 7.6482e+00 (7.6482e+00)	Acc@1   0.00 (  0.00)	Acc@5   0.00 (  0.00)
Epoch: [0][20/57]	Time  0.766 ( 0.792)	Data  0.626 ( 0.615)	Loss 1.7432e+05 (1.0701e+05)	Acc@1  35.55 ( 32.89)	Acc@5 100.00 ( 95.24)
Epoch: [0][40/57]	Time  0.675 ( 0.747)	Data  0.594 ( 0.608)	Loss 9.7622e+04 (9.6342e+04)	Acc@1  42.19 ( 33.08)	Acc@5 100.00 ( 97.56)
Test: [0/5]	Time  1.963 ( 1.963)	Loss 5.5082e+04 (5.5082e+04)	Acc@1   0.00 (  0.00)	Acc@5 100.00 (100.00)
 * Acc@1 21.875 Acc@5 100.000
Epoch: [1][ 0/57]	Time  0.704 ( 0.704)	Data  0.621 ( 0.621)	Loss 1.3691e+05 (1.3691e+05)	Acc@1  35.55 ( 35.55)	Acc@5 100.00 (100.00)
Epoch: [1][20/57]	Time  0.690 ( 0.705)	Data  0.605 ( 0.606)	Loss 3.2663e+04 (1.3314e+05)	Acc@1  40.62 ( 33.30)	Acc@5 100.00 (100.00)
Epoch: [1][40/57]	Time  0.731 ( 0.698)	Data 

# Fine-Tuning the MoCo model with less supervised data

In [None]:
import argparse
import builtins
import os
import random
import shutil
import time
import warnings

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from torch.utils.data import Subset

model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))



data = '/content/drive/MyDrive/ADRL Assignment/data/'
arch = 'resnet18'
workers = 1
epochs=5
start_epoch=0
batch_size=256
lr=30
schedule=[60, 80]
momentum =0.9
weight_decay=0
print_freq=20
evaluate = False
seed = 123
gpu = 0
pretrained = 'checkpoint_0099.pth.tar'
best_acc1 = 0


def main():

    if seed is not None:
        random.seed(seed)
        torch.manual_seed(seed)
        cudnn.deterministic = True

    main_worker(gpu)


def main_worker(gpu,):
    global best_acc1
    # create model
    print("=> creating model '{}'".format(arch))
    model = models.__dict__[arch]()

    # freeze all layers except the last fc
    for name, param in model.named_parameters():
        if name not in ['fc.weight', 'fc.bias']:
            param.requires_grad = False
    # init the fc layer
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()

    if pretrained:
        if os.path.isfile(pretrained):
            print("=> loading checkpoint '{}'".format(pretrained))
            checkpoint = torch.load(pretrained, map_location="cpu")

            state_dict = checkpoint['state_dict']
            for k in list(state_dict.keys()):
                # retain only encoder_q up to before the embedding layer
                if k.startswith('module.encoder_q') and not k.startswith('module.encoder_q.fc'):
                    state_dict[k[len("module.encoder_q."):]] = state_dict[k]
                # delete renamed or unused k
                del state_dict[k]

            start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)

            print("=> loaded pre-trained model '{}'".format(pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(pretrained))

    torch.cuda.set_device(gpu)
    model = model.cuda(gpu)

    criterion = nn.CrossEntropyLoss().cuda(gpu)

    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    assert len(parameters) == 2
    optimizer = torch.optim.SGD(parameters, lr,
                                momentum=momentum,
                                weight_decay=weight_decay)

    cudnn.benchmark = True
    traindir = os.path.join(data, 'train')
    valdir = os.path.join(data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(128),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    # Reducing training data
    subset_idx1 = list(range(1000))
    train_sub_data1 = Subset(train_dataset,subset_idx1)

    train_loader = torch.utils.data.DataLoader(
        train_sub_data1, batch_size=batch_size, shuffle=True, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(128),
            transforms.CenterCrop(128),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=batch_size, shuffle=False, drop_last=True)

    if evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(epochs):
        adjust_learning_rate(optimizer, epoch, lr)

        train(train_loader, model, criterion, optimizer, epoch)

        acc1 = validate(val_loader, model, criterion)

        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if epoch%20 == 0:
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)


def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    model.eval()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        data_time.update(time.time() - end)

        images = images.cuda(gpu, non_blocking=True)
        target = target.cuda(gpu, non_blocking=True)

        output = model(images)
        loss = criterion(output, target)

        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            progress.display(i)


def validate(val_loader, model, criterion):
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            images = images.cuda(gpu, non_blocking=True)
            target = target.cuda(gpu, non_blocking=True)

            output = model(images)
            loss = criterion(output, target)

            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0:
                progress.display(i)

        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')

class AverageMeter(object):
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'


def adjust_learning_rate(optimizer, epoch, lr):
    for milestone in schedule:
        lr *= 0.1 if epoch >= milestone else 1.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


if __name__ == '__main__':
    main()

GPU is set
=> creating model 'resnet18'
=> loading checkpoint 'checkpoint_0099.pth.tar'
=> loaded pre-trained model 'checkpoint_0099.pth.tar'
Epoch: [0][0/3]	Time  0.765 ( 0.765)	Data  0.527 ( 0.527)	Loss 7.7300e+00 (7.7300e+00)	Acc@1   0.00 (  0.00)	Acc@5   0.00 (  0.00)
Test: [0/5]	Time  0.591 ( 0.591)	Loss 0.0000e+00 (0.0000e+00)	Acc@1 100.00 (100.00)	Acc@5 100.00 (100.00)
 * Acc@1 39.062 Acc@5 39.062
Epoch: [1][0/3]	Time  0.608 ( 0.608)	Data  0.526 ( 0.526)	Loss 0.0000e+00 (0.0000e+00)	Acc@1 100.00 (100.00)	Acc@5 100.00 (100.00)
Test: [0/5]	Time  0.630 ( 0.630)	Loss 0.0000e+00 (0.0000e+00)	Acc@1 100.00 (100.00)	Acc@5 100.00 (100.00)
 * Acc@1 39.062 Acc@5 39.062
Epoch: [2][0/3]	Time  0.609 ( 0.609)	Data  0.518 ( 0.518)	Loss 0.0000e+00 (0.0000e+00)	Acc@1 100.00 (100.00)	Acc@5 100.00 (100.00)
Test: [0/5]	Time  0.585 ( 0.585)	Loss 0.0000e+00 (0.0000e+00)	Acc@1 100.00 (100.00)	Acc@5 100.00 (100.00)
 * Acc@1 39.062 Acc@5 39.062
Epoch: [3][0/3]	Time  0.587 ( 0.587)	Data  0.524 ( 0.524)	Lo

## Below is the Full Blown CNN built for classification on the same Dataset

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Subset
from tqdm import tqdm  # Import tqdm

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        #self.conv4 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 16 * 16, 1024)
        #self.fc2 = nn.Linear(512, 512)
        #self.fc3 = nn.Linear(4096, 1024)
        self.fc3 = nn.Linear(1024, 3)  # 3 classes

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))
        #x = self.pool(torch.relu(self.conv4(x)))
        x = x.view(-1, 64 * 16 * 16)
        x = torch.relu(self.fc1(x))
        #x = torch.relu(self.fc2(x))
        #x = torch.relu(self.fc3(x))
        x = self.fc3(x)
        return x

In [None]:
model = CNN().to(device)
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total Parameters: {}".format(total_params))

Total Parameters: 16804899


In [None]:
data_path = ".\\archive\\afhq\\"

## Training the CNN and testing the accuracy

In [None]:
# Load the dataset and apply transformations
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

train_dataset = ImageFolder(data_path+'train\\', transform=transform)
test_dataset = ImageFolder(data_path+'val\\', transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

# Initialize the CNN model and move it to GPU
model = CNN().to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
num_epochs = 10
print('Started training')
for epoch in range(num_epochs):
    running_loss = 0.0
    pbar = tqdm(enumerate(train_loader), total=len(train_loader))  # Initialize tqdm progress bar
    for i, data in pbar:
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)  # Move data to GPU
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if (i + 1) % 10 == 0:  # Print every 10 mini-batches
            pbar.set_description(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / 10:.4f}")
            running_loss = 0.0

print("Training finished!")

# Evaluate the model on the test set
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)  # Move data to GPU
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Accuracy on test set: {100 * accuracy:.2f}%")

Started training


Epoch [1/10], Loss: 0.2622: 100%|██████████| 458/458 [00:39<00:00, 11.60it/s]
Epoch [2/10], Loss: 0.1601: 100%|██████████| 458/458 [00:37<00:00, 12.06it/s]
Epoch [3/10], Loss: 0.0781: 100%|██████████| 458/458 [00:39<00:00, 11.58it/s]
Epoch [4/10], Loss: 0.0574: 100%|██████████| 458/458 [00:37<00:00, 12.07it/s]
Epoch [5/10], Loss: 0.0692: 100%|██████████| 458/458 [00:39<00:00, 11.46it/s]
Epoch [6/10], Loss: 0.0503: 100%|██████████| 458/458 [00:40<00:00, 11.35it/s]
Epoch [7/10], Loss: 0.0053: 100%|██████████| 458/458 [00:38<00:00, 12.04it/s]
Epoch [8/10], Loss: 0.0132: 100%|██████████| 458/458 [00:40<00:00, 11.20it/s]
Epoch [9/10], Loss: 0.0090: 100%|██████████| 458/458 [00:39<00:00, 11.49it/s]
Epoch [10/10], Loss: 0.0097: 100%|██████████| 458/458 [00:40<00:00, 11.32it/s]


Training finished!
Accuracy on test set: 96.80%


## CNN with lesser data

In [None]:
# Load the dataset and apply transformations
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

train_dataset = ImageFolder(data_path+'train\\', transform=transform)
test_dataset = ImageFolder(data_path+'val\\', transform=transform)

subset_idx = list(range(7000))
train_sub_data = Subset(train_dataset,subset_idx)

train_loader = DataLoader(train_sub_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)


# Initialize the CNN model and move it to GPU
model = CNN().to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
num_epochs = 5
print('Started training')
for epoch in range(num_epochs):
    running_loss = 0.0
    pbar = tqdm(enumerate(train_loader), total=len(train_loader))  # Initialize tqdm progress bar
    for i, data in pbar:
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)  # Move data to GPU
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if (i + 1) % 10 == 0:  # Print every 10 mini-batches
            pbar.set_description(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / 10:.4f}")
            running_loss = 0.0

print("Training finished!")

# Evaluate the model on the test set
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)  # Move data to GPU
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Accuracy on test set: {100 * accuracy:.2f}%")

Started training


Epoch [1/5], Loss: 0.1768: 100%|██████████| 219/219 [00:18<00:00, 12.03it/s]
Epoch [2/5], Loss: 0.0695: 100%|██████████| 219/219 [00:19<00:00, 11.46it/s]
Epoch [3/5], Loss: 0.0744: 100%|██████████| 219/219 [00:19<00:00, 11.40it/s]
Epoch [4/5], Loss: 0.0437: 100%|██████████| 219/219 [00:19<00:00, 11.47it/s]
Epoch [5/5], Loss: 0.0290: 100%|██████████| 219/219 [00:19<00:00, 11.42it/s]


Training finished!
Accuracy on test set: 65.40%


# Observations

- Pre Training Stage
  - MoCo Training
    - Used NCE by sampling mini batches from dictionary (queues)
    - Encoder for keys was updated using momentum of 0.999
    - Number of Epochs trained for: 100
    - Top 1 Accuracy: 26.17
    - Top 5 Accuracy: 79.69
    - Base encoder model was : resnet18
      - Note that the original paper used resnet50 as the base encoder. We neither had enough compute nor time to train with resnet50
    - 3 different augmentation techniques were used to generate new data samples while training

- Fine Tuning Stage:
  - Linear Classification:
    - All layers of pretrained model except the FC layer were frozen (No gradient updates during training)
    - The FC layer was initialized again.
    - Update of FC layer parameters was done using supervised training to classify the data. (Both full dataset and subset was tried)
    - Two different classifiers were trained using 14k datapoints and 1k datapoints to perform comparison of how MoCo helps fine tuning tasks down the line after the pretraining has been done.
    - Fine tuning stats:
      - Full Dataset:
        - Train Acc 1: 59.77
        - Test Acc 1: 62.969 (higher than the quoted values in the original paper as the number of classes in the dataset is significantly smaller)
        - Number of epochs trained: 100
      - Smaller Dataset:
        - Train Acc 1: 100.0
        - Test Acc 1: 39.06
        - Number of epochs trained: 5
        - Note that the because the train accuracy is 100 from the get go, the gradient updates are zero and hence no matter how many epochs we train it for, test accuracy will not improve. (It could be due to less number of classes)

- CNN Training and Testing:
  - A full blown CNN with similar number or trainable parameters was implemented.
  - With full data set, the CNN obtained a test accuracy of 96.88
  - With 7k datasample (~half of full data), CNN obtained a test accuracy of 65.4
