# Assignment 03

In [1]:
import os
import shutil
import utils
import numpy as np
import random
import importlib
importlib.reload(utils)

from tqdm import tqdm
import torch
import torch.nn as nn
import torchvision.transforms as T
from torchvision import datasets, models
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
utils.set_random_seed()

In [3]:
### Setting up "constants", num Labels for layer sizes, and network outputs
NUM_LABELS = 196
BATCH_SIZE = 24
LR = 3e-4
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_EPOCHS = 25

### **Augmentations**

For your experiments, use augmentations from the following types:
- Spatial Augmentations (rotation, mirroring, croppoing, ...)
- Use some other augmentations (color jitter, gaussian noise, ...).
- Use one (or more) of the following advanced augmentations:
   - **CutMix**: https://arxiv.org/pdf/1905.04899.pdf
   - **Mixup**: https://arxiv.org/pdf/1710.09412.pdf

In [4]:
# these spatial transforms seemed to work better than with color transforms

simple_transforms = T.Compose([
    T.ToTensor(),
    T.Resize((224,224)),
    T.RandomHorizontalFlip(p=0.25),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    # T.RandomResizedCrop(224),
    # T.GaussianBlur(kernel_size=(5,5)),
    # T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2)
])

resize_test = T.Compose([
    T.ToTensor(),
    T.Resize((224,224)),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

In [5]:
%%capture
train_set = datasets.StanfordCars(
    "./stanfordcars/", 
    split="train", 
    transform=simple_transforms, 
    download=True
)
test_set = datasets.StanfordCars(
    "./stanfordcars/", 
    split="test", 
    transform=resize_test,
    download=True
)


train_loader = torch.utils.data.DataLoader(
    train_set, 
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    pin_memory=True
)

test_loader = torch.utils.data.DataLoader(
    test_set, 
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    pin_memory=True
)

### Cutmix Implementation according to their paper, spatial augmentations seemed to work better. Initial training with color and blur seemed to perform worse, therefore we omitted them for a final training.

In [6]:
# cutmix implementation, implemented from pseudo code given by the paper
def CutMix(input, target):
    # get batch size to shuffle index of minibatch, and set a shuffled list
    B = list(range(input.shape[0]))
    shuffled_batch_idx = torch.tensor(random.sample(B, len(B)))

    # alg according to paper
    W = input[0].shape[1]
    H = input[0].shape[2]
    Lambda = torch.rand(1)
    r_x = torch.rand(1) * H
    r_y = torch.rand(1) * W
    r_w = torch.sqrt(1 - Lambda) * H
    r_h = torch.sqrt(1 - Lambda) * W

    x1 = int(torch.clamp((r_x - r_w / 2), min=0, max=W))
    x2 = int(torch.clamp((r_x + r_w / 2), min=0, max=W))
    y1 = int(torch.clamp((r_y - r_h / 2), min=0, max=H))
    y2 = int(torch.clamp((r_y + r_h / 2), min=0, max=H))
    
    # target = Lambda * target + (1 - Lambda) * target[shuffled_batch_idx]
    Lambda = 1 - ((x2-x1) * (y2 - y1) / (W*H))
    input[:, :, y1:y2, x1:x2] = input[shuffled_batch_idx, :, y1:y2, x1:x2]

    return input, target, shuffled_batch_idx, Lambda

In [7]:
def train_epoch(model, train_loader, optimizer, criterion, epoch, device=DEVICE):
    """ Training a model for one epoch """
    
    loss_list = []
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader))
    for i, (images, labels) in progress_bar:
        images = images.to(device)
        labels = labels.to(device)
        
        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()
         
        prob_cutmix = torch.rand(1)
        if prob_cutmix > 0.75:
            # used for cutmix agumentation
            images, labels, shuffled_idx, Lambda = CutMix(images, labels)
            
            # Forward pass to get output/logits
            outputs = model(images)
            
            # Calculate Loss: softmax --> cross entropy loss
            # split loss values according to cutmix paper
            loss = criterion(outputs, labels) * Lambda  + criterion(outputs, labels[shuffled_idx]) * (1 - Lambda)
        else: 

            #compute output as usual
            outputs = model(images)
            loss = criterion(outputs, labels)

        loss_list.append(loss.item())
         
        # Getting gradients w.r.t. parameter
        loss.backward()
         
        # Updating parameters
        optimizer.step()

        progress_bar.set_description(f"Epoch {epoch+1} Iter {i+1}: loss {loss.item():.3f}. ")
        
    
    mean_loss = np.mean(loss_list)
    progress_bar.set_description(f"End Epoch {epoch}: loss {mean_loss:.3f}. ")
    return mean_loss, loss_list


@torch.no_grad()
def eval_model(model, eval_loader, criterion, device=DEVICE):
    """ Evaluating the model for either validation or test """
    correct = 0
    total = 0
    loss_list = []

    ### set up confusion matrix
    conf_matx = np.zeros((NUM_LABELS, NUM_LABELS))
    ### Correct = Accumulator of correctly labeled predictions
    correct = torch.zeros(1).to(device)

    
    for images, labels in eval_loader:
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass only to get logits/output
        outputs = model(images)
                 
        loss = criterion(outputs, labels)
        loss_list.append(loss.item())
            
        # Get predictions from the maximum value
        preds = torch.argmax(outputs, dim=1)
        correct += len( torch.where(preds==labels)[0] )
        total += len(labels)
        conf_matx += confusion_matrix(
            y_true=labels.cpu().numpy(), y_pred=preds.cpu().numpy(), labels=np.arange(0, NUM_LABELS, 1)
            )

                 
    # Total correct predictions and loss
    accuracy = correct / total * 100
    loss = np.mean(loss_list)
    
    return accuracy, loss, conf_matx


def train_model(model, optimizer, scheduler, criterion, train_loader, valid_loader, num_epochs, tboard, device=DEVICE, start_epoch=0):
    """ Training a model for a given number of epochs"""
    
    train_loss = []
    val_loss =  []
    loss_iters = []
    valid_acc = []
    
    for epoch in range(num_epochs):
        
        # setup for confusion matrix
        correct=torch.zeros(1).to(DEVICE)

        # validation epoch
        model.eval()  # important for dropout and batch norms
        accuracy, loss, _ = eval_model(model=model, eval_loader=valid_loader, criterion=criterion, device=device)
        valid_acc.append(accuracy)
        val_loss.append(loss)
        tboard.add_scalar(f'Accuracy/Valid', accuracy, global_step=epoch+start_epoch)
        tboard.add_scalar(f'Loss/Valid', loss, global_step=epoch+start_epoch)
        
        # training epoch
        model.train()  # important for dropout and batch norms
        mean_loss, cur_loss_iters = train_epoch(
                model=model, train_loader=train_loader, optimizer=optimizer,
                criterion=criterion, epoch=epoch, device=device
            )
        scheduler.step()
        train_loss.append(mean_loss)
        tboard.add_scalar(f'Loss/Train', mean_loss, global_step=epoch+start_epoch)

        loss_iters = loss_iters + cur_loss_iters
        
    print(f"Training completed")
    return train_loss, val_loss, loss_iters, valid_acc

### **Experiments:** 
#### **Experiments 1.1:**
 Using your aforementioned augmentions:
 - Fine-tune VGG, ResNet and ConvNext for your augmented dataset for car type classification and compare them.
 - Log your losses and accuracies into Tensorboard (or some other logging tool)
 - **Extra Point**: 
   - Fine-tune a Transformer-based model (e.g. ViT). Compare the performance (accuracy, confusion matrix, training time, loss landscape, ...) with the one from ResNet.

In [None]:
%%capture
# Capture, cause pretrained param has a deprecated warning, but still works
# Loading models
vgg_model = models.vgg16_bn(pretrained=True)

vgg_model.classifier = nn.Linear(7*7*512, NUM_LABELS)
vgg_model.to(DEVICE);

In [None]:
# classification loss function
criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_vgg = torch.optim.Adam(vgg_model.parameters(), lr=3e-4)

# Decay LR by a factor of 0.5 every 5 epochs
scheduler_vgg = torch.optim.lr_scheduler.StepLR(optimizer_vgg, step_size=7, gamma=0.1)

In [None]:
writer_vgg = utils.make_tboard_logs("vgg_16_finetuned")
vgg_train_loss, vgg_val_loss, vgg_loss_iters, vgg_valid_acc = train_model(
        model=vgg_model, optimizer=optimizer_vgg, scheduler=scheduler_vgg, criterion=criterion,
        train_loader=train_loader, valid_loader=test_loader, num_epochs=NUM_EPOCHS, tboard=writer_vgg,
        device=DEVICE
)

Epoch 1 Iter 128: loss 5.560. : 100%|██████████| 128/128 [01:50<00:00,  1.16it/s]
Epoch 2 Iter 128: loss 2.195. : 100%|██████████| 128/128 [01:50<00:00,  1.16it/s]
Epoch 3 Iter 128: loss 3.165. : 100%|██████████| 128/128 [01:50<00:00,  1.15it/s]
Epoch 4 Iter 128: loss 0.609. : 100%|██████████| 128/128 [01:50<00:00,  1.16it/s]
Epoch 5 Iter 128: loss 0.233. : 100%|██████████| 128/128 [01:50<00:00,  1.16it/s]
Epoch 6 Iter 128: loss 0.209. : 100%|██████████| 128/128 [01:50<00:00,  1.16it/s]
Epoch 7 Iter 128: loss 0.281. : 100%|██████████| 128/128 [01:50<00:00,  1.15it/s]
Epoch 8 Iter 128: loss 0.040. : 100%|██████████| 128/128 [01:50<00:00,  1.16it/s]
Epoch 9 Iter 128: loss 3.118. : 100%|██████████| 128/128 [01:50<00:00,  1.16it/s]
Epoch 10 Iter 128: loss 2.412. : 100%|██████████| 128/128 [01:50<00:00,  1.16it/s]
Epoch 11 Iter 128: loss 0.031. : 100%|██████████| 128/128 [01:50<00:00,  1.16it/s]
Epoch 12 Iter 128: loss 0.020. : 100%|██████████| 128/128 [01:51<00:00,  1.15it/s]
Epoch 13 Iter

Training completed





In [None]:
utils.save_model(
    vgg_model, optimizer_vgg, NUM_EPOCHS, (vgg_train_loss, vgg_val_loss, vgg_loss_iters, vgg_valid_acc)
    )

In [None]:
%%capture
resnet18_model = models.resnet18(pretrained=True)

resnet18_model.classifier = nn.Linear(7*7*512, NUM_LABELS)
resnet18_model.to(DEVICE);

In [None]:
# classification loss function
criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_resnet18 = torch.optim.Adam(resnet18_model.parameters(), lr=3e-4)

# Decay LR by a factor of 0.5 every 5 epochs
scheduler_resnet18 = torch.optim.lr_scheduler.StepLR(optimizer_resnet18, step_size=7, gamma=0.1)

In [None]:
writer_resnet18 = utils.make_tboard_logs("resnet_18_finetuned")
resnet18_train_loss, resnet18_val_loss, resnet18_loss_iters, resnet18_valid_acc = train_model(
        model=resnet18_model, optimizer=optimizer_resnet18, scheduler=scheduler_resnet18, criterion=criterion,
        train_loader=train_loader, valid_loader=test_loader, num_epochs=NUM_EPOCHS, tboard=writer_resnet18,
        device=DEVICE
)

Epoch 1 Iter 128: loss 3.236. : 100%|██████████| 128/128 [00:44<00:00,  2.87it/s]
Epoch 2 Iter 128: loss 1.510. : 100%|██████████| 128/128 [00:44<00:00,  2.89it/s]
Epoch 3 Iter 128: loss 4.157. : 100%|██████████| 128/128 [00:44<00:00,  2.90it/s]
Epoch 4 Iter 128: loss 0.586. : 100%|██████████| 128/128 [00:44<00:00,  2.89it/s]
Epoch 5 Iter 128: loss 0.831. : 100%|██████████| 128/128 [00:44<00:00,  2.89it/s]
Epoch 6 Iter 128: loss 1.078. : 100%|██████████| 128/128 [00:44<00:00,  2.87it/s]
Epoch 7 Iter 128: loss 2.375. : 100%|██████████| 128/128 [00:45<00:00,  2.84it/s]
Epoch 8 Iter 128: loss 0.065. : 100%|██████████| 128/128 [00:44<00:00,  2.88it/s]
Epoch 9 Iter 128: loss 0.120. : 100%|██████████| 128/128 [00:44<00:00,  2.86it/s]
Epoch 10 Iter 128: loss 0.366. : 100%|██████████| 128/128 [00:44<00:00,  2.89it/s]
Epoch 11 Iter 128: loss 2.622. : 100%|██████████| 128/128 [00:44<00:00,  2.89it/s]
Epoch 12 Iter 128: loss 0.072. : 100%|██████████| 128/128 [00:44<00:00,  2.86it/s]
Epoch 13 Iter

Training completed





In [None]:
%%capture
convnext_model = models.convnext_tiny(pretrained=True)
convnext_model.classifier[2] = nn.Linear(768, NUM_LABELS)

convnext_model.to(DEVICE);

In [None]:
# classification loss function
criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_convnext = torch.optim.Adam(convnext_model.parameters(), lr=3e-4)

# Decay LR by a factor of 0.5 every 5 epochs
scheduler_convnext = torch.optim.lr_scheduler.StepLR(optimizer_convnext, step_size=7, gamma=0.1)

In [None]:
writer_convnext = utils.make_tboard_logs("convnext_tiny_finetuned")
convnext_train_loss, convnext_val_loss, convnext_loss_iters, convnext_valid_acc = train_model(
        model=convnext_model, optimizer=optimizer_convnext, scheduler=scheduler_convnext, criterion=criterion,
        train_loader=train_loader, valid_loader=test_loader, num_epochs=NUM_EPOCHS, tboard=writer_convnext,
        device=DEVICE
)

Epoch 1 Iter 128: loss 2.061. : 100%|██████████| 128/128 [03:12<00:00,  1.50s/it]
Epoch 2 Iter 128: loss 1.206. : 100%|██████████| 128/128 [02:32<00:00,  1.19s/it]
Epoch 3 Iter 128: loss 0.596. : 100%|██████████| 128/128 [02:32<00:00,  1.19s/it]
Epoch 4 Iter 128: loss 0.131. : 100%|██████████| 128/128 [02:52<00:00,  1.35s/it]
Epoch 5 Iter 128: loss 0.101. : 100%|██████████| 128/128 [02:32<00:00,  1.19s/it]
Epoch 6 Iter 128: loss 0.104. : 100%|██████████| 128/128 [02:32<00:00,  1.19s/it]
Epoch 7 Iter 128: loss 0.073. : 100%|██████████| 128/128 [02:32<00:00,  1.19s/it]
Epoch 8 Iter 128: loss 0.027. : 100%|██████████| 128/128 [02:32<00:00,  1.19s/it]
Epoch 9 Iter 128: loss 0.026. : 100%|██████████| 128/128 [02:32<00:00,  1.19s/it]
Epoch 10 Iter 128: loss 0.010. : 100%|██████████| 128/128 [02:35<00:00,  1.21s/it]
Epoch 11 Iter 128: loss 0.723. : 100%|██████████| 128/128 [02:32<00:00,  1.19s/it]
Epoch 12 Iter 128: loss 0.922. : 100%|██████████| 128/128 [02:32<00:00,  1.19s/it]
Epoch 13 Iter

Training completed





The ConvNeXt model (tiny) perfomed the best (92.43% accuracy) followed by the resnet18 Model (81.69% accuracy) and vgg16 with batch norm had the worst performance (78.14%). It also took the longest of them all (1.35h), vgg16 second longest (1.15h) while resnet was the fastest model (0.5h). \\

We can see that the validation loss of the convnext model is less than the training loss while resnet and vgg slightly overfit. 

The modern ConvNeXt model seems to outperform the older more traditional CNNs by a big margin, because of new transformer based techniques implemented for a RESnet50 network architecture, e.g. a multi-stage designs which includes different feature map resolution etc. . These modern additions give a huge advantage to ConvNeXt with regards to ResNet and VGG. The gap between ResNet and VGG can be explained through the skip layers being added in the ResNet.

The Tensorboard logs visualize these results.

#### **Experiments 1.2:**
 - Compare the following: Fine-Tuned ResNet, ResNet as fixed feature extractor, and ResNet with a Combined Approach
 - Log your losses and accuracies into Tensorboard (or some other logging tool)

In [None]:
# %%capture
# resnet as fixed feature extractor
resnet_18_fixed_model = models.resnet18(pretrained=True)

for params in resnet_18_fixed_model.parameters():
    params.requires_grad_(False)

resnet_18_fixed_model.fc = nn.Linear(512, NUM_LABELS)

resnet_18_fixed_model.to(DEVICE);

In [None]:
# classification loss function
criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_resnet_18_fixed = torch.optim.Adam(resnet_18_fixed_model.parameters(), lr=3e-4)

# Decay LR by a factor of 0.5 every 5 epochs
scheduler_resnet_18_fixed = torch.optim.lr_scheduler.StepLR(optimizer_resnet_18_fixed, step_size=7, gamma=0.1)

writer_resnet_18_fixed = utils.make_tboard_logs("resnet_18_fixed_finetuned")
resnet_18_fixed_train_loss, resnet_18_fixed_val_loss, resnet_18_fixed_loss_iters, resnet_18_fixed_valid_acc = train_model(
        model=resnet_18_fixed_model, optimizer=optimizer_resnet_18_fixed, scheduler=scheduler_resnet_18_fixed, criterion=criterion,
        train_loader=train_loader, valid_loader=test_loader, num_epochs=NUM_EPOCHS, tboard=writer_resnet_18_fixed,
        device=DEVICE
)

Epoch 1 Iter 128: loss 4.930. : 100%|██████████| 128/128 [00:51<00:00,  2.48it/s]
Epoch 2 Iter 128: loss 4.624. : 100%|██████████| 128/128 [00:55<00:00,  2.31it/s]
Epoch 3 Iter 128: loss 4.293. : 100%|██████████| 128/128 [00:58<00:00,  2.20it/s]
Epoch 4 Iter 128: loss 4.072. : 100%|██████████| 128/128 [00:52<00:00,  2.42it/s]
Epoch 5 Iter 128: loss 4.039. : 100%|██████████| 128/128 [00:59<00:00,  2.15it/s]
Epoch 6 Iter 128: loss 3.901. : 100%|██████████| 128/128 [00:55<00:00,  2.32it/s]
Epoch 7 Iter 128: loss 3.439. : 100%|██████████| 128/128 [00:56<00:00,  2.27it/s]
Epoch 8 Iter 128: loss 3.079. : 100%|██████████| 128/128 [00:54<00:00,  2.36it/s]
Epoch 9 Iter 128: loss 3.541. : 100%|██████████| 128/128 [00:55<00:00,  2.29it/s]
Epoch 10 Iter 128: loss 4.632. : 100%|██████████| 128/128 [00:54<00:00,  2.34it/s]
Epoch 11 Iter 128: loss 3.317. : 100%|██████████| 128/128 [00:54<00:00,  2.35it/s]
Epoch 12 Iter 128: loss 3.478. : 100%|██████████| 128/128 [00:55<00:00,  2.30it/s]
Epoch 13 Iter

Training completed





In [None]:
utils.save_model(resnet_18_fixed_model, optimizer_resnet_18_fixed, 25, (resnet_18_fixed_train_loss, resnet_18_fixed_val_loss, resnet_18_fixed_loss_iters, resnet_18_fixed_valid_acc))

In [None]:
def optimizer_to(optim, device):
    for param in optim.state.values():
        # Not sure there are any global tensors in the state dict
        if isinstance(param, torch.Tensor):
            param.data = param.data.to(device)
            if param._grad is not None:
                param._grad.data = param._grad.data.to(device)
        elif isinstance(param, dict):
            for subparam in param.values():
                if isinstance(subparam, torch.Tensor):
                    subparam.data = subparam.data.to(device)
                    if subparam._grad is not None:
                        subparam._grad.data = subparam._grad.data.to(device)

In [None]:
%%capture
# resnet as fixed feature extractor
resnet_18_fixed_model = models.resnet18()
resnet_18_fixed_model.fc = nn.Linear(512, NUM_LABELS)

criterion = nn.CrossEntropyLoss()
optimizer_resnet_18_fixed = torch.optim.Adam(resnet_18_fixed_model.parameters(), lr=3e-4)

resnet_18_fixed_model, optimizer_resnet_18_fixed, _, _ = utils.load_model(
    resnet_18_fixed_model,
    optimizer_resnet_18_fixed,
    "./models/checkpoint_ResNet_epoch_25.pth"
    )

scheduler_resnet_18_fixed = torch.optim.lr_scheduler.StepLR(optimizer_resnet_18_fixed, step_size=7, gamma=0.1)

resnet_18_fixed_model.to(DEVICE);
optimizer_to(optimizer_resnet_18_fixed, DEVICE)

In [None]:
writer_resnet_18_fixed = utils.make_tboard_logs("resnet_18_joint_finetuned")
resnet_18_fixed_train_loss, resnet_18_fixed_val_loss, resnet_18_fixed_loss_iters, resnet_18_fixed_valid_acc = train_model(
        model=resnet_18_fixed_model, optimizer=optimizer_resnet_18_fixed, scheduler=scheduler_resnet_18_fixed, criterion=criterion,
        train_loader=train_loader, valid_loader=test_loader, num_epochs=NUM_EPOCHS, tboard=writer_resnet_18_fixed,
        device=DEVICE
)

Epoch 1 Iter 128: loss 1.712. : 100%|██████████| 128/128 [01:05<00:00,  1.97it/s]
Epoch 2 Iter 128: loss 0.985. : 100%|██████████| 128/128 [01:06<00:00,  1.92it/s]
Epoch 3 Iter 128: loss 0.656. : 100%|██████████| 128/128 [01:06<00:00,  1.91it/s]
Epoch 4 Iter 128: loss 0.711. : 100%|██████████| 128/128 [01:08<00:00,  1.86it/s]
Epoch 5 Iter 128: loss 0.356. : 100%|██████████| 128/128 [01:07<00:00,  1.89it/s]
Epoch 6 Iter 128: loss 0.679. : 100%|██████████| 128/128 [01:08<00:00,  1.88it/s]
Epoch 7 Iter 128: loss 0.112. : 100%|██████████| 128/128 [01:06<00:00,  1.92it/s]
Epoch 8 Iter 128: loss 0.090. : 100%|██████████| 128/128 [01:07<00:00,  1.89it/s]
Epoch 9 Iter 128: loss 0.139. : 100%|██████████| 128/128 [01:10<00:00,  1.82it/s]
Epoch 10 Iter 128: loss 2.328. : 100%|██████████| 128/128 [01:06<00:00,  1.93it/s]
Epoch 11 Iter 128: loss 0.077. : 100%|██████████| 128/128 [01:07<00:00,  1.90it/s]
Epoch 12 Iter 128: loss 0.115. : 100%|██████████| 128/128 [01:06<00:00,  1.92it/s]
Epoch 13 Iter

Training completed


By the results the finetuned was the best (81.69% accuracy). The classifier only approach only has an accuracy of 26%. The joint probability only gets an accuracy of 79.99% therefore still performing worse than the fine tuned resnet18.

## Transformer Training started: MaxVit transformer

In [None]:
%%capture
maxvit_model = models.maxvit_t(weights=models.MaxVit_T_Weights.DEFAULT)

maxvit_model.classifier[5] = nn.Linear(512, NUM_LABELS)

maxvit_model.to(DEVICE)

In [None]:
# classification loss function
criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_maxvit = torch.optim.Adam(maxvit_model.parameters(), lr=3e-4)

# Decay LR by a factor of 0.5 every 5 epochs
scheduler_maxvit = torch.optim.lr_scheduler.StepLR(optimizer_maxvit, step_size=7, gamma=0.1)

writer_maxvit = utils.make_tboard_logs("maxvit_tiny_finetuned")
maxvit_train_loss, maxvit_val_loss, maxvit_loss_iters, maxvit_valid_acc = train_model(
        model=maxvit_model, optimizer=optimizer_maxvit, scheduler=scheduler_maxvit, criterion=criterion,
        train_loader=train_loader, valid_loader=test_loader, num_epochs=NUM_EPOCHS, tboard=writer_maxvit,
        device=DEVICE
)

Epoch 1 Iter 340: loss 2.659. : 100%|██████████| 340/340 [02:07<00:00,  2.67it/s]
Epoch 2 Iter 340: loss 3.281. : 100%|██████████| 340/340 [02:07<00:00,  2.66it/s]
Epoch 3 Iter 340: loss 0.702. : 100%|██████████| 340/340 [02:08<00:00,  2.64it/s]
Epoch 4 Iter 340: loss 0.208. : 100%|██████████| 340/340 [02:09<00:00,  2.64it/s]
Epoch 5 Iter 340: loss 3.734. : 100%|██████████| 340/340 [02:07<00:00,  2.66it/s]
Epoch 6 Iter 340: loss 3.342. : 100%|██████████| 340/340 [02:07<00:00,  2.67it/s]
Epoch 7 Iter 340: loss 0.496. : 100%|██████████| 340/340 [02:08<00:00,  2.64it/s]
Epoch 8 Iter 340: loss 2.508. : 100%|██████████| 340/340 [02:08<00:00,  2.65it/s]
Epoch 9 Iter 340: loss 0.032. : 100%|██████████| 340/340 [02:07<00:00,  2.66it/s]
Epoch 10 Iter 340: loss 2.534. : 100%|██████████| 340/340 [02:08<00:00,  2.66it/s]
Epoch 11 Iter 340: loss 0.030. : 100%|██████████| 340/340 [02:08<00:00,  2.64it/s]
Epoch 12 Iter 340: loss 0.043. : 100%|██████████| 340/340 [02:07<00:00,  2.66it/s]
Epoch 13 Iter

Training completed





With a final accuracy of 91.26% the MaxVit Transformer based model (default size) still loses to the ConvNeXt (tiny) with 92.43% accuray. We went with MaxVit Transformer model because it had one of the highest performances for transformer based models on benchmarking sites. We thought the transformer could have been a better performer but we were wrong and ConvNeXt still beat the transformer.




#### **Experiment 2:**
Try to get the best performance possible on this dataset
 - Fine-tune a pretrained neural network of your choice for classification.
 - Select a good training recipe: augmentations, optimizer, learning rate scheduling, classifier, loss function, ...


Because we saw that ConvNeXt is doing the best we try a bigger ConvNeXT Model and higher learning rate decay because it converged pretty fast.

In [None]:
%%capture
convnext_model = models.convnext_base(pretrained=True)
convnext_model.classifier[2] = nn.Linear(1024, NUM_LABELS)

convnext_model.to(DEVICE);

In [None]:
# classification loss function
criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_convnext = torch.optim.Adam(convnext_model.parameters(), lr=3e-4)

# Decay LR by a factor of 0.5 every 5 epochs
scheduler_convnext = torch.optim.lr_scheduler.StepLR(optimizer_convnext, step_size=5, gamma=0.2)

In [None]:
writer_convnext = utils.make_tboard_logs("convnext_base_finetuned")
convnext_train_loss, convnext_val_loss, convnext_loss_iters, convnext_valid_acc = train_model(
        model=convnext_model, optimizer=optimizer_convnext, scheduler=scheduler_convnext, criterion=criterion,
        train_loader=train_loader, valid_loader=test_loader, num_epochs=NUM_EPOCHS, tboard=writer_convnext,
        device=DEVICE
)

Epoch 1 Iter 102: loss 2.597. : 100%|██████████| 102/102 [02:13<00:00,  1.31s/it]
Epoch 2 Iter 102: loss 0.741. : 100%|██████████| 102/102 [02:14<00:00,  1.32s/it]
Epoch 3 Iter 102: loss 0.440. : 100%|██████████| 102/102 [02:25<00:00,  1.42s/it]
Epoch 4 Iter 102: loss 0.223. : 100%|██████████| 102/102 [02:14<00:00,  1.32s/it]
Epoch 5 Iter 102: loss 0.251. : 100%|██████████| 102/102 [02:15<00:00,  1.33s/it]
Epoch 6 Iter 102: loss 0.047. : 100%|██████████| 102/102 [02:14<00:00,  1.32s/it]
Epoch 7 Iter 102: loss 1.827. : 100%|██████████| 102/102 [02:15<00:00,  1.32s/it]
Epoch 8 Iter 102: loss 0.055. : 100%|██████████| 102/102 [02:15<00:00,  1.33s/it]
Epoch 9 Iter 102: loss 1.640. : 100%|██████████| 102/102 [02:15<00:00,  1.33s/it]
Epoch 10 Iter 102: loss 0.036. : 100%|██████████| 102/102 [02:14<00:00,  1.32s/it]
Epoch 11 Iter 102: loss 0.076. : 100%|██████████| 102/102 [02:15<00:00,  1.32s/it]
Epoch 12 Iter 102: loss 2.137. : 100%|██████████| 102/102 [02:15<00:00,  1.33s/it]
Epoch 13 Iter

Training completed





In [14]:
convnext_tiny = models.convnext_tiny()
tiny_params = utils.count_parameters(convnext_tiny)

convnext_base = models.convnext_base()
base_params = utils.count_parameters(convnext_base)

print("Tiny ConvNeXt params:", tiny_params)
print("Base ConvNeXt params:",base_params)
print("Ratio base to tiny:", base_params/tiny_params)

Tiny ConvNeXt params: 28589128
Base ConvNeXt params: 88591464
Ratio base to tiny: 3.098781606770238


From the Tensorboard we can see that the base model of ConvNeXt is 0.83% better. With regard to the parameter amount this is the trade off for a little bit more accuracy while having a model that is 3 times the size. 