# ResNet-50

Hyperparameter tuning was done manually. Hyperparameters were not searched extensively because of GPU limits, which I ran into quite often. The starting point for the hyperparameters was taken from http://cs230.stanford.edu/projects_winter_2020/reports/32610274.pdf, but a different optimizer and parametrization were used in the end.

*NOTE:* The model was trained on Colab, but the notebook was refactored to use the current project structure.

Verify GPU type

In [2]:
# Shell escape: ask nvidia-smi for the GPU name, driver version and total memory as CSV.
# The nvidia-smi executable must be available on PATH for this to work.
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv

'nvidia-smi' is not recognized as an internal or external command,
operable program or batch file.


## Preprocess and load data

In [11]:
import datetime
import io
import os

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from torch.optim.lr_scheduler import ReduceLROnPlateau

from torchvision import transforms
from PIL import Image
from sklearn.model_selection import train_test_split

import torchvision.models as models
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


from src.data.preprocess_data import DatasetManager

# S3 bucket
import boto3
from dotenv import dotenv_values

Define constants to be used later. *LOG_PATH* should be the root path of the project and *MODEL_PATH* the path for models.

In [12]:
# Emotion class labels; the classifier's output dimension is derived from
# the length of this list.
EMOTION_LIST = [
    'Angry',
    'Disgust',
    'Fear',
    'Happy',
    'Sad',
    'Surprise',
    'Neutral',
]
N_FEATURES = len(EMOTION_LIST)

# Output locations: LOG_PATH prefixes both the TensorBoard run directory and
# the checkpoint directory; MODEL_PATH is appended to it for checkpoints.
LOG_PATH = "/logs"
MODEL_PATH = 'models/resnet50'

# Training settings
BATCH_SIZE = 128   # samples per batch handed to the data loaders
VAL_SIZE = 0.2     # validation split fraction
N_EPOCHS = 100     # upper bound on training epochs
INPUT_SIZE = 224   # size passed to transforms.Resize

In [13]:
# Training pipeline: convert to a tensor, resize to INPUT_SIZE, apply random
# augmentation (affine jitter of up to 10 degrees / 20% translation and a
# random horizontal flip), then normalize per channel. The mean/std values
# match the ImageNet channel statistics used with torchvision's pretrained
# models.
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize(INPUT_SIZE),
    transforms.RandomAffine(degrees=10, translate=(0.2, 0.2)),
    transforms.RandomHorizontalFlip(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Evaluation pipeline: same tensor conversion, resize and normalization, but
# without any random augmentation.
test_preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize(INPUT_SIZE),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# The validation fraction comes from the VAL_SIZE constant so the split is
# configured in one place instead of being repeated as a literal here.
dataset_manager = DatasetManager(
    batch_size=BATCH_SIZE,
    test_size=0.2,
    validation_size=VAL_SIZE,
    transform=preprocess,
    test_transform=test_preprocess,
)

# NOTE(review): the unpacking order assumes load_dataloaders() returns
# (train, test, validation) -- confirm against DatasetManager.
train_loader, test_loader, val_loader = dataset_manager.load_dataloaders()

## Load model

Load model

In [5]:
# Build torchvision's ResNet-50; pretrained=True requests pretrained weights
# instead of a random initialization.
resnet = models.resnet50(pretrained=True)

Freeze all layers except the last five Bottleneck blocks.

In [6]:
# Freeze every parameter of the backbone first.
for param in resnet.parameters():
    param.requires_grad = False

# Re-enable gradients for the last five Bottleneck blocks. In torchvision's
# ResNet-50, layer3 has six blocks and layer4 has three, so the last five are
# layer3[4:] (two blocks) plus all of layer4 (three blocks). Unfreezing only
# layer3[4:] would leave the final stage frozen.
for stage in (resnet.layer3[4:], resnet.layer4):
    for param in stage.parameters():
        param.requires_grad = True

Choose device and initialize logging

In [7]:
# Create logging, initialize device
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Run tag formatted as day-month-year hour.minute. Minutes are "%M"; "%m" is
# the month, which would give every run started within the same hour the
# same tag and therefore the same log directory.
timestamp = datetime.datetime.now().strftime("%d-%m-%Y %H.%M")
# TensorBoard event files for this run go under LOG_PATH/runs/.
writer = SummaryWriter(f'{LOG_PATH}/runs/ResNet-50-{timestamp}')

Replace the fully-connected layer to suit our problem. Use heavy Dropout to counteract overfitting. Use *N_FEATURES* as the output dimension.

In [8]:
# Change layers to suit our problem
# Input width of the original classification layer, reused as the input width
# of the new head.
num_ftrs = resnet.fc.in_features
# Width of the hidden layer. Both Linear layers must agree on it: the first
# layer's out_features has to equal the second layer's in_features, otherwise
# the forward pass fails with a shape mismatch.
# NOTE(review): 1024 is taken from the final layer's in_features; switch to
# 2048 if that was the intended hidden width.
hidden_dim = 1024
resnet.fc = nn.Sequential(
    nn.Linear(num_ftrs, hidden_dim),
    nn.Dropout(0.5),
    nn.ReLU(),
    nn.Linear(hidden_dim, N_FEATURES),  # one output per emotion class
)

Define the optimizer and a scheduler that reduces the learning rate on plateau. The data is highly imbalanced, so we calculate proportional weights for each class to give every class equal importance.

In [None]:
# Class-weighted cross-entropy: the weights come from the dataset manager and
# are placed on the same device as the model.
class_weights = torch.FloatTensor(
    dataset_manager.calculate_class_weights()
).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Adam with its default settings over all model parameters, plus a scheduler
# that reduces the learning rate once the monitored metric has stopped
# improving for more than `patience` epochs.
optim = torch.optim.Adam(resnet.parameters())
scheduler = ReduceLROnPlateau(optim, patience=4)

# Move the model to the selected device (kept as the cell's last expression).
resnet.to(device)

Start Tensorboard to monitor convergence. **NOTE**: you need to supply your own log directory.

In [None]:
# Start tensorboard
# Load the TensorBoard notebook extension, then launch it on a log directory.
# NOTE(review): the --logdir value is the name of one specific run; replace it
# with the directory of the run you want to inspect.
%load_ext tensorboard
%tensorboard --logdir 'ResNet-50-09-06-2021 13.06'

## Train model

Early stopping was used with a patience of 5 epochs without improvement, but it monitored the training loss. Therefore the best model was handpicked based on Tensorboard metrics, before the training loss and validation loss started diverging significantly. All models that showed an improvement were saved.

In [13]:
import timeit

# Training driver: for each epoch, run one pass over train_loader, evaluate on
# val_loader, log metrics to TensorBoard, checkpoint on improvement and stop
# early after n_epochs_stop epochs without improvement.

early_stop = False        # set to True when the loop ends through early stopping
epochs_no_improve = 0     # consecutive epochs without a new best loss
n_epochs_stop = 5         # patience for early stopping
min_loss = np.inf         # best (lowest) monitored loss seen so far

# Progress is printed every `log_every` training iterations. The interval is
# derived from BATCH_SIZE and clamped to at least 1 so that a batch size
# below 15 cannot cause a modulo-by-zero.
log_every = max(1, BATCH_SIZE // 15)

# Make sure the checkpoint directory exists before training starts, so a
# missing or unwritable directory fails here rather than after a full epoch.
checkpoint_dir = f'{LOG_PATH}/{MODEL_PATH}'
os.makedirs(checkpoint_dir, exist_ok=True)

for n_epoch in range(N_EPOCHS):
    loss_history = []
    acc_history = []

    # Training step
    resnet.train()
    for i, batch in enumerate(train_loader):
        optim.zero_grad()
        X, y = batch[0].to(device), batch[1].to(device)

        outputs = resnet(X)
        y = y.long()
        loss = criterion(outputs, y)
        loss.backward()
        optim.step()

        # Detach so the stored value does not keep the autograd graph alive.
        loss_history.append(loss.detach())

        # Give some real-time indication for the user how the model is doing.
        # Training accuracy is only sampled on these logged iterations.
        if i % log_every == 0:
            inds = torch.argmax(outputs, dim=1)
            acc = (inds == y).sum() / len(y)
            acc_history.append(acc)
            print(f"iteration: {i}, loss: {sum(loss_history) / len(loss_history)}, acc: {acc} ")

    # Epoch-level training metrics as plain Python floats, so they can be
    # logged and stored in the checkpoint without holding device tensors.
    mu_loss_train = float(sum(loss_history) / len(loss_history))
    batch_acc_train = float(sum(acc_history) / len(acc_history))
    print(f"TRAINING: epoch: {n_epoch}, loss: {mu_loss_train}, acc: {batch_acc_train}")

    # Validation step
    val_loss_history = []
    val_acc_history = []
    predictions = []
    errors = []
    resnet.eval()
    with torch.no_grad():
        for batch in val_loader:
            X, y = batch[0].to(device), batch[1].to(device)

            outputs = resnet(X)
            y = y.long()
            loss = criterion(outputs, y)

            # Calculate accuracy
            inds = torch.argmax(outputs, dim=1)
            acc = (inds == y).sum() / len(y)
            # True labels of the misclassified samples, for tensorboard
            error_labels = y[inds != y].cpu()

            errors.append(error_labels)
            predictions.append(inds.cpu())
            val_acc_history.append(acc)
            val_loss_history.append(loss.item())

    # Update metrics (unweighted mean over validation batches)
    mu_loss_val = sum(val_loss_history) / len(val_loss_history)
    batch_acc_val = float(sum(val_acc_history) / len(val_acc_history))
    # Update scheduler: the learning rate is reduced based on validation loss
    scheduler.step(mu_loss_val)
    # Concatenate the per-batch 1-D tensors. Unlike stacking individual
    # elements, this also works when a batch (or every batch) has no errors.
    errors = torch.cat(errors)
    predictions = torch.cat(predictions)

    # Write metrics to logs
    writer.add_scalars("Loss", {'train': mu_loss_train,
                                'val': mu_loss_val}, n_epoch)
    writer.add_scalars("Accuracy", {'train': batch_acc_train,
                                    'val': batch_acc_val}, n_epoch)
    # Skip the error histogram when there were no misclassifications at all.
    if errors.numel() > 0:
        writer.add_histogram("Validation prediction error distribution", errors, n_epoch)
    writer.add_histogram("Validation prediction distribution", predictions, n_epoch)
    writer.flush()

    print(f"VALIDATION: epoch: {n_epoch}, loss: {mu_loss_val}, acc: {batch_acc_val}")

    # Early stop if loss doesn't improve.
    # NOTE(review): improvement is judged on the *training* loss, so a
    # checkpoint is written whenever training loss reaches a new minimum,
    # regardless of validation loss. Consider monitoring mu_loss_val instead.
    if mu_loss_train < min_loss:
        # Save model
        torch.save({
            'epoch': n_epoch,
            'model_state_dict': resnet.state_dict(),
            'optimizer_state_dict': optim.state_dict(),
            'loss': mu_loss_train,
        }, f'{checkpoint_dir}/resnet50-batch_{BATCH_SIZE}_train_loss_{mu_loss_train}_val_{mu_loss_val}.pt')
        min_loss = mu_loss_train
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve == n_epochs_stop:
            early_stop = True
            print(f"Early stop at epoch {n_epoch}!")
            break

iteration: 0, loss: 1.9489587545394897, acc: 0.1640625 
iteration: 8, loss: 2.145118236541748, acc: 0.109375 
iteration: 16, loss: 2.0745270252227783, acc: 0.1796875 
iteration: 24, loss: 2.042987108230591, acc: 0.171875 
iteration: 32, loss: 2.0059409141540527, acc: 0.15625 
iteration: 40, loss: 1.9749537706375122, acc: 0.1484375 
iteration: 48, loss: 1.9587393999099731, acc: 0.203125 
iteration: 56, loss: 1.9365102052688599, acc: 0.296875 
iteration: 64, loss: 1.9178107976913452, acc: 0.2109375 
iteration: 72, loss: 1.9034291505813599, acc: 0.1875 
iteration: 80, loss: 1.8984614610671997, acc: 0.21875 
iteration: 88, loss: 1.8905425071716309, acc: 0.28125 
iteration: 96, loss: 1.882032036781311, acc: 0.2890625 
iteration: 104, loss: 1.8758114576339722, acc: 0.1484375 
iteration: 112, loss: 1.8670763969421387, acc: 0.3671875 
iteration: 120, loss: 1.8614256381988525, acc: 0.21875 
TRAINING: epoch: 0, loss: 1.859533667564392, acc: 0.20947265625
VALIDATION: epoch: 0, loss: 1.84391507878