# Import Dependencies

In [1]:
%%capture
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim
import torchvision

import albumentations as A
from albumentations.pytorch import ToTensorV2

import numpy as np
import pandas as pd
import json
import cv2

import os
import math
import copy
import random

!pip install livelossplot
import livelossplot

!pip install timm
import timm

import pytorch_lightning as pl
from collections import Counter 
import fastai.vision.all as fastai 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
# Import Ranger Optimizer
%cd ..
!git clone https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer 
%cd Ranger-Deep-Learning-Optimizer
!pip install -e .
%cd ..
%cd working
import sys
sys.path.append("../Ranger-Deep-Learning-Optimizer")
from ranger import Ranger
import warnings
warnings.filterwarnings("ignore")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

For Reproducibility:

In [2]:
def seed():
    seed = 42
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    numpy.random.seed(worker_seed)
    random.seed(worker_seed)
seed()

Process the Dataset

In [3]:
def process_ids(df):
    '''
    Hotel Ids range from 0 -> 60000, but there are only 7000 unique values.
    Thus, we should map the classes down to 7700 instead of 60000 and remap on inference.
    '''
    orig_index_values = df.hotel_id.values
    unique = sorted(list(set(orig_index_values)))
    orig2cls = {unique[i]: i for i in range(len(unique))}
    cls2orig = {i: unique[i] for i in range(len(unique))}
    return orig2cls, cls2orig

In [4]:
def get_folds(train_df, NUM_FOLDS):
    # Split into KFolds
    KFOLDS = []
    splitter = KFold(n_splits = 300, shuffle = True, random_state = 42)
    count = 0
    for train, test in splitter.split(train_df):
        train_split = train_df.iloc[train]
        test_split = train_df.iloc[test]
        KFOLDS += [(train_split, test_split)]
        count += 1
        if count == NUM_FOLDS:
            break
    return KFOLDS
def get_albumentations(IMAGE_SIZE):
    '''
    Loads Augmentations
    '''
    train_transforms = A.Compose([
        A.RandomResizedCrop(IMAGE_SIZE, IMAGE_SIZE, scale = (0.9, 0.9), p = 1),
        A.HorizontalFlip(p = 0.7),
        A.OneOf([
            A.Blur(), # Either Noise or Blur
            A.MultiplicativeNoise(),
        ], p=0.7),
        A.OpticalDistortion(distort_limit=1.0, p = 0.7),
        #A.CLAHE(clip_limit=4.0, p=0.7),
        A.ColorJitter(p = 0.7, brightness = 0.1, contrast = 0.1, hue = 0.1, saturation = 0.1),
        #A.OneOf([
        #    A.ImageCompression(),
        #    A.Downscale(scale_min=0.7, scale_max=0.95),
        #], p=0.2),
        #A.CoarseDropout(max_holes=8, max_height=int(IMAGE_SIZE * 0.05),
        #                   max_width=int(IMAGE_SIZE* 0.05), p=0.5),
        A.Cutout(num_holes = 32, max_h_size= 8, max_w_size =8),
        A.ShiftScaleRotate(shift_limit=0.2, scale_limit=0.2, rotate_limit=15, border_mode=0, p=0.85),
        A.Normalize(),
        ToTensorV2()
    ])

    test_transforms = A.Compose([
        A.Resize(IMAGE_SIZE, IMAGE_SIZE),
        A.Normalize(),
        ToTensorV2()
    ])
    
    return train_transforms, test_transforms

Dataset and Dataloader

In [5]:
def display_image_np(image):
    plt.imshow(image)
    plt.show()
def display_image_pt(image):
    plt.imshow(image.transpose(0,1).transpose(1, 2))
    plt.show()

In [6]:
class HotelDataset(torch.utils.data.Dataset):
    def __init__(self, df, orig2cls, cls2orig, image_size, transforms):
        self.df = df
        self.image_size = image_size
        self.indices = self.df.index.values
        self.orig2cls = orig2cls
        self.cls2orig = cls2orig
        self.transforms = transforms
        
    def decode_pred(self, idx):
        return self.cls2orig[idx]
    def encode_pred(self, idx):
        return self.orig2cls[idx]
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        GT = self.encode_pred(row['hotel_id'])
        file_path = Config.train_base_path + f"{row['chain']}/" + row['image']
        #print(file_path)
        # Load in Image
        image = cv2.imread(file_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        image = self.transforms(image = image)['image']
        return image, GT

In [7]:
def split_df(df, num):
    '''
    Splits up DataFrame for Overfitting Testing.
    '''
    overfit, _ = train_test_split(df, train_size = num / len(df), test_size = ((len(df) - num) / len(df)), random_state = 42)
    return overfit
class Config:
    '''
    Holds many states for the model.
    '''
    overfit_samples = -1
    train_path = "../input/hotel-id-2021-fgvc8/train.csv"
    train_base_path = '../input/hotel-id-2021-fgvc8/train_images/'
    
    train_df = pd.read_csv(train_path)
    orig2cls, cls2orig = process_ids(train_df) 
    NUM_CLASSES = len(orig2cls)
    # Overfit Split 
    if overfit_samples != -1:
        train_df, _ = train_test_split(train_df, train_size = overfit_samples, test_size = 1 - overfit_samples, random_state = 42)
    NUM_FOLDS = 1
    FOLDS = get_folds(train_df, NUM_FOLDS)
    
    IMAGE_SIZE = 320
    train_transforms, test_transforms = get_albumentations(IMAGE_SIZE)
    
    BATCH_SIZE = 32
    TEST_BATCH_SIZE = 64
    
    device = device
    
    # Training States
    NUM_EPOCHS = 6# 90000 images is a lot.


In [8]:
class LightningDataModule(pl.LightningDataModule):
    def __init__(self, KSPLITS):
        self.KSPLITS = KSPLITS
    def train_dataloader(self, idx):
        split = self.KSPLITS[idx][0]
        train_dataset = HotelDataset(split, Config.orig2cls, Config.cls2orig, Config.IMAGE_SIZE, Config.train_transforms)
        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size = Config.BATCH_SIZE, shuffle = True, num_workers = 4, worker_init_fn = seed_worker, pin_memory = True)
        return train_dataloader
    def val_dataloader(self, idx):
        split = self.KSPLITS[idx][1]
        val_dataset = HotelDataset(split, Config.orig2cls, Config.cls2orig, Config.IMAGE_SIZE, Config.test_transforms)
        val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size = Config.TEST_BATCH_SIZE, shuffle = False, worker_init_fn = seed_worker, num_workers = 4, pin_memory = True)
        return val_dataloader
    def get_both(self, idx):
        return self.train_dataloader(idx), self.val_dataloader(idx)
dataModule = LightningDataModule(Config.FOLDS)

# BaseLine Model, Modified ResNet using Fast AI.

Fast AI vs Vanilla Pytorch

Vanilla Pytorch

In [9]:
class ConvBlockVanilla(pl.LightningModule):
    '''
    Basic ConvBlock with BN(Removed Later) 
    '''
    def __init__(self, in_features, out_features, kernel_size, padding, groups, stride, act = 'relu'):
        super().__init__()
        self.conv = nn.Conv2d(in_features, out_features, kernel_size = kernel_size, padding = padding, groups = groups, stride = stride, bias = False)
        self.bn = nn.BatchNorm2d(out_features)
        if act == 'relu':
            self.act1 = nn.ReLU(inplace = True)
        else:
            self.act1 = nn.SiLU(inplace = True)
    def forward(self, x):
        return self.bn(self.act1(self.conv(x)))

In [10]:
# Squeeze Attend Blocks(Traditionally Used in Semantic Segmentation, but I find it can work really well in classification too)
class ConvPlusBatchNorm(pl.LightningModule):
    def __init__(self, in_features, out_features, kernel_size, padding, groups, stride):
        super().__init__()
        self.conv = nn.Conv2d(in_features, out_features, kernel_size = kernel_size, padding = padding, groups = groups, stride = stride, bias = False)
        self.bn = nn.BatchNorm2d(out_features)
class CBAMSqueezeAttend(pl.LightningModule):
    def __init__(self, in_features, inner_features, squeeze_factor = 4, act = 'relu'):
        super().__init__()
        self.in_features = in_features
        self.inner_features = inner_features
        self.squeeze_factor = squeeze_factor
        
        self.avg_pool = nn.AvgPool2d(kernel_size = 5, padding = 2, stride = self.squeeze_factor)
        self.max_pool = nn.MaxPool2d(kernel_size = 5, padding = 2, stride = self.squeeze_factor)
        
        self.Squeeze = ConvBlockVanilla(self.in_features, self.inner_features, 3, 1, 1, 1, act = act)
        self.Excite = ConvPlusBatchNorm(self.inner_features, self.in_features, 3, 1, 1, 1)
        
        self.gamma = nn.Parameter(torch.zeros((1), device = self.device))
    def forward(self, x):
        '''
        Squeeze Attend Blocks
        '''
        avg_pool = self.avg_pool(x)
        max_pool = self.max_pool(x)
        
        squeeze_avg = self.Squeeze(avg_pool)
        squeeze_max = self.Squeeze(max_pool)
        
        excite_avg = self.Excite(squeeze_avg)
        excite_max = self.Excite(squeeze_max)
        
        excite = torch.sigmoid((excite_avg + excite_max) / 2)
        # Interpolate Up
        excite = F.interpolate(excite, scale_factor = self.squeeze_factor, mode = 'nearest')
        return excite * x * self.gamma + (1 - self.gamma) * x
    
class SESqueezeAttend(pl.LightningModule):
    def __init__(self, in_features, inner_features, squeeze_factor = 4, act = 'relu'):
        super().__init__()
        self.in_features = in_features
        self.inner_features = inner_features
        self.squeeze_factor = squeeze_factor 
        
        self.avg_pool = nn.AvgPool2d(kernel_size = 5, padding = 2, stride = self.squeeze_factor) 
        
        self.Squeeze = ConvBlockVanilla(self.in_features, self.inner_features, 3, 1, 1, 1, act = act)
        self.Excite = ConvPlusBatchNorm(self.inner_features, self.in_features, 3, 1, 1, 1)
        
        self.gamma = nn.Parameter(torch.zeros((1), device = self.device))
    def forward(self, x):
        pooled = self.avg_pool(x)
        squeeze = self.Squeeze(pooled) 
        excite = torch.sigmoid(self.Excite(squeeze))
        # Upsample
        excite = F.interpolate(excite, scale_factor = self.squeeze_factor, mode = 'nearest')
        return self.gamma * excite * x + (1 - self.gamma) * x

class CBAMVanilla(pl.LightningModule):
    '''
    Uses CBAM channel only, CBAM Spatial often messes up the features too much.
    '''
    def __init__(self, in_features, inner_features, dev):
        super().__init__()
        self.in_features = in_features 
        self.inner_features = inner_features
        self.dev = dev 
        
        self.Squeeze = nn.Linear(self.in_features, self.inner_features)
        self.act1 = nn.SiLU(inplace = True)
        self.Excite = nn.Linear(self.inner_features, self.in_features)
        
        self.gamma = nn.Parameter(torch.zeros((1), device = self.dev))
    def forward(self, x):
        avg_pool = torch.mean(x, dim = -1)
        avg_pool = torch.mean(avg_pool, dim = -1)
        
        max_pool, _ = torch.max(x, dim = -1)
        max_pool, _ = torch.max(max_pool, dim = -1)
        
        squeeze_avg = self.act1(self.Squeeze(avg_pool))
        squeeze_max = self.act1(self.Squeeze(max_pool))
        
        excite_avg = self.Excite(squeeze_avg) 
        excite_max = self.Excite(squeeze_max)
        
        excite = torch.sigmoid((excite_avg + excite_max) / 2).unsqueeze(-1).unsqueeze(-1)
        return (excite * x) * self.gamma + (1 - self.gamma) * x # 1 - Gamma to assure magnitude of vectors remain around the same and we don't inflate outputs
        # gamma thresholds how much `excited` features, and how much `normal` to keep
class SEVanilla(pl.LightningModule):
    def __init__(self, in_features, inner_features, dev):
        super().__init__()
        self.dev = dev
        self.in_features = in_features
        self.inner_features = inner_features
        
        self.Squeeze = nn.Linear(self.in_features, self.inner_features) 
        self.act1 = nn.SiLU(inplace = True)
        self.Excite = nn.Linear(self.inner_features, self.in_features)
        
        self.gamma = nn.Parameter(torch.zeros((1), device = self.dev))
    def forward(self, x):
        avg_pool = torch.mean(x, dim = -1)
        avg_pool = torch.mean(avg_pool, dim = -1)
        
        squeeze = self.act1(self.Squeeze(avg_pool))
        excite = torch.sigmoid(self.Excite(squeeze)).unsqueeze(-1).unsqueeze(-1)
        return (excite * x) * self.gamma + (1 - self.gamma) * x

class SelfAttentionVanilla(pl.LightningModule):
    def __init__(self, in_features, inner_features, num_heads, dev):
        super().__init__()
        self.in_features = in_features
        self.inner_features = inner_features
        self.num_heads = num_heads
        self.dev = dev
        
        self.K = ConvPlusBatchNorm(self.in_features, self.inner_features * self.num_heads, 3, 1, 1, 1)
        self.V = ConvPlusBatchNorm(self.in_features, self.inner_features * self.num_heads, 3, 1, 1, 1)
        self.Q = ConvPlusBatchNorm(self.in_features, self.inner_features * self.num_heads, 3, 1, 1, 1)
        
        self.Linear = ConvPlusBatchNorm(self.inner_features * self.num_heads, self.in_features, 3, 1, 1,1)
        self.gamma = nn.Parameter(torch.zeros((1), dvice = self.dev))
    def forward(self, x):
        B, C, H, W = x.shape
        Keys = self.K(x)
        Values = self.V(x)
        Queries = self.Q(x)
        
        Keys = Keys.reshape(B, self.num_heads, self.inner_features, H, W)
        Values = Values.reshape(B, self.num_heads, self.inner_features, H, W)
        Queries = Queries.reshape(B, self.num_heads, self.inner_features, H, W)
        
        Keys = Keys.reshape(B * self.num_heads, self.inner_features, H * W)
        Values = Values.reshape(B * self.num_heads, self.inner_features, H * W)
        Queries = Queries.reshape(B * self.num_heads, self.inner_features, H * W)
        
        att_mat = F.softmax(torch.bmm(Keys.transpose(1, 2), Queries) / math.sqrt(self.inner_features))
        att_scores = torch.bmm(Values, att_mat)
        
        att_scores = att_scores.reshape(B, self.num_heads, self.inner_features, H, W)
        att_scores = att_scores.reshape(B, self.num_heads * self.inner_features, H * W)
        
        linear = self.Linear(att_scores)
        return linear * self.gamma + (1 - self.gamma) * x
class AttentionVanilla(pl.LightningModule):
    def __init__(self, in_features, inner_features, dev, attention_type = 'se', act = 'relu'):
        super().__init__()
        assert attention_type in ['se', 'cbam', 'self', 'cbam_attend', 'se_attend']
        self.attention_type = attention_type
        if self.attention_type == 'se':
            self.layer = SEVanilla(in_features, inner_features, dev)
        elif self.attention_type == 'cbam':
            self.layer = CBAMVanilla(in_features, inner_features, dev)
        elif self.attention_type == 'cbam_attend':
            self.layer = CBAMSqueezeAttend(in_features, inner_features, squeeze_factor = 4, act = act)
        elif self.attention_type == 'se_attend':
            self.layer = SESqueezeAttend(in_features, inner_features, squeeze_factor = 4, act = act)
        else:
            self.layer = SelfAttentionVanilla(in_features, inner_features, 1, dev)
    def forward(self, x):
        return self.layer(x) 

In [11]:
class BottleNeckVanilla(pl.LightningModule):
    def __init__(self, in_features, inner_features, dev, stochastic_depth = 0.2, attention_type = 'se', act = 'relu'):
        super().__init__()
        self.in_features = in_features
        self.inner_features = inner_features
        self.dev = dev
        self.stochastic_depth = stochastic_depth
        
        self.Squeeze = ConvBlockVanilla(self.in_features, self.inner_features, 1, 0, 1, 1, act = act)
        self.Process = ConvBlockVanilla(self.inner_features, self.inner_features, 3, 1, 1, 1, act = act) 
        self.Expand = ConvBlockVanilla(self.inner_features, self.in_features, 1, 0, 1, 1, act = act)
        self.SE = AttentionVanilla(self.in_features, self.in_features // 4, self.dev, attention_type = attention_type, act = act)
        self.gamma = nn.Parameter(torch.zeros((1), device = self.dev))
        
    def forward(self, x):
        if self.training and random.random() < self.stochastic_depth:
            return x # Stochastic Depth
        squeezed = self.Squeeze(x)
        processed = self.Process(squeezed)
        expanded = self.Expand(processed) 
        SE = self.SE(expanded) 
        return self.gamma * SE + (1 - self.gamma) * x
        
class InverseBottleNeckVanilla(pl.LightningModule):
    def __init__(self, in_features, inner_features, dev, stochastic_depth = 0.2, attention_type = 'se', act = 'relu'):
        super().__init__()
        self.in_features = in_features
        self.inner_features = inner_features
        self.dev = dev
        self.stochastic_depth = stochastic_depth
        
        self.expand = ConvBlockVanilla(self.in_features, self.inner_features, 1, 0, 1, 1, act = act)
        self.process = ConvBlockVanilla(self.inner_features, self.inner_features, 3, 1, self.inner_features, 1, act = act)
        self.SE = AttentionVanilla(self.inner_features, self.inner_features // 4, self.dev, attention_type = attention_type, act = act)
        self.squeeze = ConvBlockVanilla(self.inner_features, self.in_features, 1, 0, 1, 1, act = act)
        
        self.gamma = nn.Parameter(torch.zeros((1), device = self.dev))
    def forward(self, x):
        if self.training and random.random() < self.stochastic_depth:
            return x
        expanded = self.expand(x)
        processed = self.process(expanded)
        SE = self.SE(processed)
        squeezed = self.squeeze(SE)
        return self.gamma * squeezed + (1 - self.gamma) * x

In [12]:
# Certain Blocks can actually more effective when the downsampling is in the middle, this needs special blocks
# You can see this in NFNets, EffNets, and ResNets!
class BottleNeckDownSampler(pl.LightningModule):
    def __init__(self, in_features, inner_features, out_features, stride, dev, attention_type = 'se', act = 'relu'):
        # You can't exactly have a stochastic depth for this block.
        super().__init__()
        self.in_features = in_features
        self.inner_features = inner_features
        self.out_features = out_features
        self.stride = stride 
        self.dev = dev
        
        self.Squeeze = ConvBlockVanilla(self.in_features, self.inner_features, 1, 0, 1, 1, act = act)
        self.Process = ConvBlockVanilla(self.inner_features, self.inner_features, 3, 1, self.inner_features, self.stride, act = act) 
        self.Expand = ConvBlockVanilla(self.inner_features, self.out_features, 1, 0, 1, 1, act = act )
        self.SE = AttentionVanilla(self.out_features, self.out_features // 4, self.dev, attention_type = attention_type, act = act)
        
        self.pool = nn.AvgPool2d(kernel_size = 3, padding = 1, stride = self.stride)
        self.pool_conv = ConvBlockVanilla(self.in_features, self.out_features, 1, 0, 1, 1, act = act)
        self.gamma = nn.Parameter(torch.zeros((1), device = self.dev))
    def forward(self, x):
        pooled = self.pool_conv(self.pool(x)) # Capture orig info
        squeezed = self.Squeeze(x)
        processed = self.Process(squeezed)
        expand = self.Expand(processed)
        excited = self.SE(expand)
        
        return excited * self.gamma + (1 - self.gamma) * pooled
class InverseBottleNeckDownSampler(pl.LightningModule):
    def __init__(self, in_features, inner_features, out_features, stride, dev, attention_type = 'se', act = 'relu'):
        super().__init__()
        self.in_features = in_features
        self.inner_features = inner_features
        self.out_features = out_features
        self.stride = stride
        self.dev = dev
        
        self.Expand = ConvBlockVanilla(self.in_features, self.inner_features, 1, 0, 1, 1, act = act)
        self.Process = ConvBlockVanilla(self.inner_features, self.inner_features, 3, 1, self.inner_features, self.stride, act = act)
        self.SE = AttentionVanilla(self.inner_features, self.inner_features // 4, self.dev, attention_type = attention_type, act = act)
        self.Squeeze = ConvBlockVanilla(self.inner_features, self.out_features, 1, 0, 1, 1, act = act)
        
        self.pool = nn.AvgPool2d(kernel_size = 3, padding = 1, stride = self.stride)
        self.pool_conv = ConvBlockVanilla(self.in_features, self.out_features, 1, 0, 1, 1, act = act)
        self.gamma = nn.Parameter(torch.zeros((1), device = self.dev))
    def forward(self, x):
        pool = self.pool_conv(self.pool(x)) 
        expanded = self.Expand(x)
        processed = self.Process(expanded)
        SE = self.SE(processed)
        squeezed = self.Squeeze(SE)
        return squeezed * self.gamma + (1 - self.gamma) * pool

Fast AI 

In [13]:
# FAST AI Abstractions
class ConvBlockFA(pl.LightningModule):
    '''
    ConvLayer + BN, all abstracted away
    '''
    def __init__(self, in_features, out_features, kernel_size, padding, groups, stride, transposed = False, activation = nn.SiLU):
        super().__init__()
        self.layer = fastai.ConvLayer(in_features, out_features, ks = kernel_size, padding = padding, groups = groups, stride = stride, act_cls = activation, transpose = transposed)
    def forward(self, x):
        return self.layer(x)
class SEFA(pl.LightningModule):
    def __init__(self, in_features, inner_features):
        super().__init__()
        self.layer = fastai.SEModule(in_features, inner_features)
    def forward(self, x):
        return self.layer(x)
class SelfAttentionFA(pl.LightningModule):
    def __init__(self, in_features):
        super().__init__()
        self.layer = fastai.SelfAttention(in_features)
    def forward(self, x):
        return self.layer(x)
class BottleNeckFA(pl.LightningModule):
    def __init__(self, in_features, out_features, stochastic_depth = 0.2):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.stochastic_depth = stochastic_depth
        self.layer = fastai.SEBlock(1, in_features, out_features, reduction = 4) # Squeeze Factor of 4.
    def forward(self, x):
        if self.training and random.random() < self.stochastic_depth:
            return x
        return self.layer(x)
class InverseBottleNeckFA(pl.LightningModule):
    def __init__(self, in_features, out_features, stochastic_depth = 0.2):
        super().__init__()
        self.stochastic_depth = stochastic_depth
        self.layer = fastai.SEBlock(2, in_features, out_features, reduction = 4, dw = True)
    def forward(self, x):
        if self.training and random.random() < self.stochastic_depth:
            return x
        return self.layer(x) 

In [14]:
class SE(pl.LightningModule):
    def __init__(self, in_features, inner_features, dev, framework = 'fastai'):
        super().__init__()
        self.framework = framework 
        if self.framework == 'fastai':
            self.layer = SEFA(in_features, in_features)
        else:
            self.layer = SEVanilla(in_features, inner_features, dev)
    def forward(self, x):
        return self.layer(x)
class SelfAttention(pl.LightningModule):
    def __init__(self, in_features, framework = 'fastai'):
        super().__init__()
        self.framework = framework
        if self.framework == 'fastai':
            self.layer = SelfAttentionFA(in_features)
        else:
            self.layer = SelfAttention(in_features)
    def forward(self, x):
        return self.layer(x)
class Attention(pl.LightningModule):
    '''
    Select Between SE or CBAM in one module.
    '''
    def __init__(self, in_features, inner_features, dev, framework = 'fastai', attention_type = 'se', act = 'relu'):
        assert attention_type in ['se', 'cbam', 'self', 'none', 'se_attend', 'cbam_attend']
        super().__init__()
        self.framework = framework
        if attention_type == 'none':
            self.layer = nn.Identity()
        elif self.framework == 'fastai':
            if attention_type == 'se':
                self.layer = SE(in_features, inner_features, dev, framework = framework)
            else:
                self.layer= SelfAttention(in_features, framework = framework)
        else:
            self.layer = AttentionVanilla(in_features, inner_features, dev, attention_type = attention_type, act = act)
    def forward(self, x):
        return self.layer(x)

Select Between FrameWorks

In [15]:
# FAST AI Abstractions
class ConvBlock(pl.LightningModule):
    '''
    ConvLayer + BN, all abstracted away
    '''
    def __init__(self, in_features, out_features, kernel_size, padding, groups, stride, transposed = False, activation = nn.SiLU, framework = 'fastai', act = 'relu'):
        super().__init__()
        self.framework = framework
        if self.framework == 'fastai':
            self.layer = ConvBlockFA(in_features, out_features, kernel_size, padding, groups, stride, transposed = transposed, activation = activation)
        else:
            self.layer = ConvBlockVanilla(in_features, out_features, kernel_size, padding, groups, stride, act = act)
    def forward(self, x):
        return self.layer(x)

class BottleNeck(pl.LightningModule):
    def __init__(self, in_features, inner_features, dev, stochastic_depth = 0.2, framework = 'fastai', attention_type = 'se', act = 'relu'):
        super().__init__()
        self.framework = framework
        if self.framework == 'fastai':
            self.layer = BottleNeckFA(in_features, in_features, stochastic_depth = stochastic_depth)
        else:
            self.layer = BottleNeckVanilla(in_features, inner_features, dev, stochastic_depth = stochastic_depth, attention_type = attention_type, act = act) 
    def forward(self, x):
        return self.layer(x)
class InverseBottleNeck(pl.LightningModule):
    def __init__(self, in_features, inner_features, dev, stochastic_depth = 0.2, framework = 'fastai', attention_type= 'se', act = 'relu'):
        super().__init__()
        self.framework = framework
        if self.framework == 'fastai':
            self.layer = InverseBottleNeckFA(in_features, in_features, stochastic_depth = stochastic_depth)
        else:
            self.layer = InverseBottleNeckVanilla(in_features, inner_features, dev, stochastic_depth = stochastic_depth, attention_type = attention_type, act = act) 
    def forward(self, x):
        return self.layer(x)


# Resizing Module

In [16]:
class Resizer(pl.LightningModule):
    '''
    Resizes Images from 1024 x 1024 to 320 x 320 for the CNNs. Light CNN + Bilinear Interpolation(CNN should capture key information)
    NO STOCHASTIC DEPTH here(its already a light model.)
    '''
    def __init__(self, out_size, dev, framework = 'fastai', attention_type = 'se', act = 'relu'):
        super().__init__()
        self.out_size = out_size
        self.dev = dev
        self.framework = framework
        self.stochastic_depth = 0
        # CNN Parts
        
        self.Initial = nn.Sequential(*[
            ConvBlock(3, 8, 3, 1, 1, 1, framework = self.framework, act = act),
            ConvBlock(8, 16, 1, 0, 1, 1, framework = self.framework, act = act )
        ])
        
        self.Process = nn.Sequential(*[
            BottleNeck(16, 4, self.dev, stochastic_depth = self.stochastic_depth, framework = self.framework, attention_type= attention_type, act = act) for i in range(1)
        ])

        self.proj = ConvBlock(16, 3, 3, 1, 1, 1, framework = self.framework, act = act)
     
        self.gamma = nn.Parameter(torch.zeros((1), device = self.dev))
    def forward(self, x):
        '''
        X: Tensor(B, 3, 1024, 1024)
        '''
        resize = F.interpolate(x, size = (self.out_size, self.out_size))
        
        initial = self.Initial(x)
        resized_conv = F.interpolate(initial, size = (self.out_size, self.out_size))
        process = self.Process(resized_conv) + resized_conv
        proj = self.proj(process)
        return self.gamma * proj + (1 - self.gamma) * resize

BaseLine Model

In [17]:
class BaseLineResNet(pl.LightningModule):
    '''
    BaseLine ResNet-50D
    '''
    def freeze(self, layers):
        for layer in layers:
            for parameter in layer.parameters():
                parameter.requires_grad = False
    def unfreeze(self, layers):
        for layer in layers:
            for parameter in layer.parameters():
                parameter.requires_grad = True
    def increase_drop_prob(self):
        self.drop_prob += self.increase_dropout
    def increase_stochasticity(self):
        self.cur_stochastic += self.stochastic_increase
    def __init__(self, num_classes, dev, increase_dropout = 0.1, framework = 'fastai', stochastic_depth = False, act = 'relu'):
        super().__init__()
        self.num_classes = num_classes
        self.dev = dev
        self.framework = framework
        self.stochastic_depth = stochastic_depth
        # HYPER PARAMETER ------------------------------------
        self.model_name = 'resnet50d'
        self.increase_dropout = increase_dropout
        self.drop_prob = 0
        if not self.stochastic_depth:
            self.cur_stochastic = 0
            self.stochastic_increase = 0
        else:
            self.cur_stochastic = 0
            self.stochastic_increase = 0.1
        self.attention_type = 'se'
        # END OF HYPER PARAMETERS ----------------------------
        
        # Pretrained Model
        self.model = timm.create_model(self.model_name, pretrained = True)
        # Extract Layers
        self.conv1 = self.model.conv1 # (64)
        self.bn1 = self.model.bn1
        self.act1 = self.model.act1
        self.pool = self.model.maxpool
        
        self.layer1 = self.model.layer1 # (256)
        self.layer2 = self.model.layer2 # (512)
        self.layer3 = self.model.layer3 # (1024)
        self.layer4 = self.model.layer4 # we won't use this layer, it's too computationally expensive and doesn't fully use the sizes.
        
        # Freeze Initial Layers
        self.freeze([self.conv1, self.bn1, self.layer1, self.layer2])  
        # Custom Layers
        
        self.DropoutLayer2 = nn.Dropout2d(self.drop_prob)
        self.AttentionLayer2 = Attention(512, 128, self.dev, attention_type = self.attention_type, act = act, framework = self.framework) 
        self.increase_drop_prob()
        self.DropoutLayer3 = nn.Dropout2d(self.drop_prob)
        self.AttentionLayer3 = Attention(1024, 256, self.dev, attention_type = self.attention_type, act = act, framework = self.framework)
        self.increase_drop_prob()
        self.DropoutLayer4 = nn.Dropout2d(self.drop_prob)
        self.AttentionLayer4 = Attention(1256, 320, self.dev, attention_type = self.attention_type, act = act, framework = self.framework)
        self.increase_drop_prob()
        def add_layer(x):
            self.increase_stochasticity()
            return x
        self.layer4 = nn.Sequential(*[
            BottleNeckDownSampler(1024, 256, 1256, 2, self.dev, attention_type = self.attention_type, act = act)
        ] + [
            add_layer(BottleNeck(1256, 320, self.dev, stochastic_depth = self.stochastic_depth, framework = self.framework, attention_type = self.attention_type, act = act)) for i in range(3) 
        ])
        
        
        self.layer5 = nn.Sequential(*[
            BottleNeckDownSampler(1256, 320, 1536, 2, self.dev, attention_type = self.attention_type, act = act)
        ] + [
            add_layer(BottleNeck(1536, 512, self.dev, stochastic_depth = self.stochastic_depth, framework = self.framework, attention_type = self.attention_type, act = act)) for i in range(2)
        ])
        
        self.global_avg = nn.AdaptiveAvgPool2d((1, 1))
        self.increase_drop_prob()
        self.DropoutFinal = nn.Dropout(self.drop_prob)
        self.Linear = nn.Linear(1536, self.num_classes)
        
    def forward(self, x):
        '''
        BaseLine(BN) resnet Model
        '''
        features0 = self.pool(self.bn1(self.act1(self.conv1(x)))) # (B, 64, 160, 160)
        layer1 = self.layer1(features0) # (B, 256, 80, 80)
        layer2 = self.layer2(layer1) # (B, 512, 40, 40)
        # Attention2
        layer2 = self.DropoutLayer2(layer2)
        layer2 = self.AttentionLayer2(layer2)
        
        layer3 = self.layer3(layer2) # (B, 1024, 20, 20)
        # Attention3
        layer3 = self.DropoutLayer3(layer3)
        layer3 = self.AttentionLayer3(layer3)
        
        layer4 = self.layer4(layer3) # (B, 1256, 10, 10) 
        # Attention4
        layer4 = self.DropoutLayer4(layer4)
        layer4 = self.AttentionLayer4(layer4)
        
        layer5 = self.layer5(layer4) # (B, 1536, 5, 5) 
        # Avg Pool
        avg_pool = torch.squeeze(self.global_avg(layer5))
        dropped = self.DropoutFinal(avg_pool)
        return self.Linear(dropped)
        
        

class BaseLineEffNet(pl.LightningModule):
    '''
    BaseLine EffNet-b4
    '''
    def freeze(self, layers):
        for layer in layers:
            for parameter in layer.parameters():
                parameter.requires_grad = False
    def unfreeze(self, layers):
        for layer in layers:
            for parameter in layer.parameters():
                parameter.requires_grad = False
    def increase_stochasticity(self):
        '''
        Increases the Factor of Stochastic Depth.
        '''
        self.cur_stochastic += self.stochastic_increase
    def increase_drop_prob(self):
        self.drop_prob += self.increase_dropout
    def __init__(self, num_classes, dev, increase_dropout = 0.2, framework = 'fastai', stochastic_depth = False, act = 'silu'):
        super().__init__()
        self.framework = framework
        self.dev = dev
        self.num_classes = num_classes
        self.stochastic_depth = stochastic_depth
        
        # HYPER PARAMETERS -----------------------------------
        self.drop_prob = 0
        self.increase_dropout = increase_dropout
        self.model_name = 'tf_efficientnet_b4_ns'
        self.attention_type = 'se'
        if self.stochastic_depth == False:
            self.cur_stochastic = 0.0
            self.stochastic_increase = 0.0
        else:
            self.cur_stochastic = 0.2
            self.stochastic_increase = 0.1
        # END OF HYPER PARAMETERS ----------------------------
        
        self.model = timm.create_model(self.model_name, pretrained = True)
        # Extract Layers 
        self.conv1 = self.model.conv_stem
        self.bn1 = self.model.bn1
        self.act1 = self.model.act1
        
        self.block0 = self.model.blocks[0]
        self.block1 = self.model.blocks[1]
        self.block2 = self.model.blocks[2]
        self.block3 = self.model.blocks[3]
        self.block4 = self.model.blocks[4]
        self.block5 = self.model.blocks[5]
        self.block6 = self.model.blocks[6]
        # Freeze Layers
        self.freeze([self.conv1, self.bn1, self.block0, self.block1, self.block2])
        # Custom Layers
        self.Dropout0 = nn.Dropout2d(self.drop_prob)
        self.increase_drop_prob()
        self.Attention0 = Attention(56, 16, self.dev, attention_type = self.attention_type, act = act, framework = self.framework) 
        
        self.Dropout1 = nn.Dropout2d(self.drop_prob)
        self.increase_drop_prob()  
        self.Attention1 = Attention(160, 48, self.dev, attention_type = self.attention_type, act = act, framework = self.framework)
        self.Dropout2 = nn.Dropout2d(self.drop_prob)
        self.increase_drop_prob()
        self.Attention2 = Attention(448, 128, self.dev, attention_type = self.attention_type, act = act, framework = self.framework )
        
        def add_block():
            self.increase_stochasticity()
            return InverseBottleNeck(512, 1024, self.dev, stochastic_depth = self.cur_stochastic, framework = self.framework, attention_type = self.attention_type, act = act)
        self.block7 = nn.Sequential(*[
            InverseBottleNeckDownSampler(448, 1536, 512, 2, self.dev, attention_type = self.attention_type, act = act)
        ] + [
            add_block() for i in range(3)
        ])
        self.proj = ConvBlock(512, 1536, 1, 0, 1, 1, framework = self.framework, act =act)
        self.global_avg = nn.AdaptiveAvgPool2d((1, 1))
        self.increase_drop_prob()
        self.Final_Drop = nn.Dropout(self.drop_prob)
        self.Linear = nn.Linear(1536, self.num_classes)
    def forward(self, x):
        features0 = self.bn1(self.act1(self.conv1(x))) # (48, 160, 160)
        block0 = self.block0(features0) # (B, 24, 160, 160)
        block1 = self.block1(block0) # (B, 32, 80, 80)
        block2 = self.block2(block1) # (B, 56, 40, 40)
        # Attention0
        block2 = self.Dropout0(block2)
        block2 = self.Attention0(block2)
        
        block3 = self.block3(block2) # (B, 112, 20, 20)
        block4 = self.block4(block3) # (B, 160, 20, 20)
        # Attention1
        block4 = self.Dropout1(block4)
        block4 = self.Attention1(block4)
        
        block5 = self.block5(block4) # (B, 272, 10, 10)
        block6 = self.block6(block5) # (B, 448, 10, 10)
        # Attention2 
        block6 = self.Dropout2(block6)
        block6 = self.Attention2(block6) 
        
        # Custom Layer7
        block7 = self.block7(block6)
        proj = self.proj(block7)
        # Global Average
        global_avg = torch.squeeze(self.global_avg(proj))
        global_avg = self.Final_Drop(global_avg)
        return self.Linear(global_avg)

In [18]:
class BaseLineMobileNet(pl.LightningModule):
    def freeze(self, layers):
        for layer in layers:
            for parameter in layer.parameters():
                parameter.requires_grad = False
    def unfreeze(self, layers):
        for layer in layers:
            for parameter in layer.parameters():
                parameter.requires_grad = True
    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes
        self.model_name = 'mobilenetv3_large_100'
        self.model = timm.create_model(self.model_name, pretrained = True)
        
        # Block Extraction
        self.conv1 = self.model.conv_stem
        self.bn1 = self.model.bn1
        self.act1 = self.model.act1
        
        self.block0 = self.model.blocks[0]
        self.block1 = self.model.blocks[1]
        self.block2 = self.model.blocks[2]
        self.block3 = self.model.blocks[3]
        self.block4 = self.model.blocks[4]
        self.block5 = self.model.blocks[5]
        self.block6 = self.model.blocks[6]
        
        #self.freeze([self.conv1, self.bn1, self.block0, self.block1, self.block2, self.block3, self.block4, self.block5, self.block6])
        self.global_avg = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(960, self.num_classes)
    def forward(self, x):
        features0 = self.bn1(self.act1(self.conv1(x)))
        block0 = self.block0(features0)
        block1 = self.block1(block0)
        block2 = self.block2(block1)
        block3 = self.block3(block2)
        block4 = self.block4(block3)
        block5 = self.block5(block4)
        block6 = self.block6(block5)
        avg = torch.squeeze(self.global_avg(block6))
        return self.fc(avg)
        

# ARCFACE MODULE, Trying to differentiate between samples.

In [19]:
class ArcFaceModule(pl.LightningModule):
    '''
    Given some model, it takes the logits, and manipulates them to add a margin and make the difference between samples larger.
    
    Different actions at test and train time.
    
    Experimental, it didn't work too great.
    '''
    def __init__(self, model, num_classes):
        super().__init__()
        self.model = model
        self.num_classes = num_classes
        self.Linear = nn.Linear(self.num_classes, self.num_classes, bias = False)
        self.margin = 0.2 # about 10 degrees away in radians(7700 classes is a lot, so margins cant be too large)
        self.eps = 1e-7
    def forward_train(self, x, y):
        logits = self.model(x)
        # Extract Weights and Normalize
        weights = F.normalize(self.Linear.weight)
        norm_logits = F.normalize(logits)
        
        cos = F.linear(norm_logits, weights, bias = None)
        # Clip to prevent numeric stability errors
        cos = cos.clip(-1 + self.eps, 1 - self.eps)
        arccos = cos.arccos()
        
        # Add Margin
        arccos = arccos + F.one_hot(y, num_classes = self.num_classes) * self.margin
        # Convert back to cos
        cos = arccos.cos()
        return logits, F.cross_entropy(cos, y)
    def forward(self, x):
        return self.model(x)

BaseLine Training code with PyTorch Lightning:

In [20]:
class BaseLineOptimConfig:
    optimizer = 'adam'
    act = 'relu'
    weight_decay = 1e-3
    lr = 3e-4
    eta_min = 1e-7 # Cosine Annealing LR
    num_steps = 5 # StepLR and Cosine Annealing LR
     
    step = 0.95 # StepLR
    model_type = 'baseline'

Training Code

In [21]:
class PyTorchLightningBaseLineQT(pl.LightningModule):
    def __init__(self, num_classes, dev, fold_num = 0):
        super().__init__()
        self.num_classes = num_classes
        self.dev = dev
        self.fold_num = fold_num
        # HYPERPARAMETERS ----------------------------
        self.framework = 'pytorch' # Fast AI's implementation ironically takes 2GB more memory.
        self.increase_dropout = 0.0
        self.stochastic_depth = False
        self.learning_rate = BaseLineOptimConfig.lr
        # END OF HYPERPARAMETERS ---------------------
        self.liveloss = livelossplot.PlotLosses()
        # Send Model to Device
        self.model = self.configure_model()
        self.optim = self.configure_optimizers()[0]
        self.to(self.dev)
        # Internal States
        self.bestValLoss = float('inf')
        self.bestValAcc = 0
        # Per Epoch States
        self.trainLoss = 0
        self.valLoss = 0
        self.trainAcc = 0
        self.valAcc = 0
        
        self.trainCount = 0
        self.valCount = 0
        self.curEpoch = 0
    def inference(self, x):
        self.eval()
        with torch.no_grad():
            return F.softmax(self.model(x))
    def forward(self, x):
        return self.model(x)
    def configure_optimizers(self):
        '''
        Loads Optimizers and LR_scheduler for the model.
        '''
        # Creates an Optimizer for the model 
        if BaseLineOptimConfig.optimizer == 'ranger':
            # Ranger(Radam + Lookahead)
            optimizer = Ranger(self.parameters(), lr = self.learning_rate, weight_decay = BaseLineOptimConfig.weight_decay)
        else:
            # Adam
            optimizer = optim.Adam(self.parameters(), lr = self.learning_rate, weight_decay = BaseLineOptimConfig.weight_decay)
        # Load in LR_Scheduler
        self.lr_decay = optim.lr_scheduler.CosineAnnealingLR(optimizer, BaseLineOptimConfig.num_steps, eta_min = BaseLineOptimConfig.eta_min)
            
        # Load in Second LR Scheduler
        self.lr_decay2 =  optim.lr_scheduler.StepLR(optimizer, BaseLineOptimConfig.num_steps, BaseLineOptimConfig.step)
            
        return [optimizer]
    def configure_model(self):
        '''
        Loads a New Model
        '''
        if BaseLineOptimConfig.model_type == 'resnet':
            model = BaseLineResNet(self.num_classes, self.dev, act = BaseLineOptimConfig.act, increase_dropout = self.increase_dropout, framework = self.framework, stochastic_depth = self.stochastic_depth)
        elif BaseLineOptimConfig.model_type == 'baseline':
            model = BaseLineMobileNet(self.num_classes)
        else:
            model = BaseLineEffNet(self.num_classes, self.dev, act = BaseLineOptimConfig.act, increase_dropout = self.increase_dropout, framework = self.framework, stochastic_depth = self.stochastic_depth)
        model = ArcFaceModule(model, self.num_classes) 
        return model
    def accuracy(self, y_pred, y_true):
        '''
        y_pred: Tensor(B, C)
        y_true: Tensor(B)
        Accuracy From Logits
        '''
        # Argmax
        B, C = y_pred.shape
        _, y_logits = torch.max(F.softmax(y_pred), dim = -1)
        acc = torch.sum((y_logits == y_true).int()) / B
        return acc
        
    def training_step(self, train_batch, batch_idx):
        '''
        One Training Step
        '''
        x, y = train_batch
        # Send Data to GPU
        x = x.to(self.dev)
        y = y.to(self.dev)
        
        logits, loss = self.model.forward_train(x, y)
        
        acc = self.accuracy(logits, y)

        # Log the data for LiveLossPlot
        print(f"STEP: {batch_idx}, L: {loss.item()}, A: {acc.item()}")
        self.trainLoss += loss.item()
        self.trainAcc += acc.item()
        self.trainCount += 1
        if batch_idx % 100 == 0:
            self.lr_decay.step()
            self.lr_decay2.step()
        del x, y, acc
        return loss
        
    def validation_step(self, val_batch, batch_idx):
        '''
        One Validation Step
        '''
        x, y = val_batch
        # Send Data to GPU
        x = x.to(self.dev)
        y = y.to(self.dev)
        pred, loss = self.model.forward_train(x)
        acc = self.accuracy(pred, y)
        # Logs For Early Stopping
        self.log('val_loss', loss.item())
        self.log('val_acc', acc.item())
        # Logs for Validation end
        self.valLoss += loss.item()
        self.valAcc += acc.item()
        self.valCount += 1
        del x, y, pred, loss, acc
    def update_states(self):
        # Per Epoch States
        self.trainLoss = 0
        self.valLoss = 0
        self.trainAcc = 0
        self.valAcc = 0
        
        self.trainCount = 0
        self.valCount = 0
        self.curEpoch += 1
    def create_logs(self):
        '''
        Creates Logs for LiveLossPlot
        '''
        logs = {}
        logs['loss'] = self.trainLoss
        logs['val_loss'] = self.valLoss
        logs['accuracy'] = self.trainAcc
        logs['val_accuracy'] = self.valAcc
        return logs
    def saveBest(self):
        '''
        If this is a best model, saves the state dictionaries
        '''
        if self.valLoss < self.bestValLoss:
            self.bestValLoss = self.valLoss
            torch.save(self.state_dict(), f'./fold_{self.fold_num}_loss.pth')
        elif self.valAcc > self.bestValAcc:
            self.bestValAcc = self.valAcc
            torch.save(self.state_dict(), f"./fold_{self.fold_num}_acc.pth")
    def training_epoch_end(self, outputs):
        self.lr_decay.step()
        self.lr_decay2.step()
    def validation_epoch_end(self, outputs):
        '''
        Logs all data to livelossplot and saves models when necessary(Improvement)
        '''
        # Divide Losses to get a per Step loss
        if self.valCount != 0:
            self.valLoss /= self.valCount
            self.valAcc /= self.valCount
        if self.trainCount != 0:
            self.trainLoss /= self.trainCount
            self.trainAcc /= self.trainCount
        # Round Values
        self.valLoss = round(self.valLoss, 3) 
        self.valAcc = round(self.valAcc, 3)
        
        self.trainLoss = round(self.trainLoss, 3)
        self.trainAcc = round(self.trainAcc, 3) # Round for cleaner Numbes
        
        # Update LiveLossPlot
        logs = self.create_logs()
        self.liveloss.update(logs)
        self.liveloss.send()
        # best state dict?
        self.saveBest()
        # Print Logs:
        print(f"E {self.curEpoch}, BL: {self.bestValLoss}, BA: {self.bestValAcc}, L:{self.trainLoss}, A: {self.trainAcc}, VL: {self.valLoss}, VA: {self.valAcc}")
        # Clear States
        self.update_states()
    def training_loop(self, trainloader, NUM_EPOCHS):
        for EPOCH in range(NUM_EPOCHS):
            self.train()
            for idx, (images, labels) in enumerate(trainloader):
                self.optim.zero_grad()
                loss = self.training_step((images, labels), idx) 
                loss.backward()
                self.optim.step()

Train Initial Network 

In [22]:
def construct_new(fold_idx, load_prev = None):
    model = PyTorchLightningBaseLineQT(Config.NUM_CLASSES, Config.device, fold_idx)
    if load_prev:
        model.load_state_dict(torch.load(f'{load_prev}fold_{fold_idx}_loss.pth', map_location = Config.device))
    # Create Trainer
    early_stopping = []
    
    trainer = pl.Trainer(max_epochs = Config.NUM_EPOCHS, checkpoint_callback = False, logger = None, check_val_every_n_epoch = 1, gpus = 1, num_sanity_val_steps = 0, callbacks = early_stopping, benchmark = False, deterministic = True, precision = 16)
    return model, trainer
def MultiFoldTrainPL(NUM_SPLITS):
    for fold_idx in range(NUM_SPLITS):
        train, val = dataModule.get_both(fold_idx)
        model = PyTorchLightningBaseLineQT(Config.NUM_CLASSES, Config.device, fold_idx)
        model.training_loop(train, val, Config.NUM_EPOCHS)
def BaseLineMultiFoldTrain(fold_idx, load_prev = None):
    train_loader, val_loader = dataModule.get_both(fold_idx)
    # Create New Model
    model, trainer = construct_new(fold_idx, load_prev = load_prev)
    trainer.fit(model, train_loader, val_loader)

In [23]:
#pl.seed_everything()
#print(BaseLineMultiFoldTrain(0, load_prev = None))

# NFNet Blocks from Scratch

In [60]:
class ScaledReLU(pl.LightningModule):
    '''
    Scaled ReLU block, used to reduce the size of activations after a NFBlock.
    
    Decided sort of arbitrarily from the paper writers. Scale by constant: (math.sqrt(2 / (1 - (1 / pi))))
    '''
    def __init__(self):
        super().__init__()
        self.act1 = nn.ReLU(inplace = True)
        self.constant = math.sqrt(2 / (1 - (1 / math.pi)))
    def forward(self, x):
        return self.act1(x) * self.constant # Scale Down Magnitude of RELU Activation

class WSConv(pl.LightningModule):
    '''
    Weight Standardized Convolutional Block
    '''
    def __init__(self, in_features, out_features, kernel_size, padding, groups, stride):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.kernel_size = kernel_size
        self.padding = padding
        self.groups = groups
        self.stride = stride
        self.conv = nn.Conv2d(in_features, out_features, kernel_size = kernel_size, padding = padding, groups = groups, stride = stride)
    def forward(self, x):
        '''
        Simply Standardizes the Weights before applying Conv2d.
        '''
        out_cn, A, B, C = self.conv.weight.shape
        norm_weights = self.conv.weight#F.layer_norm(self.conv.weight, (A, B, C))
        return F.conv2d(x, weight = norm_weights, bias = self.conv.bias, stride = self.stride, padding = self.padding, groups = self.groups)
class WSConvBlock(pl.LightningModule):
    '''
    Encloses all Parts related to Weight-Standardized Convolutions and Scaled RELU
    
    No BatchNorm Here!
    '''
    def __init__(self, in_features, out_features, kernel_size, padding, groups, stride, activation = True):
        super().__init__()
        self.conv = WSConv(in_features, out_features, kernel_size, padding, groups, stride)
        self.act1 = ScaledReLU()
    def forward(self, x):
        return self.act1(self.conv(x))

class ScaledSE(pl.LightningModule):
    def __init__(self, in_features, inner_features, dev):
        super().__init__()
        self.in_features = in_features
        self.inner_features = inner_features
        self.dev = dev
        
        self.Squeeze = nn.Linear(self.in_features, self.inner_features)
        self.act1 = ScaledReLU()
        self.Excite = nn.Linear(self.inner_features, self.in_features)
        self.gamma = nn.Parameter(torch.zeros((1), device = self.dev))
    def forward(self, x):
        mean = torch.mean(x, dim = -1)
        mean = torch.mean(mean, dim = -1)
        squeezed = self.act1(self.Squeeze(mean))
        excited = torch.sigmoid(self.Excite(squeezed)).unsqueeze(-1).unsqueeze(-1)
        return excited * x * self.gamma + x * (1 - self.gamma) 
    
class ScaledCBAM(pl.LightningModule):
    def __init__(self, in_features, inner_features, dev):
        super().__init__()
        self.in_features = in_features
        self.inner_features = inner_features 
        self.dev = dev
    
        self.Squeeze = nn.Linear(self.in_features, self.inner_features) 
        self.act1 = ScaledReLU()
        self.Excite = nn.Linear(self.inner_features, self.in_features)
        self.gamma = nn.Parameter(torch.zeros((1), device = self.dev))
    def forward(self, x):
        mean = torch.mean(x, dim = -1)
        mean = torch.mean(mean, dim = -1)
        max_pooled, _ = torch.max(x, dim = -1)
        max_pooled, _ = torch.max(max_pooled, dim = -1)
        
        squeeze_mean = self.act1(self.Squeeze(mean))
        excite_mean = self.Excite(squeeze_mean)
        
        squeeze_max = self.act1(self.Squeeze(max_pooled))
        excite_max = self.Excite(squeeze_max) 
        
        excite = torch.sigmoid((excite_max + excite_mean) / 2).unsqueeze(-1).unsqueeze(-1)
        return excite * x * self.gamma + x * (1 - self.gamma)
class ScaledAttention(pl.LightningModule):
    def __init__(self, in_features, inner_features, dev, attention_type = 'se'):
        super().__init__()
        self.attention_type = attention_type
        assert self.attention_type in ['se', 'cbam']
        if self.attention_type == 'se':
            self.layer = ScaledSE(in_features, inner_features, dev)
        else:
            self.layer = ScaledCBAM(in_features, inner_features, dev)
    def forward(self, x):
        return self.layer(x)

In [61]:
class NFBlock(pl.LightningModule):
    '''
    Replaces the Need for a BottleNeck Block, effectively a InverseBottleNeck with a variety of other tricks to replace BN.
    '''
    def __init__(self, in_features, inner_features, dev, layer_num = 0, attention_type = 'se', stochastic_depth = 0.2):
        super().__init__()
        self.in_features = in_features
        self.inner_features = inner_features
        self.dev = dev
        self.attention_type = attention_type
        self.layer_num = layer_num
        self.stochastic_depth = stochastic_depth
        
        # Parameters
        self.alpha = 0.2 
        self.beta = math.sqrt(1 + self.layer_num * self.alpha ** 2)
        # Custom Layers
        self.expand = WSConvBlock(self.in_features, self.inner_features, 1, 0, 1, 1)
        self.dw = WSConvBlock(self.inner_features, self.inner_features, 3, 1, self.inner_features, 1) 
        self.squeeze = WSConvBlock(self.inner_features, self.in_features, 1, 0, 1, 1)
        self.SE = ScaledAttention(self.in_features, self.in_features // 4, self.dev, attention_type = self.attention_type)
        self.gamma = nn.Parameter(torch.zeros((1), device = self.dev))
    def forward(self, x):
        # Stochastic Depth
        if self.training and random.random() < self.stochastic_depth:
            return x
        # Scale Down the Initial value to reduce to unit-variance
        x = x / self.beta
        
        expand = self.expand(x)
        dw = self.dw(expand)
        squeeze = self.squeeze(dw)
        SE = self.SE(squeeze) * self.alpha
        return self.gamma * SE + (1 - self.gamma) * x      

In [62]:
class NFDownsamplerBlock(pl.LightningModule):
    def __init__(self, in_features, inner_features, out_features, stride, dev, layer_num = 0, attention_type = 'se'):
        super().__init__()
        self.in_features = in_features
        self.inner_features = inner_features
        self.out_features = out_features
        self.stride = stride
        self.dev = dev
        self.layer_num = layer_num
        self.attention_type = attention_type
        
        # Scaling Params
        self.alpha = 0.2
        self.beta = math.sqrt(1 + self.layer_num * self.alpha ** 2) 
    
        # Custom Layers 
        self.pool = nn.AvgPool2d(kernel_size = 3, padding = 1, stride = self.stride)
        self.conv_pool = WSConvBlock(self.in_features, self.out_features, 1, 0, 1, 1)
        
        self.expand = WSConvBlock(self.in_features, self.inner_features, 1, 0, 1, 1)
        self.dw = WSConvBlock(self.inner_features, self.inner_features, 3, 1, self.inner_features, self.stride)
        self.squeeze = WSConvBlock(self.inner_features, self.out_features, 1, 0, 1, 1)
        
        self.gamma = nn.Parameter(torch.zeros((1), device = self.dev))
    def forward(self, x):
        # Scale Down Activation
        x = x / self.beta
    
        pool = self.conv_pool(self.pool(x))
        
        expand = self.expand(x)
        dw = self.dw(expand)
        squeeze = self.squeeze(dw)
        return (self.gamma * squeeze + (1 - self.gamma) * pool) * self.alpha

In [63]:
# Normalizer-Free Resizer 
class NFResizer(pl.LightningModule):
    def __init__(self, out_size, dev):
        super().__init__()
        self.out_size = out_size
        self.dev = dev
        # HYPER PARAMETERS ----------------------
        self.stochastic_depth = 0 
        self.attention_type = 'se'
        # END OF HYPER PARAMETERS ---------------
        self.initial = nn.Sequential(*[
            WSConvBlock(3, 8, 3, 1, 1, 1),
            WSConvBlock(8, 16, 1, 0, 1, 1)
        ])
        
        self.process = nn.Sequential(*[
            NFBlock(16, 32, self.dev, layer_num = i, attention_type = self.attention_type, stochastic_depth = self.stochastic_depth) for i in range(1)
        ])
        self.proj = WSConvBlock(16, 3, 3, 1, 1, 1)
        
        self.gamma = nn.Parameter(torch.zeros((1), device = self.dev))
    def forward(self, x):
        '''
        Normalizer Free Resizer Module(To preserve information from 1024x1024 images and try to autoencode them into 320x320)
        '''
        # Standard Resize
        resize = F.interpolate(x, size= (self.out_size, self.out_size), mode = 'bilinear')
        
        initial = self.initial(x)
        resized_initial= F.interpolate(initial, size = (self.out_size, self.out_size), mode = 'bilinear')
        process = self.process(resized_initial)
        proj = self.proj(process)
        
        return proj * self.gamma + (1 - self.gamma) * resize

In [64]:
class ModifiedNFNet_f0(pl.LightningModule):
    '''
    Modifies and Transfer Learns on the NFNet_f0(13 M parameters pretrained, 3M from scratch
    Heavily Modified to reduce memory and make it feasibly trainable.
    '''
    def freeze(self, layers):
        for layer in layers:
            for parameter in layer.parameters():
                parameter.requires_grad = False
    def unfreeze(self, layers):
        for layer in layers:
            for parameter in layer.parameters():
                parameter.requires_grad = False 
    def increase_drop_prob(self):
        self.drop_prob += self.increase_dropout
    def increase_layer_num(self):
        self.initial_layer_num += 1
    def increase_stochasticity(self):
        self.start_stochastic += self.increase_stochastic
    def __init__(self, num_classes, dev, increase_dropout = 0.1, attention_type = 'se', stochastic_depth = False):
        super().__init__()
        self.num_classes = num_classes
        self.dev = dev 
        self.drop_prob = 0
        self.increase_dropout= increase_dropout
        self.attention_type = attention_type
        self.stochastic_depth = stochastic_depth
        # HYPER PARAMETERS -----------------------
        if self.stochastic_depth:
            self.start_stochastic = 0
            self.increase_stochastic = 0.05 # Each layer, stochasticity increases to reduce param sizes.
        else:
            self.start_stochastic = 0
            self.increase_stochastic = 0.00
        # END OF HYPER PARAMETERS ----------------
        self.model_name = 'dm_nfnet_f0'
        self.model = timm.create_model(self.model_name, pretrained = True) 
        # Extract Layers
        self.stem = self.model.stem # (B, 128, 160, 160)
        self.layer1 = self.model.stages[0] # (B, 256, 80, 80)
        self.layer2 = self.model.stages[1] # (B, 512, 40, 40)
        #self.freeze([ self.stem, self.layer1, self.layer2])
        # Custom Layers
        self.initial_layer_num = 3 # 4th layer in NFNet(3 0-based counting)
        
        def add_layer(x):
            # Returns x, but increments layer num and stochasticity.
            self.increase_layer_num()
            self.increase_stochasticity()
            return x
        def add_layer_no_stoc(x):
            self.increase_layer_num()
            return x
        self.Attention2 = ScaledAttention(512, 128, self.dev, attention_type = self.attention_type)
        self.Dropout2 = nn.Dropout2d(self.drop_prob)
        self.increase_drop_prob()
        
        self.layer3 = nn.Sequential(*[
            add_layer_no_stoc(NFDownsamplerBlock(512, 768, 640, 2, self.dev, layer_num = self.initial_layer_num, attention_type = self.attention_type))
        ] + [
            add_layer(NFBlock(640, 768, self.dev, layer_num = self.initial_layer_num, attention_type = self.attention_type, stochastic_depth = self.start_stochastic)) for i in range(5)
        ])
        self.Attention3 = ScaledAttention(640, 196, self.dev, attention_type = self.attention_type)
        self.Dropout3 = nn.Dropout2d(self.drop_prob)
        self.increase_drop_prob()
        
        self.layer4 = nn.Sequential(*[
          add_layer_no_stoc(NFDownsamplerBlock(640, 1024, 768, 2, self.dev, layer_num = self.initial_layer_num, attention_type = self.attention_type))  
        ] + [
            add_layer(NFBlock(768, 1024, self.dev, layer_num = self.initial_layer_num, attention_type = self.attention_type, stochastic_depth = self.start_stochastic)) for i in range(3) 
        ])
        self.Attention4 = ScaledAttention(768, 256, self.dev, attention_type = self.attention_type)
        self.Dropout4 = nn.Dropout2d(self.drop_prob)
        self.increase_drop_prob()
        
        self.layer5 = nn.Sequential(*[
            add_layer_no_stoc(NFDownsamplerBlock(768, 1256, 1024, 2, self.dev, layer_num = self.initial_layer_num, attention_type = self.attention_type))
        ] + [
            add_layer(NFBlock(1024, 1536, self.dev, layer_num = self.initial_layer_num, attention_type = self.attention_type, stochastic_depth = self.start_stochastic)) for i in range(2)
        ])
        self.proj = WSConvBlock(1024, 2048, 1, 0, 1, 1)
        
        self.global_avg = nn.AdaptiveAvgPool2d((1, 1))
        self.increase_drop_prob()
        self.FinalDropout = nn.Dropout(self.drop_prob)
        self.Linear = nn.Linear(2048, self.num_classes)
        del self.model
        
    def forward(self, x):
        stem = self.stem(x) # (B, 128, 160, 160)
        layer1 = self.layer1(stem) # (B, 256, 80, 80)
        layer2 = self.layer2(layer1) # (B, 512, 40, 40)
        # Attention 2
        layer2 = self.Dropout2(layer2)
        layer2 = self.Attention2(layer2) # (B, 512, 40, 40)
        
        layer3 = self.layer3(layer2) # (B, 768, 20, 20)
        # Attention 3
        layer3 = self.Dropout3(layer3)
        layer3 = self.Attention3(layer3)
        
        layer4 = self.layer4(layer3)
        # Attention 4
        layer4 = self.Dropout4(layer4)
        layer4 = self.Attention4(layer4)
        
        layer5 = self.layer5(layer4)
        proj = self.proj(layer5)
        # Pool
        pooled = torch.squeeze(self.FinalDropout(self.global_avg(proj)))
        return self.Linear(pooled)

In [65]:
# BaseLine Model For Debugging, Pure Transfer Learned
class BaseLineNFNet(pl.LightningModule):
    def freeze(self, layers):
        for layer in layers:
            for parameter in layer.parameters():
                parameter.requires_grad = False
    def unfreeze(self, layers):
        for layer in layers:
            for parameter in layer.parameters():
                parameter.requires_grad = True
    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes
        self.model_name = 'dm_nfnet_f0'
        self.model = timm.create_model(self.model_name, pretrained = True)
        
        # Extract Layers
        self.stem = self.model.stem
        self.stage0 = self.model.stages[0]
        self.stage1 = self.model.stages[1]
        self.stage2 = self.model.stages[2]
        
        # Freeze(Optional) Layers
        self.freeze([self.stem, self.stage0])
        # Custom Layers
        self.proj = WSConv(1536, 3072, 3, 1, 1, 1)
        self.global_avg = nn.AdaptiveAvgPool2d((1, 1))
    
        self.Linear = nn.Linear(3072, self.num_classes)
        del self.model
    def forward(self, x):
        features0 = self.stem(x)
        stage0 = self.stage0(features0)
        stage1 = self.stage1(stage0)
        stage2 = self.stage2(stage1)
        
        proj = self.proj(stage2)
        avg = torch.squeeze(self.global_avg(proj))
        return self.Linear(avg)
        

In [66]:
class AdaptiveGradientClipping(optim.Optimizer):
    '''
    Custom Optimizer to Clip Gradients on All Layers except for the FC.
    '''
    def unitnorm(self, x):
        '''
        Computes the unit norm of a vector
        
        Computes the norm over every dim but the first
        '''
        if x.ndim <= 1:
            # Scalar or Tensor of shape (B), dont keep dim, just return a scalar
            dim = [0]
            keep_dim = False
        else:
            dim = [i + 1 for i in range(len(x.shape) - 1)]
            keep_dim = True
        x = x ** 2
        for d in dim:
            x = torch.sum(x, dim = d).unsqueeze(d)
        return x ** 0.5
         
    def __init__(self, model, optim, clip_limit = 1e-2, eps = 1e-5, ignore = "fc"):
        self.model = model 
        self.optim = optim
        self.clip_limit = clip_limit
        self.eps = eps
        self.param_groups = self.optim.param_groups
        self.ignore = ignore # Should be the final MLP
        # Extract Parameters out of Model
        self.params = [{'params': list(module.parameters())} for name, module in model.named_modules() if name != ignore]
    def step(self):
        '''
        Iterates through all parameters, clipping gradients adaptively.
        '''
        for p in self.params[0]['params']:
            if p.grad is None:
                continue
            # Weight Stored in P, p.grad stores gradients
            grad_norm = self.unitnorm(p.grad.detach() * 1)
            weight_norm = self.unitnorm(p.detach() * 1)
            
            normed = grad_norm / weight_norm # (B)
            threshold = normed > self.clip_limit
            
            clipped = p.grad * self.clip_limit * (weight_norm / torch.max(grad_norm, torch.ones_like(grad_norm, device = grad_norm.device) * self.eps))
            if clipped.shape[0] == 1 and len(clipped.shape) == 1:
                clipped = torch.squeeze(clipped)
                threshold = torch.squeeze(threshold)
            p.grad.detach().data.copy_(torch.where(threshold, clipped, p.grad))
        self.optim.step()
            
            
    def zero_grad(self):
        self.optim.zero_grad() # Use the Normal One
        

# PyTorch Lightning Training

In [67]:
class NFNetQTPiConfig:
    num_classes = Config.NUM_CLASSES
    device = Config.device
    increase_dropout = 0.0
    stochastic_depth = False
    attention_type = 'se'
    
    model_type = 'baseline'
    lr = 1e-3
    optim = 'adam'
    weight_decay = 1e-3
    
    num_steps = 5
    eta_min = 1e-7
    step_size = 0.95

In [68]:
class NFNetQTPi(pl.LightningModule):
    def __init__(self, dev, fold_idx = 0):
        super().__init__()
        self.dev = dev
        self.fold_idx = fold_idx
        self.config = NFNetQTPiConfig
        
        self.model_type = self.config.model_type
        assert self.model_type in ['baseline', 'QTPi']
        self.model = self.configure_model()
        self.criterion = nn.CrossEntropyLoss()
        # Internal States
        self.train_loss = 0
        self.train_acc = 0
        self.train_steps = 0
        
        self.val_loss = 0
        self.val_acc = 0
        self.val_steps = 0
    
        self.best_val_loss = 0
        self.best_val_acc = 0
        self.num_epochs = 0
        self.liveloss = livelossplot.PlotLosses()
        
        self.to(self.device)
    def configure_optimizers(self):
        if self.config.optim == 'adam':
            optimizer = optim.Adam(self.model.parameters(), lr = self.config.lr, weight_decay = self.config.weight_decay)
        else:
            optimizer = Ranger(self.model.parameters(), lr = self.config.lr, weight_decay = self.config.weight_decay)
        optimizer = AdaptiveGradientClipping(self.model, optimizer, ignore = "Linear")
        self.lr_decay = optim.lr_scheduler.StepLR(optimizer.optim, self.config.num_steps, self.config.step_size)
        self.lr_decay2 = optim.lr_scheduler.CosineAnnealingLR(optimizer.optim, self.config.num_steps, eta_min = self.config.eta_min)
        return [optimizer]
    def configure_model(self):
        '''
        Loads in the Model
        '''
        if self.model_type == 'baseline':
            model = BaseLineNFNet(self.config.num_classes)
        else:
            model = ModifiedNFNet_f0(self.config.num_classes, self.config.device, increase_dropout = self.config.increase_dropout, attention_type = self.config.attention_type, stochastic_depth = self.config.stochastic_depth)
        #model = ArcFaceModule(model, self.config.num_classes)
        return model
    def reset_states(self):
        self.train_loss = 0
        self.train_acc = 0
        self.train_steps = 0
        
        self.val_loss = 0
        self.val_acc = 0
        self.val_steps = 0
        
        self.num_epochs += 1
    def fix_states(self):
        if self.train_steps != 0:
            self.train_loss /= self.train_steps
            self.train_acc /= self.train_steps
        if self.val_steps != 0:
            self.val_loss /= self.val_steps
            self.val_acc /= self.val_steps
        
        self.train_loss = round(self.train_loss, 3)
        self.train_acc = round(self.train_acc, 3)
        
        self.val_loss = round(self.val_loss, 3)
        self.val_acc = round(self.val_acc, 3)
    def accuracy(self, pred, y):
        
        pred = F.softmax(pred)
        _, indices = torch.max(pred, dim = -1)
        B = indices.shape[0]
        return torch.sum((indices == y).int()) / B
    def training_step(self, batch, batch_idx):
        x, y = batch
        x = x.to(self.dev)
        y = y.to(self.dev)
        
        pred = self.model(x)
        loss = self.criterion(pred, y)
        acc = self.accuracy(pred, y)
        print(f"STEP: {batch_idx}, L: {round(loss.item(), 3)}, A: {round(acc.item(), 3)}")
        self.train_loss += loss.item()
        self.train_acc += acc.item()
        self.train_steps += 1
        if batch_idx % 100 == 0:
            self.lr_decay.step()
            self.lr_decay2.step()
        del x, y, acc
        return loss
    def validation_step(self, batch, batch_idx):
        x, y = batch
        x = x.to(self.dev)
        y = y.to(self.dev)
        
        pred = self.model(x)
        loss = self.criterion(pred, y)
        acc = self.accuracy(pred, y)
        
        self.log('val_loss', loss.item())
        self.log('val_acc', acc.item())
        
        self.val_loss += loss.item()
        self.val_acc += acc.item()
        del x, y, pred, loss, acc
    def training_epoch_end(self, _):
        self.lr_decay.step()
        self.lr_decay2.step()
    def save_states(self):
        if self.val_loss <= self.best_val_loss:
            self.best_val_loss = self.val_loss
            torch.save(self.state_dict(), f'./fold_{self.fold_idx}_loss.pth')
        if self.val_acc >= self.best_val_acc:
            self.best_val_acc = self.val_acc
            torch.save(self.state_dict(), f'./fold_{self.fold_idx}_acc.pth')
    def generate_logs(self):
        logs = {}
        logs['loss'] = self.train_loss
        logs['val_loss'] = self.val_loss
        logs['accuracy'] = self.train_acc
        logs['val_accuracy'] = self.val_acc
        self.liveloss.update(logs)
        self.liveloss.send()
    def validation_epoch_end(self, _):
        self.fix_states()
        self.reset_states()

In [69]:
def trainNF(fold_idx):
    '''
    Trains one Fold, since everything is seeded, just tweak the number to change folds.
    '''
    train, val = dataModule.get_both(fold_idx)
    model = NFNetQTPi(NFNetQTPiConfig.device, fold_idx = fold_idx)
    early_stopping = []
    trainer = pl.Trainer(check_val_every_n_epoch = 1, max_epochs = Config.NUM_EPOCHS, checkpoint_callback = False, logger = None, gpus = 1, num_sanity_val_steps = 0, callbacks = early_stopping, benchmark = False, deterministic = True, precision = 16)
    print(f"START OF TRAINING")
    trainer.fit(model, train, val)

In [70]:
pl.seed_everything()
trainNF(0)

START OF TRAINING


Training: 0it [00:00, ?it/s]

STEP: 0, L: 9.196, A: 0.0
STEP: 0, L: 9.918, A: 0.031
STEP: 0, L: 9.775, A: 0.031
STEP: 0, L: 9.817, A: 0.062
STEP: 0, L: 9.937, A: 0.031
STEP: 0, L: 96.299, A: 0.031
STEP: 0, L: 93.462, A: 0.031


In [None]:
import gc
gc.collect()