# Import Dependencies

In [None]:
%%capture
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 
import numpy as np
import pandas as pd
import cv2
import PIL

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

!pip install livelossplot
import livelossplot

!pip install timm
import timm

# Import Ranger Optimizer
%cd ..
!git clone https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer 
%cd Ranger-Deep-Learning-Optimizer
!pip install -e .
%cd ..
%cd working
import sys
sys.path.append("../Ranger-Deep-Learning-Optimizer")
from ranger import Ranger

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

import os
import random

import tqdm.notebook as tqdm
import copy
import albumentations as A
from albumentations.pytorch import ToTensorV2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Reproducibility

In [None]:
def seed_all():
    """Seed every random number generator the notebook touches.

    Seeds Python's ``random``, NumPy and PyTorch (CPU plus CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode with
    benchmarking disabled.
    """
    seed = 42
    # Python-level randomness and the hash seed exported to the environment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch generators: CPU, current CUDA device, then all CUDA devices.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
    # cuDNN flags requested for repeatable runs.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
def seed_worker(worker_id):
    """Seed NumPy and ``random`` from the current PyTorch seed.

    Written to be passed as ``worker_init_fn`` to a DataLoader so each worker
    derives its NumPy / ``random`` state from the seed PyTorch assigned to it.

    worker_id: index supplied by the DataLoader; not used here.
    """
    # NumPy only accepts 32-bit seeds, so fold the 64-bit torch seed down.
    worker_seed = torch.initial_seed() % 2**32
    # The file imports numpy as ``np``; the bare ``numpy`` name is undefined.
    np.random.seed(worker_seed)
    random.seed(worker_seed)
# Seed all RNGs right away so the cells below start from a fixed state.
seed_all()

In [None]:
# HYPERPARAMETERS
# Training labels CSV and the directory the image files are read from.
dataframe_path = "../input/plant-pathology-2021-fgvc8/train.csv"
BASE_PATH = "../input/plant-pathology-2021-fgvc8/train_images/"
# Batch sizes for the training and validation DataLoaders respectively.
BATCH_SIZE = 24
TEST_BATCH_SIZE = 32
# Base dimension for the augmentation pipeline; INPUT_HEIGHT is derived from it below.
INPUT_WIDTH = 320
# Side length of the square the models' down-sampler resizes inputs to.
IMAGE_SIZE = 320
# Number of splits kept out of the K-fold generator.
NUM_FOLDS = 3
# 1.5x the width, i.e. 480.
INPUT_HEIGHT = int(INPUT_WIDTH *1.5)

In [None]:
# Training-time augmentation pipeline: crop, flips, blur/noise, geometric
# distortion, colour changes, compression artefacts, dropout holes and a final
# affine jitter, followed by normalisation and conversion to a tensor.
# NOTE(review): albumentations' RandomResizedCrop / Resize take (height, width)
# positionally, so passing (INPUT_WIDTH, INPUT_HEIGHT) would give height=320 and
# width=480 — confirm the argument order is intended.
# NOTE(review): IAASharpen, Cutout and Flip are deprecated or removed in newer
# albumentations releases — confirm against the installed version.
train_transforms = A.Compose([
    A.RandomResizedCrop(INPUT_WIDTH, INPUT_HEIGHT, scale=(0.6, 0.8), p=1),
    A.Flip(p = 0.7),
    # One blur or noise transform, applied 70% of the time.
    A.OneOf([
        A.MotionBlur(blur_limit=(3, 5)),
        A.MedianBlur(blur_limit=5),
        A.GaussianBlur(blur_limit=(3, 5)),
        A.GaussNoise(var_limit=(5.0, 30.0)),
        A.MultiplicativeNoise(),
    ], p=0.7),
    # One geometric distortion, applied 70% of the time.
    A.OneOf([
        A.OpticalDistortion(distort_limit=1.0),
        A.GridDistortion(num_steps=5, distort_limit=1.),
        A.ElasticTransform(alpha=3),
    ], p=0.7),
    A.CLAHE(clip_limit=4.0, p=0.7),
    A.IAASharpen(p=0.5),
    A.ColorJitter(p = 0.7),
    # One quality-degradation transform, applied 20% of the time.
    A.OneOf([
        A.ImageCompression(),
        A.Downscale(scale_min=0.7, scale_max=0.95),
    ], p=0.2),
    # Grayscale or sepia; no explicit p, so OneOf's default probability applies.
    A.OneOf([
        A.ToGray(),
        A.ToSepia()
    ]),
    # Hole sizes are capped at 10% of the configured height / width.
    A.CoarseDropout(max_holes=8, max_height=int(INPUT_HEIGHT * 0.1),
                       max_width=int(INPUT_WIDTH* 0.1), p=0.5),
    A.Cutout(num_holes = 32),
    A.ShiftScaleRotate(shift_limit=0.2, scale_limit=0.2, rotate_limit=45, border_mode=0, p=0.85),
    A.Normalize(),
    ToTensorV2()
])

# Validation / test pipeline: deterministic resize, normalisation, tensor conversion.
test_transforms = A.Compose([
    A.Resize(INPUT_WIDTH, INPUT_HEIGHT),
    A.Normalize(),
    ToTensorV2()
])

# Process Dataset

In [None]:
# Load the label CSV and index it by the "image" column, so each row's name is
# the image id and the first remaining column holds the label string.
dataFrame = pd.read_csv(dataframe_path)
dataFrame = dataFrame.set_index("image")
def process_dataframe(dataframe):
    '''
    Collect the label vocabulary found in the dataframe.

    dataframe: pandas dataframe whose first column holds whitespace-separated
        label strings (one row per image).
    Returns: sorted list of the unique labels with 'healthy' left out; the
        dataset encodes 'healthy' as an all-zero target instead of a class.
    '''
    labels = set()
    # Read the first column positionally; integer lookups on a label-indexed
    # Series (the old `row[1][0]`) are deprecated in pandas.
    for label_string in dataframe.iloc[:, 0]:
        labels.update(str.split(label_string))
    # discard() rather than remove(): a frame without 'healthy' is not an error.
    labels.discard('healthy')
    return sorted(labels)
# Label vocabulary (without 'healthy') passed to the datasets and used to size the model head.
CLASSES = process_dataframe(dataFrame)

In [None]:
class PlantDataset(torch.utils.data.Dataset):
    """Multi-label image dataset backed by a dataframe of label strings.

    dataframe: frame indexed by image file name whose first column holds
        whitespace-separated labels.
    classes: ordered list of class names ('healthy' is expected to be absent).
    base_path: string prefix joined with the image id to form the file path.
    transforms: callable invoked as ``transforms(image=...)`` that returns a
        dict with an ``'image'`` entry.
    """
    def __init__(self, dataframe, classes, base_path, transforms):
        self.dataframe = dataframe
        self.transforms = transforms
        self.classes = classes
        self.base_path = base_path
        # Internal Dictionaries
        self.idx2class = dict(enumerate(self.classes))
        self.class2idx = {name: idx for idx, name in self.idx2class.items()}
        self.num_classes = len(self.class2idx) # Healthy is Omitted, added at test-time if no other predictions present
    def __len__(self):
        return len(self.dataframe)
    def __getitem__(self, idx):
        """Return ``(image, target)`` where target is a multi-hot float vector.

        Raises FileNotFoundError if the image file cannot be read and KeyError
        if a label is not in ``classes``.
        """
        # Select Image
        row = self.dataframe.iloc[idx]
        img_id = row.name
        # Positional access: integer lookups on a label-indexed Series are deprecated.
        values = str.split(row.iloc[0])
        # Generate Tensor
        target = torch.zeros((self.num_classes))
        for class_val in values:
            if class_val == 'healthy':
                # 'healthy' has no class slot; stop and keep the target as built so far.
                break
            target[self.class2idx[class_val]] = 1
        # Load image in
        img_path = self.base_path + img_id
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        # OpenCV loads BGR; swap to RGB (same channel swap as before, correct constant name).
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self.transforms(image = image)['image']
        # as_tensor avoids re-copying when the transform already produced a tensor.
        return torch.as_tensor(image), target
        

In [None]:
# Build the train/validation splits: the data is shuffled into 70 folds (so each
# validation split is one of 70 parts) and only the first NUM_FOLDS splits are kept.
# The fixed random_state keeps the splits identical between runs.
splitter = KFold(n_splits = 70, shuffle = True, random_state = 42)
KSPLITS = []
count = 0
for train_idx, test_idx in splitter.split(dataFrame):
    train_idx = np.array(train_idx)
    test_idx = np.array(test_idx) 
    
    # Positional row selection for this split.
    train_dataframe = dataFrame.iloc[train_idx]
    test_dataframe = dataFrame.iloc[test_idx]
    KSPLITS += [(train_dataframe, test_dataframe)]
    count += 1
    # Stop early once NUM_FOLDS splits have been collected.
    if count == NUM_FOLDS:
        break

In [None]:
def display_image(image):
    """Plot a tensor after moving its leading axis to the third position.

    For a channel-first (C, H, W) tensor this yields the (H, W, C) layout.
    """
    # Same axis order as transpose(0, 1) followed by transpose(1, 2).
    rearranged = image.cpu().movedim(0, 2)
    plt.imshow(rearranged)
    plt.show()

# Model Creation: Part 0 - Downsampler (No CNN)

Handy Convolutional Blocks

In [None]:
class CustomLoss(nn.Module):
    '''
    Separates BCE loss into 0s and 1s to weight them evenly (1s are sparse).

    The mean BCE over positive targets and the mean BCE over negative targets
    are computed independently and summed. A group with no elements
    contributes 0 instead of the NaN that a mean over nothing would produce
    (e.g. a batch with no positive labels at all).
    '''
    def __init__(self):
        super().__init__()
        self.criterion = nn.BCEWithLogitsLoss()
    def _group_loss(self, pred, mask, target_value):
        '''Mean BCE of the logits selected by `mask` against a constant target.'''
        selected = pred[mask]
        if selected.numel() == 0:
            # Sum of an empty selection is 0 and stays attached to the graph.
            return selected.sum()
        return self.criterion(selected, torch.full_like(selected, target_value))
    def forward(self, pred, y_true):
        '''
        pred: raw logits.
        y_true: tensor of the same shape as pred holding 0/1 targets.
        Returns: scalar tensor, positive-group loss + negative-group loss.
        '''
        one_loss = self._group_loss(pred, y_true == 1, 1.0)
        zero_loss = self._group_loss(pred, y_true == 0, 0.0)
        return one_loss + zero_loss

In [None]:
class StridedConvBlock(nn.Module):
    """Strided Conv2d, then SiLU, then BatchNorm2d (in that order)."""
    def __init__(self, in_features, out_features, kernel_size, padding, groups, stride):
        super().__init__()
        conv_options = dict(kernel_size = kernel_size, padding = padding, groups = groups, stride = stride)
        self.conv = nn.Conv2d(in_features, out_features, **conv_options)
        self.bn = nn.BatchNorm2d(out_features)
        self.act1 = nn.SiLU(inplace = True)
    def forward(self, x):
        # The activation runs before normalisation here.
        activated = self.act1(self.conv(x))
        return self.bn(activated)

In [None]:
class ConvBlock(nn.Module):
    """Conv2d, then SiLU, then BatchNorm2d (in that order)."""
    def __init__(self, in_features, out_features, kernel_size, padding, groups):
        super().__init__()
        conv_options = dict(kernel_size = kernel_size, padding = padding, groups = groups)
        self.conv = nn.Conv2d(in_features, out_features, **conv_options)
        self.bn = nn.BatchNorm2d(out_features)
        self.act1 = nn.SiLU(inplace = True)
    def forward(self, x):
        # The activation runs before normalisation here.
        activated = self.act1(self.conv(x))
        return self.bn(activated)

In [None]:
class CBAMChannel(nn.Module):
    """Channel gate built from average- and max-pooled descriptors.

    Both descriptors pass through the same Squeeze -> SiLU -> Excite layers;
    their sum is passed through a sigmoid and multiplied onto the input.
    """
    def __init__(self, in_dim, inner_dim):
        super().__init__()
        self.in_dim = in_dim
        self.inner_dim = inner_dim

        self.Squeeze = nn.Linear(self.in_dim, self.inner_dim)
        self.Excite = nn.Linear(self.inner_dim, self.in_dim)
        self.act1 = nn.SiLU(inplace = True)
    def _shared_mlp(self, pooled):
        # Shared two-layer projection applied to each pooled descriptor.
        return self.Excite(self.act1(self.Squeeze(pooled)))
    def forward(self, x):
        # Reduce the two trailing axes one at a time.
        avg_pool = x.mean(dim = -1).mean(dim = -1)
        max_pool = x.max(dim = -1).values.max(dim = -1).values

        gate = torch.sigmoid(self._shared_mlp(avg_pool) + self._shared_mlp(max_pool))
        # Restore two trailing singleton axes so the gate broadcasts over x.
        return gate.unsqueeze(-1).unsqueeze(-1) * x

In [None]:
class SqueezeExcite(nn.Module):
    """Channel gate driven by a max-pooled descriptor.

    The maximum over the two trailing axes goes through
    Squeeze -> SiLU -> Excite -> sigmoid and the result scales the input.
    """
    def __init__(self, in_dim, inner_dim):
        super().__init__()
        self.in_dim = in_dim
        self.inner_dim = inner_dim

        self.Squeeze = nn.Linear(self.in_dim, self.inner_dim)
        self.Excite = nn.Linear(self.inner_dim, self.in_dim)
        self.act1 = nn.SiLU(inplace = True)
    def forward(self, x):
        # Reduce the two trailing axes one at a time, keeping only the values.
        pooled = x.max(dim = -1).values.max(dim = -1).values

        hidden = self.act1(self.Squeeze(pooled))
        gate = torch.sigmoid(self.Excite(hidden))
        # Restore two trailing singleton axes so the gate broadcasts over x.
        return gate.unsqueeze(-1).unsqueeze(-1) * x
    

In [None]:
class BottleNeck(nn.Module):
    """Residual bottleneck: 1x1 squeeze, 3x3 process, 1x1 expand, then SE gate.

    The branch output is scaled by a learnable scalar ``gamma`` (initialised
    to zero) before being added to the input. While training, the whole
    branch is skipped with probability ``stochastic_depth``.
    """
    def __init__(self, in_features, inner_features, device, stochastic_depth = 0.2):
        super().__init__()
        self.stochastic_depth = stochastic_depth
        self.in_features = in_features
        self.inner_features = inner_features
        self.device = device

        self.Squeeze = ConvBlock(self.in_features, self.inner_features, 1, 0, 1)
        self.Process = ConvBlock(self.inner_features, self.inner_features, 3, 1, 1)
        self.Expand = ConvBlock(self.inner_features, self.in_features, 1, 0, 1)
        self.SE = SqueezeExcite(self.in_features, self.in_features // 16)

        # Zero-initialised residual scale, created directly on the target device.
        self.gamma = nn.Parameter(torch.zeros((1), device = self.device))
    def forward(self, x):
        # Short-circuit keeps the RNG untouched in eval mode.
        drop_branch = self.training and random.random() < self.stochastic_depth
        if drop_branch:
            return x
        branch = self.SE(self.Expand(self.Process(self.Squeeze(x))))
        return self.gamma * branch + x

In [None]:
class InvertedBottleNeck(nn.Module):
    """Inverted residual: 1x1 expand, 3x3 depthwise, SE gate, 1x1 squeeze.

    The branch output is scaled by a learnable scalar ``gamma`` (initialised
    to zero) before being added to the input. While training, the whole
    branch is skipped with probability ``stochastic_depth``.
    """
    def __init__(self, in_dim, inner_dim, device, stochastic_depth = 0.2):
        super().__init__()
        self.stochastic_depth = stochastic_depth
        self.device = device
        self.in_dim = in_dim
        self.inner_dim = inner_dim

        self.expand = ConvBlock(self.in_dim, self.inner_dim, 1, 0, 1)
        # groups == channels makes this convolution depthwise.
        self.depthwise = ConvBlock(self.inner_dim, self.inner_dim, 3, 1, self.inner_dim)
        self.SE = SqueezeExcite(self.inner_dim, self.inner_dim // 16)
        self.squeeze = ConvBlock(self.inner_dim, self.in_dim, 1, 0, 1)

        # Zero-initialised residual scale, created directly on the target device.
        self.gamma = nn.Parameter(torch.zeros((1), device = self.device))
    def forward(self, x):
        # Short-circuit keeps the RNG untouched in eval mode.
        drop_branch = self.training and random.random() < self.stochastic_depth
        if drop_branch:
            return x
        branch = self.squeeze(self.SE(self.depthwise(self.expand(x))))
        return self.gamma * branch + x

In [None]:
class RegDownSampler(nn.Module):
    '''
    Parameter-free resize to a square of side out_size
    '''
    def __init__(self, out_size, device):
        super().__init__()
        self.device = device
        self.out_size = out_size
    def forward(self, x):
        '''
        Resize x to (out_size, out_size) with bilinear interpolation
        '''
        target_hw = (self.out_size, self.out_size)
        return F.interpolate(x, size = target_hw, mode = 'bilinear')

In [None]:
class CNNDownSampler(nn.Module):
    '''
    Learned down-sampler: a bilinear resize plus a convolutional correction.

    The correction branch runs 1x1 ConvBlocks at the input resolution, resizes,
    runs more 1x1 ConvBlocks, projects back to 3 channels and is scaled by a
    zero-initialised gamma before being added to the plain resize.
    '''
    def __init__(self, out_size, device):
        super().__init__()
        self.device = device
        self.out_size = out_size
        self.initialCNN = nn.Sequential(
            ConvBlock(3, 16, 1, 0, 1),
            ConvBlock(16, 16, 1, 0, 1),
        )

        self.post_processing = nn.Sequential(
            ConvBlock(16, 16, 1, 0, 1),
            ConvBlock(16, 16, 1, 0, 1),
        )
        self.proj = ConvBlock(16, 3, 1, 0, 1)
        self.gamma = nn.Parameter(torch.zeros((1), device = self.device))
    def forward(self, x):
        target_hw = (self.out_size, self.out_size)
        # Plain resize of the raw input, used as the residual base.
        resized = F.interpolate(x, size = target_hw, mode = 'bilinear')
        # Correction branch: features at input resolution, resized, refined, projected.
        features = self.initialCNN(x)
        features = F.interpolate(features, size = target_hw, mode = 'bilinear')
        correction = self.proj(self.post_processing(features))
        return correction * self.gamma + resized

Baseline Model: MobileNetV3

In [None]:
class MobileNetv3(nn.Module):
    '''
    BaseLine MobileNet v3 Based Model

    Wraps the timm `mobilenetv3_large_100` backbone: a CNNDownSampler feeds the
    pretrained stem and blocks 0-5, SqueezeExcite gates are inserted after
    blocks 2, 4 and 5, and two custom stages (layer4, layer5) lead into a
    pooled linear head producing `num_classes` logits.
    '''
    def freeze(self, layer):
        '''Disable gradients for every parameter of `layer`.'''
        for parameter in layer.parameters():
            parameter.requires_grad = False
    def __init__(self, out_size, num_classes, device, dropout = 0.2, stochastic_depth = 0.2):
        super().__init__()
        self.drop_prob = dropout
        self.stochastic_depth = stochastic_depth
        self.model_name = 'mobilenetv3_large_100'
        self.model = timm.create_model(self.model_name, pretrained = True)
        
        self.out_size = out_size
        self.num_classes = num_classes
        self.device = device
        self.downsampler = CNNDownSampler(self.out_size, self.device) 
        
        # Pretrained stem, re-exposed so forward() can call the pieces directly.
        self.conv1 = self.model.conv_stem
        self.bn1 = self.model.bn1
        self.act1 = self.model.act1
        
        self.block0 = self.model.blocks[0] # (16)
        self.block1 = self.model.blocks[1] # (24)
        self.block2 = self.model.blocks[2] 
        self.block3 = self.model.blocks[3]
        self.block4 = self.model.blocks[4]
        self.block5 = self.model.blocks[5]
        self.block6 = self.model.blocks[6] # We won't be using this block, it's very expensive
        
        # Pretrained stages are left trainable; call self.freeze(<stage>) on
        # conv1 / bn1 / block0..block5 here to freeze them instead.
        
        # Custom Layers
        self.Attention1 = SqueezeExcite(40, 16)
        self.Attention2 = SqueezeExcite(112, 32)
        self.Attention3 = SqueezeExcite(160, 48)
        self.layer4 = nn.Sequential(*[
            ConvBlock(160, 320, 1, 0, 1)
        ] + [
            BottleNeck(320, 64, self.device, stochastic_depth = self.stochastic_depth) for i in range(5)
        ])
        
        self.Attention4 = SqueezeExcite(320, 96)
        self.layer5 = nn.Sequential(*[
            StridedConvBlock(320, 512, 1, 0, 1, 2)
        ] + [
            BottleNeck(512, 128, self.device, stochastic_depth = self.stochastic_depth) for i in range(3)
        ])
        
        self.global_avg = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(self.drop_prob)
        self.Linear = nn.Linear(512, self.num_classes)
    def forward(self, x):
        '''
        x: image batch; presumably (B, 3, H, W) — TODO confirm against the dataset.
        Returns: logits of shape (B, num_classes), including when B == 1.
        '''
        downsampled = self.downsampler(x)
        # NOTE(review): the stem is applied as conv -> act -> bn, matching the
        # custom ConvBlock ordering; confirm this is intended for pretrained weights.
        features0 = self.bn1(self.act1(self.conv1(downsampled)))
        # Trailing comments are the author's (channels, spatial size) annotations.
        block0 = self.block0(features0) # (16, 160)
        block1 = self.block1(block0) # (24, 80)
        block2 = self.block2(block1) # (40, 40)
        block2 = self.Attention1(block2)
        
        block3 = self.block3(block2) # (80, 20)
        block4 = self.block4(block3) # (112, 20)
        block4 = self.Attention2(block4)
        
        block5 = self.block5(block4) # (160, 10)
        block5 = self.Attention3(block5)
        
        layer4 = self.layer4(block5) 
        layer4 = self.Attention4(layer4) # (320, 10)
        
        layer5 = self.layer5(layer4) # (512, 5) 
        
        # flatten(1) keeps the batch axis; squeeze() would also drop it when B == 1.
        avg_pool = torch.flatten(self.global_avg(layer5), 1)
        avg_pool = self.dropout(avg_pool)
        return self.Linear(avg_pool)

In [None]:
class MobileSolverQTPi(nn.Module):
    '''
    Training wrapper around the MobileNetv3 model.

    Owns the model, the Ranger optimizer, two learning-rate schedulers and the
    loss, and provides metric helpers plus a training loop that checkpoints
    the best-validation-loss, best-validation-accuracy and final weights.
    '''
    def __init__(self, num_classes, device):
        super().__init__()
        self.out_size = IMAGE_SIZE
        self.num_classes = num_classes
        self.device = device
        self.dropout = 0.5
        self.stochastic_depth = 0.0
        self.model = MobileNetv3(self.out_size, self.num_classes, self.device, dropout = self.dropout, stochastic_depth = self.stochastic_depth)
        self.optim = Ranger(self.model.parameters(), lr = 1e-3, weight_decay = 1e-2)
        # Both schedulers drive the same optimizer and are stepped once per epoch.
        self.lr_decay = optim.lr_scheduler.StepLR(self.optim, 5, 0.95)
        self.lr_decay2 = optim.lr_scheduler.CosineAnnealingLR(self.optim, 5, eta_min = 1e-7)
        self.criterion = CustomLoss()
    def forward(self, x):
        '''Inference pass: switches the module to eval mode and disables gradients.'''
        self.eval()
        with torch.no_grad():
            return self.model(x)
    def threshold(self, pred):
        '''
        Binarise probabilities in place (>= 0.5 becomes 1, otherwise 0) and
        return the same tensor. Works for tensors of any rank.
        '''
        positives = pred >= 0.5
        pred[...] = 0
        pred[positives] = 1
        return pred
    def f_score(self, pred, y_true):
        '''
        Computes the weighted F1 score given logits (not yet sigmoided).
        Predictions are reshaped to y_true's shape so a single-sample batch
        keeps its batch axis.
        '''
        pred = torch.sigmoid(pred).reshape(y_true.shape)
        pred_labels = self.threshold(pred)
        return f1_score(y_true, pred_labels, average= 'weighted')
    def accuracy(self, pred, labels):
        '''
        Element-wise accuracy of thresholded sigmoid(pred) against labels,
        returned as a scalar tensor. Predictions are reshaped to labels'
        shape so a single-sample batch does not lose its batch axis.
        '''
        # detach(): the metric must not extend or modify the autograd graph.
        probs = torch.sigmoid(pred.detach()).reshape(labels.shape)
        hard = self.threshold(probs)
        return (hard == labels).float().mean()
    def training_loop(self, trainloader, valloader, NUM_EPOCHS, display_every = 16):
        '''
        Train for NUM_EPOCHS epochs, validating after each one.

        trainloader / valloader: iterables yielding (images, labels) batches.
        display_every: number of training steps run per epoch before moving on
            to validation.
        Writes ./BestValLoss.pth, ./BestValAcc.pth and ./FinalModel.pth.
        '''
        liveloss = livelossplot.PlotLosses()
        bestValAcc = 0
        bestValLoss = 999
        torch.cuda.empty_cache()
        for EPOCH in range(NUM_EPOCHS):
            self.train()
            logs = {'loss': 0, 'accuracy': 0, 'f_score': 0}
            count = 0
            for images, labels in trainloader:
                self.optim.zero_grad()
                images = images.to(self.device)
                labels = labels.to(self.device)
                
                pred = self.model(images)
                loss = self.criterion(pred, labels)
                
                loss.backward()
                self.optim.step() 
                
                logs['loss'] += loss.item()
                accuracy = self.accuracy(pred, labels).item()
                logs['accuracy'] += accuracy
                
                logs['f_score'] += self.f_score(pred.cpu().detach(), labels.cpu())
                print(f"Step: {count}, L: {round(loss.item(), 3)}, A: {round(accuracy, 3)}")
                # Drop references before clearing the CUDA cache.
                del images, labels
                del pred, loss, accuracy
                torch.cuda.empty_cache() 
                count += 1
                # An "epoch" here is capped at display_every optimisation steps.
                if count == display_every:
                    break
            # max(..., 1) avoids a ZeroDivisionError on an empty loader.
            steps = max(count, 1)
            logs['loss'] = round(logs['loss'] / steps, 3)
            logs['accuracy'] = round(logs['accuracy'] / steps, 3)
            logs['f_score'] = round(logs['f_score'] / steps, 3)
            self.eval()
            self.lr_decay.step()
            self.lr_decay2.step()
            with torch.no_grad():
                logs['val_loss'] = 0
                logs['val_accuracy'] = 0
                logs['val_f_score'] = 0
                count = 0
                for images, labels in valloader:
                    images = images.to(self.device)
                    labels = labels.to(self.device)
                    
                    pred = self.model(images)
                    loss = self.criterion(pred, labels).item()
                    accuracy = self.accuracy(pred, labels).item()
                    
                    logs['val_loss'] += loss
                    logs['val_accuracy'] += accuracy
                    logs['val_f_score'] += self.f_score(pred.cpu(), labels.cpu())
                    count += 1
                    del images, labels
                    del pred, loss, accuracy
                    torch.cuda.empty_cache()
                steps = max(count, 1)
                logs['val_loss'] = round(logs['val_loss'] / steps, 3)
                logs['val_accuracy'] = round(logs['val_accuracy'] / steps, 3)
                logs['val_f_score'] = round(logs['val_f_score'] / steps, 3)

            liveloss.update(logs)
            liveloss.send()
            print(f"E: {EPOCH}, L: {logs['loss']}, A: {logs['accuracy']}, F: {logs['f_score']} VL: {logs['val_loss']}, VA: {logs['val_accuracy']} VF: {logs['val_f_score']}")
            # Ties overwrite the checkpoint, so the latest equally-good epoch wins.
            if logs['val_loss'] <= bestValLoss:
                bestValLoss= logs['val_loss']
                torch.save(self.state_dict(), "./BestValLoss.pth")
            if logs['val_accuracy'] >= bestValAcc:
                bestValAcc = logs['val_accuracy']
                torch.save(self.state_dict(), "./BestValAcc.pth")
        torch.save(self.state_dict(), "./FinalModel.pth")

# Stage 2: Modified ResNet Blocks with MultiFold Training

In [None]:
class ModifiedEffNetQT(nn.Module):
    '''
    EfficientNet B4 Variant of the Same Model

    Wraps the timm `tf_efficientnet_b4_ns` backbone: a CNNDownSampler feeds the
    pretrained stem and blocks 0-6, SqueezeExcite gates are inserted after
    blocks 2, 4 and 6, and a custom strided stage plus 1x1 projection lead into
    a pooled linear head producing `num_classes` logits.
    '''
    def freeze(self, layer):
        '''Disable gradients for every parameter of `layer`.'''
        for parameter in layer.parameters():
            parameter.requires_grad = False
    def unfreeze(self, layer):
        '''Enable gradients for every parameter of `layer`.'''
        for parameter in layer.parameters():
            parameter.requires_grad = True
    def __init__(self, out_size, num_classes, device, dropout = 0.2, stochastic_depth = 0.2):
        super().__init__()
        self.model_name = "tf_efficientnet_b4_ns"
        self.model = timm.create_model(self.model_name, pretrained = True)
        
        self.out_size = out_size
        self.num_classes = num_classes
        self.device = device
        self.drop_prob = dropout
        self.stochastic_depth = stochastic_depth
        
        self.downsampler = CNNDownSampler(self.out_size, self.device)
        
        # Pretrained stem, re-exposed so forward() can call the pieces directly.
        self.conv1 = self.model.conv_stem
        self.bn1 = self.model.bn1
        self.act1 = self.model.act1
    
        self.block0 = self.model.blocks[0]
        self.block1 = self.model.blocks[1]
        self.block2 = self.model.blocks[2]
        self.block3 = self.model.blocks[3]
        self.block4 = self.model.blocks[4]
        self.block5 = self.model.blocks[5]
        self.block6 = self.model.blocks[6]
       
        # Pretrained stages are left trainable. If self.downsampler is replaced
        # with nn.Identity, self.freeze(...) can be applied to conv1, bn1 and
        # block0..block2 here.
        
        self.Attention1 = SqueezeExcite(56, 16)
        self.Attention2 = SqueezeExcite(160, 32)
        self.Attention3 = SqueezeExcite(448, 128)
        
        self.layer5 = nn.Sequential(*[
            StridedConvBlock(448, 768, 1, 0, 1, 2)
        ] + [
           InvertedBottleNeck(768, 1536, self.device, stochastic_depth = self.stochastic_depth) for i in range(3)
        ])
        self.proj = ConvBlock(768, 2048, 1, 0, 1)
        
        self.global_avg = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(self.drop_prob)
        self.Linear = nn.Linear(2048, self.num_classes)
    def forward(self, x):
        '''
        x: image batch; presumably (B, 3, H, W) — TODO confirm against the dataset.
        Returns: logits of shape (B, num_classes), including when B == 1.
        '''
        downsampled = self.downsampler(x)
        # NOTE(review): the stem is applied as conv -> act -> bn, matching the
        # custom ConvBlock ordering; confirm this is intended for pretrained weights.
        # Trailing shape comments are the author's annotations.
        features0 = self.bn1(self.act1(self.conv1(downsampled))) # (B, 48, 160, 160)
        
        block0 = self.block0(features0)
        block1 = self.block1(block0)
        block2 = self.block2(block1) # (B, 56, 40, 40)
        # Attention1
        block2 = self.Attention1(block2)
        
        block3 = self.block3(block2)
        block4 = self.block4(block3) # (B, 160, 20, 20)
        # Attention2
        block4 = self.Attention2(block4)
        
        block5 = self.block5(block4)
        block6 = self.block6(block5)
        # attention 3
        block6 = self.Attention3(block6) # (B, 448, 10, 10)
        
        layer5 = self.layer5(block6)
        layer5 = self.proj(layer5)
        # flatten(1) keeps the batch axis; squeeze() would also drop it when B == 1.
        global_avg = torch.flatten(self.global_avg(layer5), 1)
        dropped = self.dropout(global_avg)
        return self.Linear(dropped)
        

In [None]:
class ModifiedResNetQT(nn.Module):
    '''
    Modified ResNet (timm `resnet101d` backbone)

    A CNNDownSampler feeds the pretrained stem and layer1-layer3; the
    pretrained layer4 is replaced by custom stages (layer4, layer5) with
    SqueezeExcite gates after layer2, layer3 and the custom layer4, followed
    by a pooled linear head producing `num_classes` logits.
    '''
    def freeze(self, layer):
        '''Disable gradients for every parameter of `layer`.'''
        for parameter in layer.parameters():
            parameter.requires_grad = False
    def unfreeze(self, layer):
        '''Enable gradients for every parameter of `layer`.'''
        for parameter in layer.parameters():
            parameter.requires_grad = True
    def __init__(self, out_size, num_classes, device, dropout = 0.2, stochastic_depth = 0.2):
        super().__init__()
        self.device = device
        self.num_classes = num_classes
        self.drop_prob = dropout 
        self.out_size = out_size
        self.stochastic_depth = stochastic_depth
        self.model_name = 'resnet101d'
        self.model = timm.create_model(self.model_name, pretrained = True)
        
        self.downsampler = CNNDownSampler(self.out_size, self.device) 
        
        # Pretrained stem, re-exposed so forward() can call the pieces directly.
        self.conv1 = self.model.conv1
        self.bn1 = self.model.bn1
        self.act1 = self.model.act1
        self.pool = self.model.maxpool
        
        self.layer1 = self.model.layer1
        self.layer2 = self.model.layer2
        self.layer3 = self.model.layer3
        # Placeholder only: this attribute is overwritten by the custom stage
        # below. The assignment is kept so the submodule registration order
        # (and therefore parameter ordering) is unchanged.
        self.layer4 = self.model.layer4
        
        # Pretrained stages are left trainable; call self.freeze(<stage>) on
        # conv1 / bn1 / layer1 / layer2 here to freeze them instead.
        
        # Custom Layer
        self.Attention2 = SqueezeExcite(512, 128)
        self.Attention3 = SqueezeExcite(1024, 256)
        self.Attention4 = SqueezeExcite(1536, 512)
        
        self.layer4 = nn.Sequential(*[
            StridedConvBlock(1024, 1536, 1, 0, 1, 2)
        ] + [
            BottleNeck(1536, 512, self.device, stochastic_depth = self.stochastic_depth) for i in range(5)
        ])
        self.layer5 = nn.Sequential(*[
            StridedConvBlock(1536, 2048, 1, 0, 1, 2)
        ] + [
            BottleNeck(2048, 512, self.device, stochastic_depth = self.stochastic_depth) for i in range(2)
        ])
        self.global_avg = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(self.drop_prob)
        self.Linear = nn.Linear(2048, self.num_classes)
    def forward(self, x):
        '''
        x: Tensor(B, 3, H, W)
        Returns: logits of shape (B, num_classes), including when B == 1.
        '''
        downsampled = self.downsampler(x) # (B, 3, out_size, out_size)
        # NOTE(review): the stem is applied as conv -> act -> bn before pooling;
        # confirm this ordering is intended for the pretrained weights.
        # The shape comment is the author's annotation.
        features0 = self.pool(self.bn1(self.act1(self.conv1(downsampled)))) # (B, 64, 160, 160)
        
        layer1 = self.layer1(features0)
        layer2 = self.layer2(layer1)
        # Attention2
        layer2 = self.Attention2(layer2)
        
        layer3 = self.layer3(layer2)
        # Attention 3
        layer3 = self.Attention3(layer3)
        
        layer4 = self.layer4(layer3)
        layer4 = self.Attention4(layer4)
        
        layer5 = self.layer5(layer4)
        
        # flatten(1) keeps the batch axis; squeeze() would also drop it when B == 1.
        global_avg = torch.flatten(self.global_avg(layer5), 1)
        dropped = self.dropout(global_avg)
        return self.Linear(dropped)

In [None]:
class MultiFoldQTPi(nn.Module):
    '''
    Per-fold training wrapper around the ModifiedEffNetQT model.

    Owns the model, the Ranger optimizer, two learning-rate schedulers and the
    loss, and provides metric helpers plus a training loop that checkpoints
    the best-validation-F1 and final weights for a given fold.
    '''
    def __init__(self, num_classes, device):
        super().__init__()
        self.out_size = IMAGE_SIZE
        self.num_classes = num_classes
        self.device = device
        self.dropout = 0.5
        self.stochastic_depth = 0.0
        self.model = ModifiedEffNetQT(self.out_size, self.num_classes, self.device, dropout = self.dropout, stochastic_depth = self.stochastic_depth)
        self.optim = Ranger(self.model.parameters(), lr = 1e-3, weight_decay = 1e-2)
        # Both schedulers drive the same optimizer and are stepped once per epoch.
        self.lr_decay = optim.lr_scheduler.StepLR(self.optim, 5, 0.95)
        self.lr_decay2 = optim.lr_scheduler.CosineAnnealingLR(self.optim, 5, eta_min = 1e-7)
        self.criterion = CustomLoss()
    def forward(self, x):
        '''Inference pass: switches the module to eval mode and disables gradients.'''
        self.eval()
        with torch.no_grad():
            return self.model(x)
    def threshold(self, pred):
        '''
        Binarise probabilities in place (>= 0.5 becomes 1, otherwise 0) and
        return the same tensor. Works for tensors of any rank.
        '''
        positives = pred >= 0.5
        pred[...] = 0
        pred[positives] = 1
        return pred
    def f_score(self, pred, y_true):
        '''
        Computes the weighted F1 score given logits (not yet sigmoided).
        Predictions are reshaped to y_true's shape so a single-sample batch
        keeps its batch axis.
        '''
        pred = torch.sigmoid(pred).reshape(y_true.shape)
        pred_labels = self.threshold(pred)
        return f1_score(y_true, pred_labels, average= 'weighted')
    def accuracy(self, pred, labels):
        '''
        Element-wise accuracy of thresholded sigmoid(pred) against labels,
        returned as a scalar tensor. Predictions are reshaped to labels'
        shape so a single-sample batch does not lose its batch axis.
        '''
        # detach(): the metric must not extend or modify the autograd graph.
        probs = torch.sigmoid(pred.detach()).reshape(labels.shape)
        hard = self.threshold(probs)
        return (hard == labels).float().mean()
    def training_loop(self, trainloader, valloader, NUM_EPOCHS, fold_num = 0, display_every = 16):
        '''
        Train for NUM_EPOCHS epochs, validating after each one.

        trainloader / valloader: iterables yielding (images, labels) batches.
        fold_num: index used in the checkpoint file names.
        display_every: number of training steps run per epoch before moving on
            to validation.
        Writes ./fold_{fold_num}_F.pth (best validation F1) and
        ./fold_{fold_num}_Final.pth.
        '''
        liveloss = livelossplot.PlotLosses()
        bestValAcc = 0
        bestValLoss = 999
        bestValF = 0
        torch.cuda.empty_cache()
        for EPOCH in range(NUM_EPOCHS):
            self.train()
            logs = {'loss': 0, 'accuracy': 0, 'f_score': 0}
            count = 0
            for images, labels in trainloader:
                self.optim.zero_grad()
                images = images.to(self.device)
                labels = labels.to(self.device)
                
                pred = self.model(images)
                loss = self.criterion(pred, labels)
                
                loss.backward()
                self.optim.step() 
                
                logs['loss'] += loss.item()
                accuracy = self.accuracy(pred, labels).item()
                logs['accuracy'] += accuracy
                
                logs['f_score'] += self.f_score(pred.cpu().detach(), labels.cpu())
                print(f"Step: {count}, L: {round(loss.item(), 3)}, A: {round(accuracy, 3)}")
                # Drop references before clearing the CUDA cache.
                del images, labels
                del pred, loss, accuracy
                torch.cuda.empty_cache() 
                count += 1
                # An "epoch" here is capped at display_every optimisation steps.
                if count == display_every:
                    break
            # max(..., 1) avoids a ZeroDivisionError on an empty loader.
            steps = max(count, 1)
            logs['loss'] = round(logs['loss'] / steps, 3)
            logs['accuracy'] = round(logs['accuracy'] / steps, 3)
            logs['f_score'] = round(logs['f_score'] / steps, 3)
            self.eval()
            self.lr_decay.step()
            self.lr_decay2.step()
            with torch.no_grad():
                logs['val_loss'] = 0
                logs['val_accuracy'] = 0
                logs['val_f_score'] = 0
                count = 0
                for images, labels in valloader:
                    images = images.to(self.device)
                    labels = labels.to(self.device)
                    
                    pred = self.model(images)
                    loss = self.criterion(pred, labels).item()
                    accuracy = self.accuracy(pred, labels).item()
                    
                    logs['val_loss'] += loss
                    logs['val_accuracy'] += accuracy
                    logs['val_f_score'] += self.f_score(pred.cpu(), labels.cpu())
                    count += 1
                    del images, labels
                    del pred, loss, accuracy
                    torch.cuda.empty_cache()
                steps = max(count, 1)
                logs['val_loss'] = round(logs['val_loss'] / steps, 3)
                logs['val_accuracy'] = round(logs['val_accuracy'] / steps, 3)
                logs['val_f_score'] = round(logs['val_f_score'] / steps, 3)

            liveloss.update(logs)
            liveloss.send()
            
            # Only the best-F1 weights are checkpointed; best loss and accuracy
            # are tracked for the progress line below. Ties overwrite.
            if logs['val_f_score'] >= bestValF:
                bestValF = logs['val_f_score']
                torch.save(self.state_dict(), f"./fold_{fold_num}_F.pth")
            if logs['val_loss'] <= bestValLoss:
                bestValLoss= logs['val_loss']
            if logs['val_accuracy'] >= bestValAcc:
                bestValAcc = logs['val_accuracy']
            print(f"E: {EPOCH}, BF: {bestValF} BA: {bestValAcc} BL: {bestValLoss} L: {logs['loss']}, A: {logs['accuracy']}, F: {logs['f_score']} VL: {logs['val_loss']}, VA: {logs['val_accuracy']} VF: {logs['val_f_score']}")

        torch.save(self.state_dict(), f"./fold_{fold_num}_Final.pth")

In [None]:
def train_folds(KSPLITS, NUM_EPOCHS, display_every = 16, load_prev = None):
    '''
    Trains one MultiFoldQTPi model per (train, val) split.

    KSPLITS: iterable of (train_dataframe, val_dataframe) pairs.
    NUM_EPOCHS: epochs to run for each split.
    display_every: training steps per epoch, forwarded to the training loop.
    load_prev: optional string prefix; when given, weights are loaded from
        load_prev + 'fold_{idx}_Final.pth' before training resumes. Splits
        should be identical between runs due to seeding.

    Checkpoints are written by the training loop under the fold index `idx`,
    which is the position of the split within KSPLITS as passed in.
    '''
    for idx, (train, val) in enumerate(KSPLITS):
        # Create Datasets (each gets its own copy of the class list)
        train_dataset = PlantDataset(train, copy.deepcopy(CLASSES), BASE_PATH, train_transforms)
        val_dataset = PlantDataset(val, copy.deepcopy(CLASSES), BASE_PATH, test_transforms)
        # Create Dataloaders
        # NOTE(review): no num_workers is passed, so worker_init_fn presumably
        # never runs with the default single-process loading — confirm.
        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True, worker_init_fn = seed_worker)
        val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size = TEST_BATCH_SIZE, shuffle = False, worker_init_fn = seed_worker)
        # Create Model
        model = MultiFoldQTPi(len(CLASSES), device)
        model.to(device)
        if load_prev is not None:
            # Load in Prev Fold Values
            path = load_prev + f'fold_{idx}_Final.pth'
            model.load_state_dict(torch.load(path, map_location = device))

        # Train Fold
        model.training_loop(train_dataloader, val_dataloader, NUM_EPOCHS, fold_num = idx, display_every = display_every)
        

In [None]:
# Epochs per fold for this run.
NUM_EPOCHS = 50
DISPLAY_EVERY = 64 # Train each fold at each commit(Fully Train Each One Per Commit, Splits are identical each time with seeding)
# Resume training on the third split only, starting from previously saved weights.
# NOTE(review): train_folds enumerates the list it receives, so this single
# split is handled as fold index 0 — it loads 'fold_0_Final.pth' from load_prev
# and saves 'fold_0_*' files. Confirm that is the intended checkpoint for KSPLITS[2].
train_folds([KSPLITS[2]], NUM_EPOCHS, display_every = DISPLAY_EVERY, load_prev = '../input/halftrained/')