### Classification notebook for Kaggle RSNA competition

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import gc
import random
from glob import glob
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold,StratifiedGroupKFold
import warnings
import pickle
import json
import re
import time
import sys
from requests import get
import multiprocessing
import joblib
import torch
from torch.utils.data import Dataset, DataLoader
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import timm
from sklearn.preprocessing import minmax_scale
import matplotlib.pyplot as plt
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2,torchvision
from ipyexperiments.ipyexperiments import IPyExperimentsPytorch
from timm.optim.optim_factory import create_optimizer_v2
from timm import utils
from fastprogress.fastprogress import format_time
from fastai.vision.all import *
from torch.utils.data import WeightedRandomSampler
from sklearn.metrics import roc_auc_score

class CFG:
    """Static run configuration shared by all notebook cells."""

    # Reproducibility and cross-validation.
    seed = 46
    n_splits = 4

    # Data: square side length images are resized to, and the batch size.
    SZ = 1024
    BS = 32

    # Backbone and optimisation settings.
    MODEL = 'tf_efficientnet_b4_ns'
    EP = 12
    LR = 4e-04
    WD = 1e-08

    # When True the training frame is subsampled for a quick dry run.
    debug = False

# Seed Python's and NumPy's global RNGs from the shared config value.
random.seed(CFG.seed)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes — confirm.
os.environ["PYTHONHASHSEED"] = str(CFG.seed)
np.random.seed(CFG.seed)
# Global plot font size; all warnings are silenced for the rest of the session.
plt.rcParams["font.size"] = 13
warnings.filterwarnings('ignore')

In [None]:
# `set_seed` is not defined in this file; presumably it comes from the
# `fastai.vision.all` star import — TODO confirm which RNGs it covers.
set_seed(CFG.seed)

In [None]:
# Project root, the directory of pre-converted PNG images, and the data directory.
root_dir = '///mnt/c/Personal/Competitions/Kaggle/rsna'
image_dir = f'{root_dir}/data/8bit'
DIR = '///mnt/c/Personal/Competitions/Kaggle/rsna/data/'
# Competition tables: sample submission, training metadata, test metadata.
submit = pd.read_csv(os.path.join(DIR,'sample_submission.csv'))
train = pd.read_csv(os.path.join(DIR,'Train.csv'))
test_df = pd.read_csv(os.path.join(DIR,'Test.csv'))

# Debug mode keeps a 1% random sample of the training rows.
if CFG.debug:
    train = train.sample(frac=0.01).reset_index(drop=True)
    
# Checkpoints are written under <root_dir>/runs/<VERSION>/.
VERSION = "Baseline"
MODEL_FOLDER = Path(f"{root_dir}/runs/{VERSION}/")
os.makedirs(MODEL_FOLDER,exist_ok=True)
# Checkpoint file-name prefix built from the hyper-parameters ('-' is stripped
# from the LR/WD strings).
KERNEL_TYPE = f"{CFG.MODEL}_{CFG.SZ}_bs{CFG.BS}_ep{CFG.EP}_lr{str(CFG.LR).replace('-','')}_wd{str(CFG.WD).replace('-','')}"

print(MODEL_FOLDER)
print(KERNEL_TYPE)

### Get kfolds

In [None]:
# Stratify on the `cancer` label while keeping every patient's rows in a
# single fold; each row's `fold` column holds the fold it validates in.
mskf = StratifiedGroupKFold(n_splits=CFG.n_splits, shuffle=True, random_state=121)
fold_ids = []
train['fold'] = 0

fold_splits = mskf.split(train, train['cancer'].values, train['patient_id'].values)
for fld, (train_index, test_index) in enumerate(fold_splits):
    fold_ids.append(test_index)
    valIx = fold_ids[fld]
    train.loc[valIx, 'fold'] = fld

#### Data loader

In [None]:
def read_data(d):
    """Open the PNG for metadata row `d` and return it converted to RGB.

    The file is looked up in `image_dir` as `<patient_id>_<image_id>.png`.
    """
    file_name = f'{d.patient_id}_{d.image_id}.png'
    png_path = os.path.join(image_dir, file_name)
    return Image.open(png_path).convert('RGB')

class RsnaDataset(Dataset):
    """Dataset yielding one image per metadata row.

    Args:
        df: Metadata frame with `patient_id` and `image_id` columns, plus a
            `cancer` column unless `mode == 'test'`.
        augs: Optional callable invoked as `augs(image=array)['image']`.
        mode: `'test'` yields `(image, patient_id)`; any other value yields
            `(image, cancer, patient_id)`.
    """

    def __init__(self, df, augs=None,mode='train'):
        # Work on a private copy so the caller's frame does not silently gain
        # the positional helper column `i`.
        df = df.copy()
        df['i'] = np.arange(len(df))
        self.length = len(df)
        self.df = df
        self.augs = augs
        self.mode = mode

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        d = self.df.iloc[index]
        image = np.array(read_data(d))
        if self.augs is not None:
            image = self.augs(image=image)['image']
        patient_id = d.patient_id
        # Return before touching the label: test frames carry no `cancer` column.
        if self.mode == 'test':
            return image, patient_id

        cancer = torch.tensor(d.cancer).float()
        return image, cancer, patient_id

In [None]:
def worker_init_fn(worker_id):
    """
    Handles PyTorch x Numpy seeding issues.

    Re-seeds NumPy's global RNG with the first word of the current state
    offset by the worker id, so each worker draws a different stream.

    Args:
        worker_id (int): Id of the worker.
    """
    base_seed = int(np.random.get_state()[1][0])
    # np.random.seed only accepts values in [0, 2**32 - 1]; wrap the sum.
    np.random.seed((base_seed + worker_id) % 2**32)

### Augmentations

In [None]:
# Training pipeline: random shift/scale/rotate (rotation limit 15) and
# horizontal flip, each applied with probability 0.5, then resize to
# CFG.SZ x CFG.SZ, normalise and convert to a tensor.
# NOTE(review): with mean=0, std=1 Normalize presumably only rescales pixels by
# albumentations' default max_pixel_value — confirm.
TRAIN_AUG = A.Compose([
    A.ShiftScaleRotate(rotate_limit=15, p=0.5),
    A.HorizontalFlip(p = 0.5),
    A.Resize(CFG.SZ,CFG.SZ),
    A.Normalize(mean=0,std=1),
    ToTensorV2(),
])

# Validation pipeline: the same resize/normalise/to-tensor steps without the
# random transforms.
VALID_AUG = A.Compose([
    A.Resize(CFG.SZ,CFG.SZ),
    A.Normalize(mean=0,std=1),
    ToTensorV2(),
])

### Visualization

In [None]:
# Pull the first six augmented training samples and show them as one image
# grid, with their labels as the title.
dataset_show = RsnaDataset(train, augs=TRAIN_AUG, mode='train')
loader_show = torch.utils.data.DataLoader(dataset_show, batch_size=6)
img,target,_ = next(iter(loader_show))

grid = torchvision.utils.make_grid(img, normalize=True, padding=2)
# Reorder from (C, H, W) to (H, W, C) before display.
grid = grid.permute(1, 2, 0)
show_image(grid, figsize=(15,8),title=[x for x in target.numpy()]);

### Model

In [None]:
def get_rsna_classification_model(model_name, pretrained=True, **kwargs):
    """Create a timm model with a single-output classification head.

    Args:
        model_name: timm architecture name to instantiate.
        pretrained: Whether timm should load pretrained weights.
        **kwargs: Forwarded unchanged to `timm.create_model`.

    Returns:
        The model created by `timm.create_model` with `num_classes=1`.
    """
    # Honour the requested architecture instead of always reading CFG.MODEL.
    return timm.create_model(model_name, pretrained=pretrained, num_classes=1, **kwargs)

### QC: sanity-check the data loader and model output shapes

In [None]:
# Sanity check: build a small shuffled training loader and inspect the shapes
# of one batch of images and labels.
dl = DataLoader(RsnaDataset(train, augs=TRAIN_AUG, mode='train'),
                          batch_size=2,
                          shuffle=True,
                          num_workers=8,
                          drop_last=True,
                        worker_init_fn=worker_init_fn)

a,b,_ = next(iter(dl))
a.shape,b.shape

In [None]:
# Sanity check: run the QC batch `a` through a freshly created model (on CPU,
# since neither is moved to the GPU here) and print the raw output and shape.
m = get_rsna_classification_model(CFG.MODEL)
out = m(a)
print(out, out.shape)

### Train & Validation Function

In [None]:
def pfbeta(labels, preds, beta=1,clip=True):
    """Probabilistic F-beta score.

    Predictions are summed (rather than counted) over the positive and
    negative rows to form soft true/false-positive totals.

    Args:
        labels: Array/tensor of 0/1 ground-truth values.
        preds: Array/tensor of predictions indexable by a mask over `labels`.
        beta: Weight of recall relative to precision.
        clip: If True, clip `preds` to [0, 1] first.

    Returns:
        The F-beta value, or `torch.tensor(0.0)` when precision or recall is
        not strictly positive.
    """
    if clip:
        preds = preds.clip(0, 1)

    soft_tp = preds[labels == 1].sum()
    soft_fp = preds[labels == 0].sum()
    precision = soft_tp / (soft_tp + soft_fp)
    recall = soft_tp / labels.sum()

    if not (precision > 0 and recall > 0):
        return torch.tensor(0.0)

    b2 = beta * beta
    return (1 + b2) * (precision * recall) / (b2 * precision + recall)
    
def pfbeta_thresh(labels, preds, beta=1):
    """F-beta score after binarising predictions at a fixed 0.2 threshold.

    Args:
        labels: Array/tensor of 0/1 ground-truth values.
        preds: Array/tensor of scores; values above 0.2 count as positive.
        beta: Weight of recall relative to precision.

    Returns:
        The F-beta value, or `torch.tensor(0.0)` when precision or recall is
        not strictly positive.
    """
    hard_preds = preds > 0.2

    true_pos = hard_preds[labels == 1].sum()
    false_pos = hard_preds[labels == 0].sum()
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / labels.sum()

    if not (precision > 0 and recall > 0):
        return torch.tensor(0.0)

    b2 = beta * beta
    return (1 + b2) * (precision * recall) / (b2 * precision + recall)
    
def optimal_f1(labels, predictions):
    """Scan 100 evenly spaced thresholds in [0, 1] for the best F1.

    Args:
        labels: Tensor of ground-truth values.
        predictions: Tensor of scores to binarise with `> threshold`.

    Returns:
        `(best_score, best_threshold)`; the first maximum wins on ties.
    """
    labels_np = labels.cpu().numpy()
    preds_np = predictions.cpu().numpy()
    thresholds = np.linspace(0, 1, 100)

    scores = []
    for cutoff in thresholds:
        scores.append(pfbeta(labels_np, preds_np > cutoff, clip=False))

    best = np.argmax(scores)
    return scores[best], thresholds[best]

In [None]:
def train_one_epoch(
    model: nn.Module,
    loader: Iterable,
    loss_fn: Callable,
    optimizer: torch.optim.Optimizer,
    lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
    mixup_fn: Callable = None,
    grad_scaler: torch.cuda.amp.GradScaler = None,
    mbar: master_bar = None,
):
    """Run one training epoch under CUDA autocast.

    Args:
        model: Network to train; switched to train mode here.
        loader: Iterable yielding `(images, target, _)` batches.
        loss_fn: Called as `loss_fn(output, target.reshape(-1, 1))`.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Optional scheduler stepped once per batch.
        mixup_fn: Accepted for interface compatibility; currently unused.
        grad_scaler: Optional AMP gradient scaler. When None, a plain
            backward/step is performed instead of failing.
        mbar: Parent progress bar for the per-batch bar.

    Returns:
        OrderedDict with the sample-weighted mean training loss under "loss".
    """
    model.train()

    losses_m = utils.AverageMeter()

    pbar = progress_bar(loader, parent=mbar, leave=False)
    pbar.update(0)

    for batch_idx, (images, target, _) in enumerate(loader):
        images, target = images.cuda(), target.cuda()

        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=True):
            output = model(images)
            loss = loss_fn(output, target.reshape(-1, 1))

        losses_m.update(loss.item(), images.size(0))

        if grad_scaler is not None:
            grad_scaler.scale(loss).backward()
            grad_scaler.step(optimizer)
            grad_scaler.update()
        else:
            # The signature defaults grad_scaler to None, so support it.
            loss.backward()
            optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        pbar.update(batch_idx + 1)
        pbar.comment = f"{losses_m.avg:.4f}"

    pbar.on_iter_end()
    return OrderedDict([("loss", losses_m.avg)])


@torch.inference_mode()
def validate(model: nn.Module, loader: Iterable, loss_fn: Callable, mbar: master_bar):
    """Evaluate the model on a validation loader.

    Args:
        model: Network to evaluate; switched to eval mode here.
        loader: Iterable yielding `(images, target, _)` batches.
        loss_fn: Called on the raw model output and `target.reshape(-1, 1)`.
        mbar: Parent progress bar for the per-batch bar.

    Returns:
        OrderedDict with sample-weighted means of the loss ("loss"), the
        probabilistic F1 ("metric") and the best-threshold F1
        ("metric_thresh").

    NOTE(review): both F1 values are computed per batch and then averaged,
    which is not the same as scoring the whole validation set at once.
    """
    model.eval()

    metric_m = utils.AverageMeter()
    metric_m_thresh = utils.AverageMeter()
    losses_m = utils.AverageMeter()

    pbar = progress_bar(loader, parent=mbar, leave=False)
    pbar.update(0)

    for batch_idx, (images, target, _) in enumerate(loader):
        images, target = images.cuda(), target.cuda()
        # Keep the raw logits: rounding them before the loss and the sigmoid
        # would quantise every prediction.
        logits = model(images)

        loss = loss_fn(logits, target.reshape(-1, 1)).item()
        losses_m.update(loss, images.size(0))

        probs = torch.sigmoid(logits)
        metric = pfbeta(target, probs).item()
        metric_thresh, _ = optimal_f1(target, probs)
        metric_m.update(metric, probs.size(0))
        metric_m_thresh.update(metric_thresh.item(), probs.size(0))
        pbar.update(batch_idx + 1)

    pbar.on_iter_end()
    return OrderedDict([("loss", losses_m.avg), ("metric", metric_m.avg),("metric_thresh", metric_m_thresh.avg)])


### Sampler

In [None]:
class BalanceSampler(Sampler):
    """Sampler yielding groups of one positive followed by `ratio - 1` negatives.

    Args:
        dataset: Object exposing a `df` frame with a `cancer` column.
        ratio: Group size; each group holds 1 positive and `ratio - 1`
            negatives. Must be at least 2.

    Raises:
        ValueError: If `ratio` is smaller than 2.
    """

    def __init__(self, dataset, ratio=8):
        if ratio < 2:
            raise ValueError(f"ratio must be >= 2, got {ratio}")
        self.ratio = ratio
        self.r = ratio-1
        self.dataset = dataset
        self.pos_index = np.where(dataset.df.cancer>0)[0]
        self.neg_index = np.where(dataset.df.cancer==0)[0]

        # Number of negatives used per pass: the largest multiple of r that fits.
        self.length = self.r*int(np.floor(len(self.neg_index)/self.r))

    def __iter__(self):
        pos_index = self.pos_index.copy()
        neg_index = self.neg_index.copy()
        np.random.shuffle(pos_index)
        np.random.shuffle(neg_index)

        # Negatives are used without replacement, positives drawn with replacement.
        neg_index = neg_index[:self.length].reshape(-1,self.r)
        pos_index = np.random.choice(pos_index, self.length//self.r).reshape(-1,1)

        index = np.concatenate([pos_index,neg_index],-1).reshape(-1)
        return iter(index)

    def __len__(self):
        # __iter__ yields one positive per group on top of the negatives, so
        # the true length is groups * ratio rather than the negative count.
        return (self.length // self.r) * self.ratio

### Run!

In [None]:
def training_loop(fold):
    """Train and validate one cross-validation fold, saving a checkpoint per epoch.

    Rows with `fold != fold` form the training set and rows with
    `fold == fold` the validation set. A checkpoint is written after every
    epoch under `MODEL_FOLDER/fold_<fold>/`, and the epoch with the lowest
    validation loss is reported at the end.

    Args:
        fold: Index of the fold held out for validation.
    """
    with IPyExperimentsPytorch(exp_enable=False, cl_set_seed=42, cl_compact=True):
        print()
        print("*" * 100)
        print(f"Training fold {fold}")
        print("*" * 100)

        torch.backends.cudnn.benchmark = True

        dataset_train = RsnaDataset(train.query("fold!=@fold").reset_index(drop=True), augs=TRAIN_AUG, mode="train")
        dataset_valid = RsnaDataset(train.query("fold==@fold").reset_index(drop=True), augs=VALID_AUG, mode="valid")

        print(f"TRAIN: {len(dataset_train)} | VALID: {len(dataset_valid)}")

        # Shuffle the training data: without it every epoch replays the rows
        # in the same file order.
        loader_train = torch.utils.data.DataLoader(
            dataset_train,
            CFG.BS,
            shuffle=True,
            num_workers=8,
            drop_last=True,
            pin_memory=True,
        )
        loader_valid = torch.utils.data.DataLoader(dataset_valid, CFG.BS * 2, num_workers=8, shuffle=False)

        model = get_rsna_classification_model(CFG.MODEL, pretrained=True)
        model.cuda()
        # Pass the configured weight decay so the run matches the `wd` value
        # encoded in KERNEL_TYPE.
        optimizer = create_optimizer_v2(model, "lookahead_RAdam", lr=CFG.LR, weight_decay=CFG.WD)

        # The scheduler is stepped once per batch, so anneal over all steps.
        num_train_steps = len(loader_train) * CFG.EP
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_train_steps)

        train_loss_fn = nn.BCEWithLogitsLoss()
        valid_loss_fn = nn.BCEWithLogitsLoss()

        grad_scaler = torch.cuda.amp.GradScaler()

        print(f"Scheduled epochs: {CFG.EP}")

        mbar = master_bar(list(range(CFG.EP)))
        best_epoch, best_metric = 0, float("inf")
        metric_names = ["epoch", "train_loss", "valid_loss", "metric","metric_thresh", "time"]
        mbar.write([f"{l:.6f}" if isinstance(l, float) else str(l) for l in metric_names], table=True)

        # Checkpoint directory for this fold.
        path = Path(f'{MODEL_FOLDER}/fold_{fold}')
        os.makedirs(path,exist_ok=True)

        for epoch in range(CFG.EP):
            start_time = time.time()
            mbar.update(epoch)

            train_metrics = train_one_epoch(
                model, loader_train, train_loss_fn, optimizer,
                lr_scheduler=lr_scheduler, mixup_fn=None, grad_scaler=grad_scaler, mbar=mbar)

            valid_metrics = validate(model, loader_valid, valid_loss_fn, mbar=mbar)

            elapsed = format_time(time.time() - start_time)
            epoch_log = [epoch,train_metrics["loss"], valid_metrics["loss"], valid_metrics["metric"],
                         valid_metrics["metric_thresh"], elapsed]
            mbar.write([f"{l:.6f}" if isinstance(l, float) else str(l) for l in epoch_log], table=True)

            # Track the best (lowest) validation loss instead of overwriting
            # it with the latest epoch.
            if valid_metrics["loss"] < best_metric:
                best_epoch, best_metric = epoch, valid_metrics["loss"]

            # Every epoch is still saved; the OOF step loads the final one.
            dirpath = path / (KERNEL_TYPE + f"_Epoch_{epoch}_fold_{fold}.pth")
            torch.save(model.state_dict(), dirpath)

        mbar.on_iter_end()
        print("*** Best metric: {0} (epoch {1})".format(best_metric, best_epoch))

### Train 4 folds

In [None]:
# Train every fold created from CFG.n_splits rather than a hard-coded list,
# so the loop stays in sync with the fold assignment.
for fold_idx in range(CFG.n_splits):
    training_loop(fold_idx)

In [None]:
# Release cached CUDA memory and run the Python garbage collector, repeated
# five times, before generating out-of-fold predictions.
for i in range(5):
    torch.cuda.empty_cache()
    gc.collect()

### OOF Generation and Scoring

In [None]:
def gen_oof(fold):
    """Predict the held-out rows of `fold` with that fold's final checkpoint.

    Loads the checkpoint saved after the last epoch (`CFG.EP - 1`) and runs
    it over the rows where `train.fold == fold`, in their original order.

    Args:
        fold: Index of the fold whose validation rows are predicted.

    Returns:
        NumPy array of sigmoid outputs, batches concatenated along axis 0.
    """
    torch.backends.cudnn.benchmark = True
    dataset_valid = RsnaDataset(train.query("fold==@fold").reset_index(drop=True), augs=VALID_AUG, mode="valid")
    print(f"VALID: {len(dataset_valid)}")

    loader_valid = torch.utils.data.DataLoader(dataset_valid, CFG.BS * 2, num_workers=8, shuffle=False)
    model = get_rsna_classification_model(CFG.MODEL, pretrained=False)
    weights_path = f'{MODEL_FOLDER}/fold_{fold}/{KERNEL_TYPE}_Epoch_{CFG.EP-1}_fold_{fold}.pth'
    # Load onto the CPU first; the model is moved to the GPU afterwards.
    model.load_state_dict(torch.load(weights_path, map_location='cpu'))
    model.eval()
    model.cuda()

    preds = []
    with torch.no_grad():
        for images, _, _ in tqdm(loader_valid, dynamic_ncols=True, desc="OOF Generation"):
            with torch.cuda.amp.autocast():
                probs = torch.sigmoid(model(images.cuda()))
            # No per-batch gc.collect()/empty_cache(): they only slow the loop.
            preds.append(probs.data.cpu().numpy())
    return np.concatenate(preds, axis=0)

In [None]:
# Assemble out-of-fold predictions into one vector aligned with `train` rows.
folds = [0,1,2,3]
oof = np.zeros((len(train)))
for k in tqdm(folds):
    # gen_oof returns a single prediction array (one row per validation
    # sample, in the row order of train[train.fold == k]); flatten it so it
    # can be added into the 1-D `oof` slice.
    oof_fold = np.asarray(gen_oof(k)).reshape(-1)
    # Positions of this fold's rows in `train`, in the same order.
    ix = np.flatnonzero((train['fold'] == k).values)
    print(oof_fold.min(),oof_fold.max())
    oof[ix] += oof_fold

In [None]:
def optimal_f1_numpy(oof,fold):
    """Find the best-F1 threshold on OOF predictions for the given folds.

    Args:
        oof: 1-D array of predictions indexed like `train`.
        fold: Collection of fold ids whose rows are scored.

    Returns:
        `(best_score, best_threshold)` over 100 evenly spaced thresholds in
        [0, 1]; the first maximum wins on ties.
    """
    selected = train.loc[train['fold'].isin(fold)]
    labels = selected['cancer'].values
    fold_preds = oof[selected.index]

    thresholds = np.linspace(0, 1, 100)
    scores = []
    for cutoff in thresholds:
        scores.append(pfbeta(labels, fold_preds > cutoff, clip=False))

    best = np.argmax(scores)
    return scores[best], thresholds[best]

In [None]:
# Best F1 and its threshold over the out-of-fold predictions of all folds.
scr, thresh = optimal_f1_numpy(oof,folds)

In [None]:
# Display the score and threshold as the cell output.
scr,thresh

### Fin 