#### Code to train models

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src

## Imports

In [None]:
import os
import cv2
import ast
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from tqdm.notebook import tqdm

In [None]:
from params import *

In [None]:
from data.preparation import prepare_dataframe, handle_duplicates, add_additional_boxes
from data.dataset import CovidDetDataset, CovidClsDataset
from data.transforms import get_transfos_det, get_transfos_cls

from model_zoo.models import get_model
from model_zoo.encoders import get_encoder

from utils.plot import plot_sample
from utils.boxes import treat_boxes
from utils.logger import prepare_log_folder, save_config, create_logger, update_overall_logs

from training.main import k_fold

## Data

### Load

In [None]:
# Build the training dataframe with the project helper. The copy is
# presumably there so the column assignments in later cells act on an
# independent frame — confirm against prepare_dataframe.
df = prepare_dataframe().copy()

In [None]:
# Load saved arrays from ../output; clusts and transpositions feed
# handle_duplicates below.
# NOTE(review): allow_pickle=True unpickles arbitrary objects — only load
# files produced by this project's own pipeline.
clusts = np.load("../output/clusts.npy", allow_pickle=True)
# NOTE(review): `found` is not referenced again in this notebook.
found = np.load("../output/found.npy")
transpositions = np.load("../output/transpositions.npy", allow_pickle=True)

# Run the project's duplicate handling on df with plotting disabled.
df = handle_duplicates(df, clusts, transpositions, plot=False)

In [None]:
# Pass df through the project's add_additional_boxes helper and keep the result.
df = add_additional_boxes(df)

In [None]:
# Two views of the same cross-tabulation: image label split by study label,
# then study label split by image label.
for x_col, hue_col in (("label", "study_label"), ("study_label", "label")):
    plt.figure(figsize=(15, 5))
    sns.countplot(x=x_col, hue=hue_col, data=df)
    plt.show()

### Pseudo-labels (PL)

In [None]:
# Test-set metadata; prediction columns from submission files are merged
# onto it in the cells below.
# NOTE(review): the file name hard-codes 512 while the image root set later
# uses SIZE — confirm the two agree.
df_test = pd.read_csv('../input/df_test_512.csv')

In [None]:
# These columns are stored in the CSV as bracketed, comma-separated text
# (e.g. "[a, b]"); strip the brackets and convert each one to an int array.
for col in ('shape', 'shape_crop', 'crop_starts'):
    df_test[col] = df_test[col].apply(
        lambda text: np.array(text[1:-1].split(', ')).astype(int)
    )

In [None]:
# Study-level rows of an earlier submission file.
pl_study = pd.read_csv('../output/sub_0931_study.csv')
# The merge key is the part of the submission id before the first "_".
pl_study["study_id"] = pl_study["id"].apply(lambda sub_id: sub_id.partition('_')[0])
# Drop the raw id and the prediction string; the remaining columns are merged onto df_test.
pl_study = pl_study.drop(columns=['id', 'PredictionString'])

In [None]:
# Image-level rows of an earlier submission file.
pl_img = pd.read_csv('../output/sub_0931_img.csv')
# The merge key is the part of the submission id before the first "_".
pl_img["image_id"] = pl_img["id"].apply(lambda sub_id: sub_id.partition('_')[0])
# Drop the raw id and the prediction string; the remaining columns are merged onto df_test.
pl_img = pl_img.drop(columns=['id', 'PredictionString'])

In [None]:
# Attach the study-level, then the image-level prediction columns to the
# test metadata. Left joins keep every test row even without a match.
df_test = (
    df_test
    .merge(pl_study, how="left", on="study_id")
    .merge(pl_img, how="left", on="image_id")
)

In [None]:
# Flag which frame a row comes from (0 = train, 1 = test) and record the
# image folder of the matching split.
for frame, flag, split in ((df, 0, "train"), (df_test, 1, "test")):
    frame['is_pl'] = flag
    frame['root'] = DATA_PATH + f"{split}_{SIZE}/"

In [None]:
# Test rows all get fold -1.
# presumably this keeps them out of every validation fold — confirm in k_fold
df_test['kfold'] = -1
# Image-level target is the complement of the 'none' column.
df_test['img_target'] = 1 - df_test['none']

## Extra data

In [None]:
# Metadata tables of the three extra datasets, read in the order bim, covidx, ricord.
df_bim, df_covidx, df_ricord = (
    pd.read_csv(DATA_PATH + f"meta_{source}.csv")
    for source in ("bim", "covidx", "ricord")
)

In [None]:
# Record each row's source so it stays identifiable after concatenation.
for source, frame in (("bim", df_bim), ("covidx", df_covidx), ("ricord", df_ricord)):
    frame["dataset"] = source

In [None]:
# Binary target for the datasets that do not ship one directly:
# covidx rows are 1 when their label is exactly "positive",
# ricord rows are the complement of their 'negative' column.
df_covidx['target'] = df_covidx['label'].eq("positive").astype(int)
df_ricord['target'] = 1 - df_ricord['negative']

In [None]:
# Stack the three extra datasets into one frame with a fresh 0..n-1 index.
df_ext = pd.concat([df_bim, df_covidx, df_ricord], ignore_index=True)

In [None]:
# Rows without an 'unk_label' value are treated as False. The explicit bool
# cast matters: after concatenating frames that lack the column it can be
# object dtype, and recent pandas no longer downcasts object columns in
# fillna. `~` on an object column applies Python's bitwise NOT to each bool
# (giving -1 / -2), so the `~df_ext["unk_label"]` filter used later needs a
# real boolean dtype to work as a mask.
df_ext['unk_label'] = df_ext['unk_label'].fillna(False).astype(bool)
# Keep only the columns that have no missing value in any row.
df_ext = df_ext.dropna(axis=1)

In [None]:
# Log folders of the two runs whose predictions on the extra data are blended.
PL_FOLDERS = [
    LOG_PATH + "2021-07-30/4/",
    LOG_PATH + "2021-07-31/0/",
]

for i in range(5):
    # Equal-weight blend of the two runs' image-level predictions for index i
    # (presumably the fold number — confirm against the training logs).
    img_a, img_b = (
        np.load(folder + f"preds_ext_img_flip_{i}.npy") for folder in PL_FOLDERS
    )
    pl_img = 0.5 * img_a + 0.5 * img_b

    # Same blend for the study-level predictions.
    study_a, study_b = (
        np.load(folder + f"preds_ext_study_flip_{i}.npy") for folder in PL_FOLDERS
    )
    pl_study = 0.5 * study_a + 0.5 * study_b

    df_ext[f'pl_img_{i}'] = pl_img
    for j, c in enumerate(CLASSES):
        # Column j of the study predictions is stored under class name CLASSES[j].
        df_ext[f'pl_{c}_{i}'] = pl_study[:, j]

# Average the five per-index columns into one score per row.
df_ext['pl_img'] = np.mean([df_ext[f'pl_img_{i}'] for i in range(5)], axis=0)
for c in CLASSES:
    df_ext[f'pl_{c}'] = np.mean([df_ext[f'pl_{c}_{i}'] for i in range(5)], axis=0)

In [None]:
# Drop the rows flagged in 'unk_label' and renumber the index.
# NOTE: `~` only acts as a logical NOT when the column has boolean dtype.
df_ext = df_ext[~df_ext["unk_label"]].reset_index(drop=True)

In [None]:
# Threshold on 'pl_img' used by the filtering cell below: rows with target 1
# need pl_img > T, rows with target 0 need pl_img < 1 - T.
T = 0.5

In [None]:
# Keep only the rows whose blended image-level prediction agrees with the
# dataset's own target: target-1 rows must score above T, target-0 rows
# below 1 - T. Rows with any other target value are dropped.
is_pos = df_ext['target'] == 1
is_neg = df_ext['target'] == 0
agrees_pos = is_pos & (df_ext['pl_img'] > T)
agrees_neg = is_neg & (df_ext['pl_img'] < 1 - T)
df_ext = df_ext[agrees_pos | agrees_neg].reset_index(drop=True)

In [None]:
# Distribution of the blended image-level prediction per target value,
# with one box per source dataset.
plt.figure(figsize=(15, 8))
sns.boxplot(y='pl_img', x='target', hue='dataset', data=df_ext)
plt.show()

In [None]:
# df_ext = df_ext[((df_ext['pl_negative'] > T) & (df_ext['target'] == 0)) | (df_ext['target'] == 1)].reset_index(drop=True)
# df_ext = df_ext[((df_ext['pl_negative'] < 1 - T) & (df_ext['target'] == 1)) | (df_ext['target'] == 0)].reset_index(drop=True)

# df_ext = df_ext[((df_ext['pl_typical'] > T) & (df_ext['target'] == 1)) | (df_ext['target'] == 0)].reset_index(drop=True)
# df_ext = df_ext[((df_ext['pl_typical'] < 1 - T) & (df_ext['target'] == 0)) | (df_ext['target'] == 1)].reset_index(drop=True)

In [None]:
# One panel per class on a 2x2 grid: blended study-level prediction per
# target value, with one box per source dataset.
plt.figure(figsize=(15, 10))
for i, c in enumerate(CLASSES):
    panel = i + 1  # subplot positions are 1-based
    plt.subplot(2, 2, panel)
    sns.boxplot(data=df_ext, x='target', y=f'pl_{c}', hue='dataset')
plt.show()

In [None]:
# Number of extra-data rows left after filtering (shown as cell output).
len(df_ext)

### Dataset

In [None]:
# First 10 training rows, used for the quick dataset check below.
df_ = df.head(10)

In [None]:
# Classification transforms with augmentation switched off.
transforms = get_transfos_cls(augment=False)

In [None]:
# Training-mode dataset over the 10-row sample plus the extra data.
# presumably extra_prop is the share of extra samples drawn — confirm in CovidClsDataset
dataset = CovidClsDataset(df_, df_ext, transforms=transforms, train=True, extra_prop=0.5)

In [None]:
# Maximum of the extra-data study targets along axis 0, kept as a
# length-1 leading dimension (shown as cell output).
dataset.study_targets_ext.max(0, keepdims=True)

In [None]:
# Visual check of the first 10 dataset items.
# for i in np.random.choice(len(dataset), 10):
for i in range(10):
    img, mask, y, y_img, y_aux, is_pl = dataset[i]

    if isinstance(img, torch.Tensor):
        # Channel-first tensor -> channel-last array; the mask gets a trailing
        # axis so it broadcasts against the image.
        img = img.cpu().numpy().transpose(1, 2, 0)
        mask = mask.cpu().numpy()[:, :, None]

    if mask.max():
        # Non-empty mask: keep masked pixels as they are, scale the rest to 80 %.
        img = (img * mask + 0.8 * img * (1 - mask)).astype(int)

    title = (
        f'Study target : {y} - '
        f'Img target : {y_img} - aux target : {y_aux} - is_pl {is_pl}'
    )
    plt.figure(figsize=(9, 9))
    plot_sample(img, [], bbox_format="yolo")
    plt.title(title)
    plt.show()

## Model

In [None]:
# import timm

# class Covid_Net(nn.Module):
#     def __init__(self,model_name,pretrained,out_features):
#         super(Covid_Net, self).__init__()
#         self.net = timm.create_model(model_name,pretrained=pretrained)
        
#         in_features = self.net.classifier.in_features
#         self.net.global_pool = nn.Identity()
#         self.net.classifier = nn.Identity()
        
#         self.logit = nn.Linear(in_features,out_features)
        
#     def forward(self, image):
#         #b,c,h,w
#         batch_size = len(image)
#         x = self.net(image)
#         x = F.adaptive_avg_pool2d(x,1).reshape(batch_size,-1)
#         logit = self.logit(x)
        
#         return logit
    
# model_params = {
#     'model_name':'tf_efficientnetv2_m_in21ft1k',
#     'pretrained':True,
#     'out_features':1
# }

# model = Covid_Net(**model_params)


# torch.save(model.net.state_dict(), "../output/pretrained_tf_efficientnetv2_m_in21ft1k")

In [None]:
# model = get_encoder('tf_efficientnet_b4_ns')

In [None]:
# Build the model used for the forward-pass and loss checks below.
model = get_model('tf_efficientnetv2_m_in21ft1k', num_classes=4)

In [None]:
# Full training frame plus extra data, no augmentation.
transforms = get_transfos_cls(augment=False)
dataset = CovidClsDataset(df, df_ext, transforms=transforms)

x, m, y, y_img, y_aux, is_pl = dataset[0]

# Show the image (channel-last) and its mask side by side.
plt.figure(figsize=(9, 9))
panels = (x.cpu().numpy().transpose(1, 2, 0), m.cpu().numpy()[:, :, None])
for position, panel in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(panel)
    plt.axis(False)

# Add a leading batch dimension to every item; image and mask are also cast to float.
x, m = (t.unsqueeze(0).float() for t in (x, m))
y, y_aux, y_img, is_pl = (t.unsqueeze(0) for t in (y, y_aux, y_img, is_pl))

In [None]:
# Forward pass on the single-sample batch; the result is iterated and
# indexed (pred[0]..pred[3]) in the following cells.
pred = model(x)

In [None]:
# Print the shape of every model output; members of nested outputs are
# printed indented.
for p in pred:
    try:
        print(p.size())
    except AttributeError:
        # p has no .size() (e.g. a list/tuple of tensors): print each member.
        # Only AttributeError is caught, so interrupts and real errors surface.
        for p_ in p:
            print(' ', p_.size())

In [None]:
from training.losses import *

In [None]:
# Loss instance for the check below.
# NOTE(review): these weights differ from Config.loss_config
# (e.g. pl_cls_w is 0.1 here and 1 there) — this cell is only used for the check.
loss = CovidLoss({  
        "w_bce": 0.75,
        "w_seg_loss": 0.95,
        "seg_loss_multiplier": 4,
        "w_study": 2,
        "w_img": 1,
        "pl_cls_w": 0.1,
        "w_aux_loss": 1,
    })

In [None]:
# Evaluate the loss on the single-sample batch. Study and image targets are
# passed as pairs together with mix_lambda=0.5 — presumably the format used
# for mixed samples, with both halves being the same sample here; confirm
# in CovidLoss.
loss(pred[0], pred[1], pred[2], pred[3], [y, y], [y_img, y_img], y_aux, m, is_pl, mix_lambda=0.5)

### Training

In [None]:
# Batch size per backbone name; Config.batch_size looks up its
# selected_model in this table.
BATCH_SIZES = {
    "resnext50_32x4d": 16,
    'tf_efficientnetv2_s_in21ft1k': 8, # 16 (presumably an alternative value that was tried)
    'tf_efficientnetv2_m_in21ft1k': 12, # 6 (presumably an alternative value that was tried)
    'tf_efficientnetv2_l_in21ft1k': 6,
    'tf_efficientnet_b2_ns': 12,
    'tf_efficientnet_b3_ns': 16,
    'tf_efficientnet_b4_ns': 12,
    'tf_efficientnet_b5_ns': 8,
}

In [None]:
class Config:
    """
    Parameters used for training
    """
    # ---- General ----
    seed = 42
    verbose = 1

    size = SIZE
    bbox_format = "yolo"
    root_dir = DATA_PATH + f"train_{SIZE}/"

    # Use the GPU when one is visible to torch, otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    save_weights = True

    # ---- k-fold ----
    k = 5
    folds_col = "kfold"
    selected_folds = [0, 1, 2, 3, 4]

    # ---- Model ----
    selected_model = 'tf_efficientnetv2_m_in21ft1k'
    use_unet = False
    pretrained = False
    num_classes = len(CLASSES)

    # ---- Training ----
    # The segmentation-loss settings and fp16 usage depend on whether the
    # backbone name contains "v2_s".
    loss_config = {  
        "w_bce": 0.75,
        "w_seg_loss": 0.95 if "v2_s" in selected_model else 0.75,
        "seg_loss_multiplier": 4 if "v2_s" in selected_model else 2,
        "w_study": 2,
        "w_img": 1,
        "pl_cls_w": 1,
        'w_aux_loss': 1,
    }
    # Mixed precision is enabled for every backbone except the "v2_s" ones.
    use_fp16 = "v2_s" not in selected_model
    samples_per_patient = 0
    optimizer = ["Adam", "Adam"]
    batch_size = BATCH_SIZES[selected_model]
    epochs = [15]
    iter_per_epochs = 2500
    extra_prop = 0.75

    # NOTE(review): lr / warmup_prop have 3 entries, optimizer 2 and epochs 1 —
    # confirm how k_fold consumes these lists.
    lr = [1e-3, 1e-4, 1e-5]
    warmup_prop = [0.05, 0.25, 0.5]
    val_bs = batch_size * 2

    first_epoch_eval = 0

    # mix_proba is 0, so presumably no mixing is applied — confirm in the training loop.
    mix = "cutmix"
    mix_proba = 0
    mix_alpha = 0.4

    name = "model"

In [None]:
# With DEBUG = True the next cell skips the log-folder setup, so log_folder
# stays None when it is passed to k_fold.
DEBUG = False
log_folder = None

In [None]:
# Outside debug mode: create a fresh log folder, then store the config, a
# copy of the training dataframe and the log file inside it.
if not DEBUG:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f'Logging results to {log_folder}')
    save_config(Config, log_folder + 'config')
    df.to_csv(log_folder + 'data.csv', index=False)
    create_logger(directory=log_folder, name="logs.txt")

# Launch k-fold training on df with the filtered extra data; the call
# returns study-level and image-level predictions.
# NOTE(review): df_test prepared earlier is not passed to k_fold here.
pred_oof_study, pred_oof_img = k_fold(
    Config,
    df,
    df_extra=df_ext,
    log_folder=log_folder
)