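**About** : This notebook trains the level-2 (patient-level) models on features from earlier experiment folders, then evaluates the saved out-of-fold predictions.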

In [None]:
# Load the autoreload extension and use mode 2, so that edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src/

## Initialization

### Imports

In [None]:
import os

# Restrict this process to GPU 0. This is done before importing torch so the
# restriction is guaranteed to be in place before any CUDA initialization.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

import torch

print(torch.__version__)

# `device` holds the human-readable name of the visible GPU. On a machine
# without CUDA, get_device_name would raise, so fall back to "cpu" instead;
# this keeps the evaluation cells usable without a GPU.
if torch.cuda.is_available():
    device = torch.cuda.get_device_name(0)
else:
    device = "cpu"
print(device)

In [None]:
import os
import sys
import glob
import json
import torch
import operator
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.metrics import *

In [None]:
from util.logger import (
    prepare_log_folder,
    save_config,
    create_logger,
#     init_neptune,
)

from params import *
from data.dataset import *
from data.preparation import *
from util.metrics import rsna_loss
from model_zoo.models_lvl2 import define_model

from training.main_lvl2 import k_fold

## Data

In [None]:
# Build the two tables used throughout the notebook from DATA_PATH.
# Both are used below as DataFrames (column lookup, merge, boolean filtering).
df_patient, df_img = prepare_data(DATA_PATH)

In [None]:
# Experiment folders passed to PatientFeatureDataset below.
EXP_FOLDERS = [
    "../logs/2023-08-27/3/",  # v2-s
]
# First folder of the list; its config.json is loaded below.
EXP_FOLDER = EXP_FOLDERS[0]

In [None]:
from inference.extract_features import Config

# Load the configuration stored next to the selected experiment.
# A context manager closes the file handle deterministically (the previous
# `json.load(open(...))` left it open until garbage collection).
# NOTE: the training section of this notebook defines its own `Config` class,
# which rebinds the name imported here once that cell has run.
with open(EXP_FOLDER + "config.json", "r") as f:
    config = Config(json.load(f))

In [None]:
# Attach the fold assignment to both tables. The guard makes the cell safe to
# re-run: once the "fold" column exists nothing is merged again.
needs_folds = "fold" not in df_patient.columns
if needs_folds:
    folds = pd.read_csv(config.folds_file)
    # The merge uses pandas' defaults (join on the columns shared with `folds`).
    df_img, df_patient = df_img.merge(folds), df_patient.merge(folds)

In [None]:
# Sanity-check dataset built from the fold-0 rows of both tables.
fold0_patients = df_patient[df_patient['fold'] == 0]
fold0_imgs = df_img[df_img['fold'] == 0]
dataset = PatientFeatureDataset(
    fold0_patients,
    fold0_imgs,
    EXP_FOLDERS,
    max_len=1000,
)

In [None]:
# fts, y, _ = dataset[0]
# fts.size(), y

In [None]:
# lens = []
# for i in tqdm(range(len(dataset))):
#     x = dataset[i][0]
#     lens.append(len(x))
    
# #     break

In [None]:
# sns.histplot(lens)

## Model

In [None]:
# Stand-alone model instance for quick inspection. These values are set here
# explicitly and are independent of the training Config defined further down.
model = define_model(
    "rnn",
    ft_dim=1280,
    layer_dim=128,
    dense_dim=256,
    num_classes=11,
    num_classes_aux=0,
)

In [None]:
# x = torch.cat([fts.unsqueeze(0)] * 2)

# pred, pred_aux = model(x)
# pred.size()

## Training
Ideas to try (TODO):
- Dynamic batch trimming?
- SWA (stochastic weight averaging)
- Transformer model?

In [None]:
class Config:
    """
    Parameters used for training.

    All settings are plain class-level attributes. In this notebook the class
    itself (not an instance) is handed to `save_config` and `k_fold`.
    """
    # General
    seed = 42
    verbose = 1  # defined once; a second identical assignment used to sit under "Training"
    device = "cuda"
    save_weights = True

    # Data
    exp_folders = [
#         "../logs/2023-09-06/0/",  # v2-s
        "../logs/2023-08-27/3/",  # v2-s fix sampling
    ]
    max_len = 1000
    n_fts = 0  # already pooled features, not supported yet

    # k-fold
    k = 4
    folds_file = f"../input/folds_{k}.csv"  # resolved from `k` at class-creation time
    selected_folds = [0, 1, 2, 3]

    # Model
    name = "rnn"
    ft_dim = 1280

    layer_dim = 256
    dense_dim = 256

    p = 0.  # presumably a dropout probability - confirm in the model code
    use_msd = False
    num_classes = 11
    num_classes_aux = 0

    # Training
    loss_config = {
        "name": "patient",
        "smoothing": 0,
        "activation": "patient",
        "aux_loss_weight": 0,
        "name_aux": "patient",
        "smoothing_aux": 0,
        "activation_aux": "",
    }

    data_config = {
        "batch_size": 64,
        "val_bs": 256,
        "mix": "mixup",
        "mix_proba": 0.,
        "mix_alpha": 4.,
        "additive_mix": False,
        "num_classes": num_classes,  # reuses the class-level value above
        "num_workers": 8,
    }

    optimizer_config = {
        "name": "AdamW",
        "lr": 3e-4,
        "warmup_prop": 0.,
        "betas": (0.9, 0.999),
        "max_grad_norm": 10.,
        "weight_decay": 0.,
    }

    epochs = 10

    use_fp16 = True
    verbose_eval = 50

    fullfit = False
    n_fullfit = 1

    # Single-process settings
    local_rank = 0
    distributed = False
    world_size = 1

In [None]:
# With DEBUG enabled, the training cell below skips creating a log folder,
# saving the config and creating the logger; log_folder then stays None when
# it is passed to k_fold.
DEBUG = True
log_folder = None

In [None]:
# Outside of debug mode: create the run folder, persist the configuration and
# set up file logging before training starts.
if not DEBUG:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f"Logging results to {log_folder}")
    config_path = log_folder + "config.json"
    config_df = save_config(Config, config_path)
    create_logger(name="logs.txt", directory=log_folder)

# Run the k-fold training; it returns the main and auxiliary predictions.
fold_outputs = k_fold(
    Config,
    df_patient,
    df_img,
    log_folder=log_folder,
    run=None,
)
preds, preds_aux = fold_outputs

### Eval

In [None]:
# Run folder whose saved validation predictions (pred_val_{fold}.npy) are
# evaluated below. The trailing slash matters: paths are built by concatenation.
LOG_FOLDER = "../logs/2023-09-04/2/"

In [None]:
# Evaluate with the training Config class defined above. This rebinds `config`,
# which earlier held the configuration loaded from EXP_FOLDER.
# NOTE(review): LOG_FOLDER points at a previously saved run - confirm that this
# Config (selected_folds, exp_folders) matches the one that run was trained with.
config = Config

In [None]:
# Collect, fold by fold, the validation rows joined with their saved predictions.
dfs = []
for fold in tqdm(config.selected_folds):
    # Validation rows of this fold, from both tables.
    df_val = df_patient[df_patient['fold'] == fold]
    df_img_val = df_img[df_img['fold'] == fold]

    # The dataset is only used to recover the patient id of each prediction row.
    dataset = PatientFeatureDataset(df_val, df_img_val, config.exp_folders)
    patients = [entry[0] for entry in dataset.ids]

    preds = np.load(LOG_FOLDER + f"pred_val_{fold}.npy")
    preds_cols = [f'pred_{i}' for i in range(preds.shape[1])]

    # One row per dataset entry: patient id followed by one column per output.
    df_preds = pd.DataFrame(
        {
            "patient_id": patients,
            **{col: preds[:, i] for i, col in enumerate(preds_cols)},
        }
    )

    # Average the rows that share a patient, then join onto the patient table.
    df_preds = df_preds.groupby('patient_id').mean()
    df = df_val.merge(df_preds, on="patient_id")
    dfs.append(df)

In [None]:
# Stack the per-fold frames into a single out-of-fold table, then extract the
# prediction columns as a plain array.
df_oof = pd.concat(dfs, ignore_index=True)
pred_oof = df_oof.loc[:, preds_cols].values

In [None]:
# Score the raw out-of-fold predictions and print one line per loss component.
losses, avg_loss = rsna_loss(pred_oof, df_oof)

for loss_name, loss_value in losses.items():
    # Shorten the key: text before the first underscore, at most 8 characters.
    short_name = loss_name.split('_')[0][:8]
    print(f"- {short_name} loss\t: {loss_value:.3f}")

print(f'\n -> CV Score : {avg_loss :.3f}')

In [None]:
# Prediction column used to score each patient-level target.
mapping = {
    'bowel_injury': 0,
    'extravasation_injury': 1,
    'kidney': 2,
    'liver': 5,
    'spleen': 8,
}
for tgt in PATIENT_TARGETS:
    # "injury" targets are scored against `target > 0`; the others against
    # `target <= 0`, using the column selected by `mapping`.
    labels = df_oof[tgt] > 0 if "injury" in tgt else df_oof[tgt] <= 0
    column_scores = pred_oof[:, mapping[tgt]]
    auc = roc_auc_score(labels, column_scores)

    print(f'- {tgt} auc : {auc:.3f}')

In [None]:
# Greedy per-column calibration of the out-of-fold predictions.
# For each column in turn, every (factor, shift) pair of a small grid is tried
# as `clip(pred * factor + shift)`, and the pair giving the lowest rsna_loss is
# applied to that column before moving on. The whole sweep is done twice.
#
# Fix: the selected transform is now applied WITH the same clipping that was
# used when it was scored. Previously the unclipped values were stored, so the
# kept predictions differed from the ones behind `best_loss` and could fall
# outside [0, 1].
CLIP_MIN, CLIP_MAX = 0.00001, 0.99999

# Search grids, hoisted out of the loops since they never change.
factor_grid = np.round(np.arange(0.5, 1.5, 0.1), 2)
shift_grid = np.round(np.arange(-0.1, 0.11, 0.1), 2)

pred_oof_ = pred_oof.copy()
losses, avg_loss = rsna_loss(pred_oof, df_oof)
best_score = avg_loss

# `factors` holds the coefficients of the latest pass only (as before);
# `factors_per_pass` keeps every pass so the full transform can be replayed.
factors_per_pass = []
for pass_idx in range(2):
    factors = []
    for i in range(pred_oof.shape[1]):
        scores = {}
        for factor in factor_grid:
            for shift in shift_grid:
                pred_oof_r = pred_oof_.copy()
                pred_oof_r[:, i] = np.clip(
                    pred_oof_r[:, i] * factor + shift, CLIP_MIN, CLIP_MAX
                )
                # rsna_loss returns (per-target losses, average); keep the average.
                scores[(factor, shift)] = rsna_loss(pred_oof_r, df_oof)[1]

        best_coefs, best_loss = min(scores.items(), key=operator.itemgetter(1))
        pred_oof_[:, i] = np.clip(
            pred_oof_[:, i] * best_coefs[0] + best_coefs[1], CLIP_MIN, CLIP_MAX
        )
        best_score = best_loss
        print(f'{i} - {best_coefs}  -  {best_loss :.3f}')
        factors.append(best_coefs)
    factors_per_pass.append(factors)

In [None]:
# Score the calibrated predictions and print one line per loss component.
losses, avg_loss = rsna_loss(pred_oof_, df_oof)

for component, component_loss in losses.items():
    # Shorten the key: text before the first underscore, at most 8 characters.
    label = component.split('_')[0][:8]
    print(f"- {label} loss\t: {component_loss:.3f}")

print(f'\n -> CV Score : {avg_loss :.3f}')

In [None]:
# Constant baseline: every row of df_oof gets the same 11 fixed values, which
# gives a reference score for the model's loss.
dummy_row = np.array(
    [
        0.04, 0.3,
        0.6, 0.05, 0.05,
        0.4, 0.07, 0.03,
        0.3, 0.04, 0.07,
    ]
)
dummy = np.tile(dummy_row, (len(df_oof), 1))
losses, avg_loss = rsna_loss(dummy, df_oof)

for component, component_loss in losses.items():
    # Shorten the key: text before the first underscore, at most 8 characters.
    label = component.split('_')[0][:8]
    print(f"- {label} loss\t: {component_loss:.3f}")

print(f'\n -> CV Score : {avg_loss :.3f}')

In [None]:
# for i in range(2):
#     sns.histplot(preds[:, i])
    
#     auc = roc_auc_score(df_val[PATIENT_TARGETS[i]], preds[:, i])
#     print(f'- {PATIENT_TARGETS[i]} auc : {auc:.3f}')
    
#     plt.show()

Done ! 