**About**: This notebook is used to train RNN models.

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each
# cell runs, so edits to the project's .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src/

### Imports

In [None]:
import os
import sys
import glob
import json
import torch
import operator
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from sklearn.metrics import *
from collections import Counter
from scipy.stats import spearmanr

warnings.simplefilter(action="ignore", category=UserWarning)

In [None]:
from util.logger import (
    prepare_log_folder,
    save_config,
    create_logger,
)

from data.dataset import FeatureDataset
from params import *
from data.preparation import *
from util.logger import Config as ConfigInf
from training.main_lvl2 import k_fold
from util.metrics import *

In [None]:
from model_zoo.models_lvl2 import define_model
from training.losses import StudyLoss
from util.metrics import rsna_loss

### Data

In [None]:
df = prepare_data_lvl2()

# Fold assignments are merged in only when the prepared frame does not carry them yet.
if not ("fold" in df.columns):
    folds = pd.read_csv("../input/folds_4.csv")
    df = pd.merge(df, folds, how="left")

In [None]:
# NOTE(review): same guard and merge as the preceding cell; this is a no-op whenever
# df already has a "fold" column.
if "fold" not in df.columns:
    folds = pd.read_csv("../input/folds_4.csv")
    df = df.merge(folds, how="left")

In [None]:
# Preview the first three rows.
df.head(3)

In [None]:
# Experiment log folders keyed by feature-source name. The mapping is passed to
# FeatureDataset below, and the "nfn" folder is also where config.json is read from.
# Commented-out entries are alternatives that are currently disabled.
EXP_FOLDERS = {
    "nfn": "../logs/2024-08-05/27/",
    "scs": "../logs/2024-08-04/33/",
    "ss": "../logs/2024-08-06/17/",  # NEEDS IMPROVEMENT
    # "ss_aux": "../logs/2024-08-06/17/",
    "scs_crop": "../logs/2024-08-07/19/",
    "nfn_crop": "../logs/2024-08-07/32/",
    # "ss_crop": "../logs/2024-08-20/5/",
    # "scs_crop_coords": "../logs/2024-08-13/1/",
    # "nfn_crop_coords": "../logs/2024-08-13/8/",
    "crop": "../logs/2024-08-21/9/",

}

In [None]:
# Load the saved config of the "nfn" experiment; the context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open(EXP_FOLDERS["nfn"] + "config.json", "r") as config_file:
    config = ConfigInf(json.load(config_file))

In [None]:
# NOTE(review): unlike the earlier fold merges, this one reads config.folds_file and
# relies on merge's default join type (no how="left") - confirm that is intended.
# It is skipped entirely when df already has a "fold" column.
if "fold" not in df.columns:
    folds = pd.read_csv(config.folds_file)
    df = df.merge(folds)

In [None]:
# Preview the first row.
df.head(1)

In [None]:
# Feature dataset over df, reading from the EXP_FOLDERS experiments with CLASSES as targets.
# resize=10 is the same value given to define_model in the Model section.
dataset = FeatureDataset(df, EXP_FOLDERS, resize=10, targets=CLASSES)

In [None]:
# Only the first sample is inspected: the loop exits right after printing its entries.
for sample_idx in tqdm(range(len(dataset))):
    fts, y, _ = dataset[sample_idx]
    for name in fts:
        print(name, fts[name].size())
    break

In [None]:
# Keep one sample around (features, target, and an ignored third item); the Model
# section below feeds it through the network and the losses.
idx = 0
fts, y, _ = dataset[idx]

In [None]:
# Print each feature name together with the size of its entry.
for k in fts:
    print(k, fts[k].size())

In [None]:
# plt.figure(figsize=(20, 5))
# plt.subplot(1, 3, 1)
# for i in range(5):
#     plt.plot(fts['ss_aux'].softmax(1)[:, i], label=LEVELS[i])
# plt.legend()

# plt.subplot(1, 3, 2)
# for i in [1, 2]:
#     plt.plot(fts['ss'][:, i], label=f'left_{i}')
# for i in [4, 5]:
#     plt.plot(fts['ss'][:, i], label=f'right_{i - 3}')
# plt.legend()

# # plt.subplot(1, 3, 3)
# # for i in range(5):
# #     plt.plot(fts['ss'][:, 1, i], label=LEVELS[i])
# # plt.legend()

# plt.show()

In [None]:
# for k in fts:
#     print(k, fts[k].shape)

### Model

In [None]:
# Instantiate the "baseline" model for a quick forward-pass check.
# num_classes is three outputs per entry in CLASSES.
# NOTE(review): n_fts=45 + 75 and resize=10 differ from the values in the training
# Config below (75 + 45 + 45 and 30) - presumably tied to EXP_FOLDERS; confirm.
model = define_model(
    name="baseline",
    num_classes=len(CLASSES) * 3,
    layer_dim=0,
    ft_dim=64,
    n_fts=45 + 75,
    resize=10,
)

In [None]:
# Add a leading dimension of size 1 to every feature entry so the sample forms a batch.
x = {k: fts[k].unsqueeze(0) for k in fts}

In [None]:
# Forward pass: the model returns two values, only the first is kept. Display its size.
pred, _ = model(x)
pred.size()

In [None]:
# Evaluate StudyLoss on the single-sample prediction; the target gets a leading
# dimension as well so it lines up with the batched prediction.
l = StudyLoss()
l(pred, y.unsqueeze(0))

In [None]:
# Same check with the metric implementation, which is given NumPy arrays here:
# the batched target and the softmax (over dim 2) of the detached prediction.
rsna_loss(y.unsqueeze(0).numpy(), pred.softmax(2).detach().numpy())

### Training

In [None]:
class Config:
    """
    Parameters used for training
    """
    # General
    seed = 42
    verbose = 1
    device = "cuda"
    save_weights = True
    targets = CLASSES

    # Data
    # Feature-source name -> experiment log folder. The number after each active entry
    # presumably is that source's feature count: 15 + 30 + 15 + 30 + 75 = 165, which
    # equals n_fts below - TODO confirm. Commented entries are disabled sources.
    exp_folders = {
        # "nfn": "../logs/2024-08-05/27/",  # NEEDS IMPROVEMENT
        # "scs": "../logs/2024-08-04/33/",  # NEEDS IMPROVEMENT
        # "ss": "../logs/2024-08-06/17/",  # NEEDS IMPROVEMENT

        "scs_crop": "../logs/2024-08-07/19/",  # 15
        "nfn_crop": "../logs/2024-08-07/32/",  # 30
        "scs_crop_coords": "../logs/2024-08-13/1/",  # 15
        "nfn_crop_coords": "../logs/2024-08-13/8/",  # 30

        "crop": "../logs/2024-08-22/11/",  # 75
        # "crop_ax": "../logs/2024-08-26/4/",  # 75  NEEDS IMPROVEMENT
    }
    # Must be kept in sync by hand with the active entries of exp_folders.
    n_fts = 75 + 45 + 45 # + 75
    resize = 30

    # k-fold
    k = 4
    folds_file = f"../input/folds_{k}.csv"
    selected_folds = [0, 1, 2, 3]

    # Model
    name = "baseline"
    dense_dim = 8192 # - 1024
    layer_dim = 0
    ft_dim = 0

    p = 0.4
    # Three outputs per target class.
    num_classes = len(CLASSES) * 3
    num_classes_aux = 0

    # Training
    loss_config = {
        "name": "study",
        "weighted": True,
        "use_any": True,
        "smoothing": 0,
        "activation": "study",
        "aux_loss_weight": 0,
        "name_aux": "",
        "smoothing_aux": 0,
        "activation_aux": "",
    }

    data_config = {
        "batch_size": 64,
        "val_bs": 512,
        "mix": "mixup",
        "mix_proba": 0.,
        "sched": False,
        "mix_alpha": 4.,
        "additive_mix": False,
        "num_classes": num_classes,
        "num_classes_aux": num_classes_aux,
        "num_workers": 8,
    }

    # NOTE(review): weight_decay=1 is unusually large - confirm it is intended.
    optimizer_config = {
        "name": "AdamW",
        "lr": 1e-4,
        "warmup_prop": 0.,
        "betas": (0.9, 0.999),
        "max_grad_norm": 1.,
        "weight_decay": 1,
    }

    epochs = 15

    # `verbose` is set once in the General section; the second identical assignment
    # that used to sit here was redundant and has been dropped.
    use_fp16 = True
    verbose_eval = 20

    fullfit = False
    n_fullfit = 1

    local_rank = 0
    distributed = False
    world_size = 1

In [None]:
# With DEBUG = True the training cell skips log-folder creation, so log_folder stays
# None when it is handed to k_fold.
DEBUG = True
log_folder = None

In [None]:
df = prepare_data_lvl2()

# Attach fold assignments from Config.folds_file when the prepared frame has none.
if not ("fold" in df.columns):
    folds = pd.read_csv(Config.folds_file)
    df = pd.merge(df, folds, how="left")

In [None]:
# Outside debug mode: create a fresh log folder under LOG_PATH, save the Config there as
# config.json, and set up a logger writing to logs.txt in that folder.
if not DEBUG:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f"Logging results to {log_folder}")
    config_df = save_config(Config, log_folder + "config.json")
    create_logger(directory=log_folder, name="logs.txt")

# Run k-fold training. `preds` is indexed against the rows of df in the Eval cells below.
preds = k_fold(Config, df, log_folder=log_folder, run=None)

### Eval

In [None]:
# for fold in range(3, 4):   
#     print(f'\n - Fold {fold + 1}') 
#     idx = df[df['fold'] == fold].index
#     df_val = df.iloc[idx]
#     preds_val = preds[idx]

#     avg_loss, losses = rsna_loss(df_val[Config.targets].values, preds_val)

#     for k, v in losses.items():
#         print(f"- {k}_loss\t: {v:.3f}")

#     print(f'\n -> CV Score : {avg_loss :.4f}')


In [None]:
# Score `preds` against the target columns; rsna_loss yields the average and a dict
# of individual loss components, which are listed one per line.
avg_loss, losses = rsna_loss(df[Config.targets].values, preds)

for loss_name, loss_value in losses.items():
    print(f"- {loss_name}_loss\t: {loss_value:.3f}")

print(f'\n -> CV Score : {avg_loss :.4f}')

In [None]:
# AUC of each target column against the matching slice of the predictions.
y_true = df[Config.targets].values

aucs = []
for class_idx in range(len(CLASSES)):
    class_auc = disk_auc(y_true[:, class_idx], preds[:, class_idx])
    print(f'{CLASSES[class_idx]} AUC: \t {class_auc :.4f}')
    aucs.append(class_auc)
print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

In [None]:
# Regroup predictions and targets: split the class axis into a 5 x 5 grid, swap the two
# grid axes, then fold the (new) first grid axis into the sample axis. Each row of
# p / y then holds 5 entries, one per CLASSES_CROP name.
# Presumably the grid is (condition, level) - TODO confirm against CLASSES ordering.
n_rows = preds.shape[0]

p = np.swapaxes(preds.reshape(n_rows, 5, 5, 3), 1, 2).reshape(-1, 5, 3)
y = np.swapaxes(df[Config.targets].values.reshape(n_rows, 5, 5), 1, 2).reshape(-1, 5)

aucs = []
for col, crop_name in enumerate(CLASSES_CROP):
    crop_auc = disk_auc(y[:, col], p[:, col])
    print(f'{crop_name} AUC: \t {crop_auc :.4f}')
    aucs.append(crop_auc)
print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

In [None]:
# NOTE(review): this evaluates disk_auc(y[:, i], p[:, i]) on the same y / p that the
# CLASSES_CROP loop uses, so each value equals the one printed there for the same index,
# only labelled with a LEVELS_ name. If per-level AUCs are wanted, y / p presumably need
# to be regrouped along the other grid axis first - confirm.
for i, c in enumerate(LEVELS_):
    print(c, disk_auc(y[:, i], p[:, i]))

- scs_loss	: 0.299
- nfn_loss	: 0.500
- ss_loss	: 0.593
- any_loss	: 0.291

 -> CV Score : 0.4206

In [None]:
def print_swap_report(truth_idx, pred_idx, y_true, y_pred):
    """
    Print how predictions made for one class score against another class's labels.

    Reports the reference AUC (labels and predictions of `truth_idx`), the "swap" AUC
    (labels of `truth_idx`, predictions of `pred_idx`), the fraction of equal labels
    among samples where both labels are > 0, and the mean Spearman correlation between
    the two classes' predictions at output indices 1 and 2.

    Args:
        truth_idx (int): Index in CLASSES of the class providing the labels.
        pred_idx (int): Index in CLASSES of the class providing the swapped predictions.
        y_true (np.ndarray): Labels, indexed as [sample, class].
        y_pred (np.ndarray): Predictions, indexed as [sample, class, output].
    """
    ref_auc = disk_auc(y_true[:, truth_idx], y_pred[:, truth_idx])
    swap_auc = disk_auc(y_true[:, truth_idx], y_pred[:, pred_idx])
    corr = (
        spearmanr(y_pred[:, truth_idx][:, 1], y_pred[:, pred_idx][:, 1]).statistic +
        spearmanr(y_pred[:, truth_idx][:, 2], y_pred[:, pred_idx][:, 2]).statistic
    ) / 2
    print(f'\npred: {CLASSES[pred_idx]} \t truth: {CLASSES[truth_idx]}')

    labels_truth = y_true[:, truth_idx]
    labels_pred = y_true[:, pred_idx]
    # Agreement is only measured where both labels are strictly positive.
    both_positive = (labels_truth > 0) & (labels_pred > 0)
    eq = (labels_truth[both_positive] == labels_pred[both_positive]).mean()

    print(f'Ref AUC          : {ref_auc :.4f}')
    print(f'Swap AUC         : {swap_auc :.4f}')
    print(f'Equal proportion : {eq:.3f}')
    print(f'Preds correlation: {corr:.3f}')


# Each base index is paired with the class 5 positions further in CLASSES
# (presumably the left/right counterpart - TODO confirm).
swap_base_indices = [5, 6, 7, 8, 9, 15, 16, 17, 18, 19]
swap_targets = df[Config.targets].values

# First pass: labels of the base class, predictions of its counterpart.
for base_idx in swap_base_indices:
    print_swap_report(base_idx, base_idx + 5, swap_targets, preds)

# Second pass: labels of the counterpart, predictions of the base class.
for base_idx in swap_base_indices:
    print_swap_report(base_idx + 5, base_idx, swap_targets, preds)

In [None]:
# Compute the loss components one study at a time, so outlier studies can be found.
y = df[Config.targets].values
study_ids = df["study_id"].values  # extracted once instead of on every iteration

loss_records = []
for i in tqdm(range(len(df))):
    # Single-row slices keep the 2-D layout rsna_loss is called with elsewhere;
    # element [1] of its return value is the dict of loss components.
    study_losses = rsna_loss(y[i:i+1], preds[i:i+1])[1]
    # Added last so that "study" ends up as the final DataFrame column.
    study_losses["study"] = study_ids[i]
    loss_records.append(study_losses)
losses = pd.DataFrame(loss_records)

In [None]:
# One histogram per loss column; the last column ("study") is an identifier and is skipped.
# The subplot grid follows the number of loss columns instead of assuming exactly four.
loss_columns = losses.columns[:-1]

plt.figure(figsize=(20, 5))
for i, c in enumerate(loss_columns):
    plt.subplot(1, len(loss_columns), i + 1)
    sns.histplot(losses[c].values)
    plt.title(c)
plt.show()

In [None]:
# Studies whose "scs" loss exceeds 2.
losses[losses["scs"] > 2]

In [None]:
# Look up the rows of one hard-coded study id (presumably one of the outliers
# listed by the loss filters - confirm).
df[df["study_id"] == 1972129014]

In [None]:
# Studies whose "any" loss exceeds 2.
losses[losses["any"] > 2]

- scs_loss	: 0.325
- nfn_loss	: 0.517
- ss_loss	: 0.634
- any_loss	: 0.297

 -> CV Score : 0.443

Done ! 