**About**: This notebook trains the level-2 RNN models on features produced by earlier experiments, then evaluates their out-of-fold predictions.

In [None]:
# Enable autoreload in mode 2 so edited project modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src/

### Imports

In [None]:
import os
import sys
import glob
import json
import torch
import operator
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from sklearn.metrics import *
from collections import Counter
from scipy.stats import spearmanr

warnings.simplefilter(action="ignore", category=UserWarning)

In [None]:
from util.logger import (
    prepare_log_folder,
    save_config,
    create_logger,
)

from data.dataset import FeatureDataset
from params import *
from data.preparation import *
from util.logger import Config as ConfigInf
from training.main_lvl2 import k_fold
from util.metrics import *

In [None]:
from model_zoo.models_lvl2 import define_model
from training.losses import StudyLoss
from util.metrics import rsna_loss

### Data

In [None]:
# Build the level-2 dataframe with the project helper.
df = prepare_data_lvl2()

# Attach fold assignments only when the prepared frame does not already carry them.
if "fold" not in df.columns:
    folds = pd.read_csv("../input/train_folded_v1.csv")
    # No `on=` is given, so pandas joins on every column name the two frames share.
    df = df.merge(folds, how="left")

In [None]:
# Keep only fold 0 for the exploratory cells that follow; df is rebuilt in full
# (prepare_data_lvl2 is called again) before training.
df = df[df['fold'] == 0].reset_index(drop=True)

In [None]:
# Feature sources for the exploratory FeatureDataset built below, keyed by name.
# The "crop" entry is also the folder from which config.json is read.
EXP_FOLDERS = {
    # "scs_crop": "../logs/2024-08-29/15/",  # 15
    # "nfn_crop": "../logs/2024-08-29/16/",  # 30
    "scs_crop_coords":  "../logs/2024-08-29/17/",
    # "nfn_crop_coords":  "../logs/2024-08-29/18/",
    "ss_crop_coords": "../logs/2024-09-10/11/", 

    # "dh": '../output/oof____cfg_dh_12s4c.pth',  # Darragh preds
    # "ch": '../output/oof_cfg_ch_35.pth',  # Dieter preds
    "crop": "../logs/2024-08-29/5/",  # 75
}

In [None]:
# Load the saved configuration of the "crop" experiment.
# The context manager closes the file handle instead of leaving it to the garbage collector.
with open(EXP_FOLDERS["crop"] + "config.json", "r") as config_file:
    config = ConfigInf(json.load(config_file))

In [None]:
df.head(1)

In [None]:
# Dataset over the current (fold-0) dataframe, fed by the sources listed in EXP_FOLDERS.
dataset = FeatureDataset(df, EXP_FOLDERS, targets=CLASSES)

In [None]:
dataset[0]

In [None]:
# p = []
# for i in tqdm(range(len(dataset))):
#     p_ = dataset[i][0]["nfn_crop_coords"].view(-1, 3).numpy()
#     # p_ = p_.reshape(5, 2, 3).transpose(1, 0, 2).reshape(10, 3)
#     p.append(p_)
# p = np.array(p)

# y = df[df.columns[8:18]].values

# aucs = []
# for i, c in enumerate(CLASSES[5:15]):
#     auc = disk_auc(y[:, i], p[:, i])
#     print(f'{c} AUC: \t {auc :.4f}')
#     aucs.append(auc)
# print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

In [None]:
# Smoke test: fetch every sample once so that any item that fails to load surfaces here.
# Each item unpacks into three values; the third is unused.
for i in tqdm(range(len(dataset))):
    fts, y, _ = dataset[i]
    # for k in fts:
    #     print(k, fts[k].size())
    # break

In [None]:
# Pull one sample for manual inspection in the next cell.
idx = 2
fts, y, _ = dataset[idx]

In [None]:
# Print the size of every entry in the sample's feature mapping.
for k in fts:
    print(k, fts[k].size())

In [None]:
# plt.figure(figsize=(20, 5))
# plt.subplot(1, 3, 1)
# for i in range(5):
#     plt.plot(fts['ss_aux'].softmax(1)[:, i], label=LEVELS[i])
# plt.legend()

# plt.subplot(1, 3, 2)
# for i in [1, 2]:
#     plt.plot(fts['ss'][:, i], label=f'left_{i}')
# for i in [4, 5]:
#     plt.plot(fts['ss'][:, i], label=f'right_{i - 3}')
# plt.legend()

# # plt.subplot(1, 3, 3)
# # for i in range(5):
# #     plt.plot(fts['ss'][:, 1, i], label=LEVELS[i])
# # plt.legend()

# plt.show()

In [None]:
# for k in fts:
#     print(k, fts[k].shape)

### Model

In [None]:
# model = define_model(
#     name="simple",
#     num_classes=len(CLASSES) * 3,
#     layer_dim=0,
#     ft_dim=64,
#     n_fts=45 + 75,
#     resize=10,
# )

In [None]:
# x = {k: fts[k].unsqueeze(0) for k in fts}

# pred, _ = model(x)
# pred.size()

### Training

In [None]:
class Config:
    """
    Parameters used for training the level-2 model.

    The class is used as a plain namespace: it is passed uninstantiated to
    `save_config` and `k_fold`, and its attributes are read directly
    (e.g. `Config.targets`, `Config.folds_file`).
    """
    # General
    seed = 42
    verbose = 1
    device = "cuda"
    save_weights = True
    targets = CLASSES

    # Data
    # Feature sources keyed by name; values are either an experiment log folder
    # or a saved .pth file.
    exp_folders = {
        # "scs_crop": "../logs/2024-08-29/15/",  # 15
        # "nfn_crop": "../logs/2024-08-29/16/",  # 30
        # "scs_crop_coords":  "../logs/2024-08-29/17/",
        # "nfn_crop_coords":  "../logs/2024-08-29/18/",

        # # "scs_crop_coords": "../logs/2024-09-12/1/",  # 5f -0.005 scs
        "scs_crop_coords_2": "../logs/2024-09-12/9/",  # 3f -0.005 scs

        "dh": '../output/oof____cfg_dh_12s4c.pth',  # Darragh preds
        "dh_2": "../output/oof____cfg_dh_19a.pth",  # Darragh preds
        # # "dh": "../output/oof____cfg_dh_15c.pth",  # Darragh preds
        # # "dh": "../output/oof____cfg_dh_15c_2seed.pth",  # Darragh preds

        "ch": '../output/oof_cfg_ch_35.pth',  # Dieter preds

        # "crop": "../logs/2024-09-12/21/",  # coatnet side 5fs2
        "crop": "../logs/2024-09-13/7/",  # coatnet side 7fs2  <---- best
        # "crop_2": "../logs/2024-09-13/1/",  # coatnet side 7fs2

    }
    n_fts = 0
    resize = 0
    remove_noisy = False

    # k-fold
    k = 4
    folds_file = "../input/train_folded_v1.csv"  # f"../input/folds_{k}.csv"
    selected_folds = [0, 1, 2, 3]

    # Model
    name = "simple"
    dense_dim = 4096
    layer_dim = 0
    # Starts at 6 and adds 3 when the exact key "dh" is present, 3 more for "ch".
    # NOTE(review): "dh_2" is also in exp_folders but is not counted by these
    # exact-key checks — confirm this is intended.
    ft = 6 + 3 * ("dh" in exp_folders) + 3 * ("ch" in exp_folders)
    # One entry per target group; each adds 3 per exp_folders key containing the
    # given substring. The comprehension variable `k` is local to the
    # comprehension and does not overwrite the class attribute `k` above.
    # NOTE(review): the third (ss) entry counts "nfn" keys, not "ss" keys —
    # confirm this is intended.
    ft_dim = [
        ft + 3 * len([k for k in exp_folders if "scs" in k]),
        ft + 3 * len([k for k in exp_folders if "nfn" in k]),
        ft + 3 * len([k for k in exp_folders if "nfn" in k]),
    ]  # scs, nfn, ss

    p = 0.
    num_classes = len(CLASSES) * 3
    num_classes_aux = 0

    # Training
    loss_config = {
        "name": "study",
        "weighted": True,
        "use_any": True,
        "smoothing": 0,
        "activation": "study",
        "aux_loss_weight": 0,
        "name_aux": "",
        "smoothing_aux": 0,
        "activation_aux": "",
    }

    data_config = {
        "batch_size": 128,
        "val_bs": 512,
        "mix": "mixup",
        "mix_proba": 0.,
        "sched": False,
        "mix_alpha": 4.,
        "additive_mix": False,
        "num_classes": num_classes,
        "num_classes_aux": num_classes_aux,
        "num_workers": 8,
    }

    optimizer_config = {
        "name": "AdamW",
        "lr": 7e-5,  # 5e-5
        "warmup_prop": 0.,
        "betas": (0.9, 0.999),
        "max_grad_norm": 1.,
        "weight_decay": 1,
    }

    epochs = 15

    use_fp16 = True
    # `verbose` is set once in the General section above.
    verbose_eval = 20

    fullfit = False
    n_fullfit = 1

    local_rank = 0
    distributed = False
    world_size = 1

In [None]:
# With DEBUG on, the training cell skips log-folder creation, config saving and logger
# setup, and k_fold receives log_folder=None.
DEBUG = True
log_folder = None

In [None]:
# Rebuild the full dataframe for training; this replaces the fold-0 subset used above.
df = prepare_data_lvl2()

# Attach fold assignments only when the prepared frame does not already carry them.
if "fold" not in df.columns:
    folds = pd.read_csv(Config.folds_file)
    # No `on=` is given, so pandas joins on every column name the two frames share.
    df = df.merge(folds, how="left")

# df = df[~df['study_id'].isin([1215498865, 1647904243, 2570933394, 2761048584, 3284652867, 3941522676])].reset_index(drop=True)

In [None]:
# For real runs only: create a log folder, save the config into it and start a file logger.
# With DEBUG set, log_folder keeps the value assigned earlier.
if not DEBUG:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f"Logging results to {log_folder}")
    config_df = save_config(Config, log_folder + "config.json")
    create_logger(directory=log_folder, name="logs.txt")

# Run the k-fold training. The result is scored against df[Config.targets] in the Eval
# section, so it is presumably row-aligned with df — confirm against k_fold.
preds = k_fold(Config, df, log_folder=log_folder, run=None)

### Eval

In [None]:
# Score the predictions against the targets: rsna_loss returns the overall value
# and a mapping of named component losses.
avg_loss, losses = rsna_loss(df[Config.targets].values, preds)

# One line per component loss, followed by the overall score.
for loss_name, loss_value in losses.items():
    print(f"- {loss_name}_loss\t: {loss_value:.3f}")

print(f'\n -> CV Score : {avg_loss :.4f}')

- scs_loss	: 0.267
- nfn_loss	: 0.484
- ss_loss	: 0.552
- any_loss	: 0.264

 -> CV Score : 0.3915

In [None]:
# Load predictions saved by an earlier run (presumably out-of-fold, going by the file name).
# NOTE(review): this overwrites the `preds` returned by k_fold, so every metric computed
# after this cell describes the saved run rather than the training just performed —
# confirm this is intended before comparing scores.
preds = np.load("../logs/2024-09-09/2/pred_oof.npy")

In [None]:
# yy = pkl["target"].contiguous()[order].cpu().numpy().clip(-1, 2)
# for i in range(len(df)):
#     if not (df[Config.targets].values[i] == yy[i]).all():
#         print(i, yy[i].tolist(), df[Config.targets].values[i].tolist())
#         display(df.iloc[[i]])
#         # break

In [None]:
# pkl = torch.load('../output/oof_cfg_ch_35.pth')
# pkl = torch.load('../output/oof____cfg_dh_12s1.pth')

# df = df[~df['study_id'].isin([1215498865, 1647904243, 2570933394, 2761048584, 3284652867, 3941522676])].reset_index(drop=True)

# order = [pkl['study_id'].tolist().index(s) for s in df['study_id'].values]

# avg_loss, losses = rsna_loss(df[Config.targets].values, pkl["logits"].cpu().float().softmax(-1).numpy()[order])

# for k, v in losses.items():
#     print(f"- {k}_loss\t: {v:.3f}")

# print(f'\n -> CV Score : {avg_loss :.4f}')

In [None]:
# aucs = []
# for i, c in enumerate(CLASSES):
#     auc = disk_auc(df[Config.targets].values[:, i], preds[:, i])
#     print(f'{c} AUC: \t {auc :.4f}')
#     aucs.append(auc)
# print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

In [None]:
# aucs = []
# for i, c in enumerate(CLASSES):
#     auc = disk_auc(df[Config.targets].values[:, i], preds_dd[:, i])
#     print(f'{c} AUC: \t {auc :.4f}')
#     aucs.append(auc)
# print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

In [None]:
# preds = preds[df['fold'].values == 0]
# df = df[df['fold'].values == 0]

In [None]:
n_studies = preds.shape[0]

# View the targets of each study as a 5 x 5 grid, swap the two grid axes, then merge
# the study axis with the first grid axis: every row keeps 5 columns (and, for the
# predictions, 3 values per column).
p = (
    preds.reshape(n_studies, 5, 5, 3)
    .transpose(0, 2, 1, 3)
    .reshape(-1, 5, 3)
)
y = (
    df[Config.targets].values
    .reshape(n_studies, 5, 5)
    .transpose(0, 2, 1)
    .reshape(-1, 5)
)

# AUC per column, labelled with the CLASSES_CROP names, then their mean.
aucs = []
for col, name in enumerate(CLASSES_CROP):
    score = disk_auc(y[:, col], p[:, col])
    print(f'{name} AUC: \t {score :.4f}')
    aucs.append(score)
print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

In [None]:
# Per-column AUC of the reshaped y / p built in the previous cell, labelled with LEVELS_.
# NOTE(review): these are the same y[:, i] / p[:, i] columns that the previous cell scores
# under the CLASSES_CROP names, so the values repeat under different labels — confirm the
# LEVELS_ labels actually match this axis.
for i, c in enumerate(LEVELS_):
    print(c, disk_auc(y[:, i], p[:, i]))

In [None]:
# The target matrix does not change inside the loop: build it once.
targets_all = df[Config.targets].values

# For each target index i, compare scoring target i with its own predictions ("Ref")
# against scoring it with the predictions made for target i + 5 ("Swap").
# NOTE(review): presumably i indexes one side and i + 5 the opposite side of the same
# condition/level — confirm against the ordering of CLASSES.
for i in [5, 6, 7, 8, 9, 15, 16, 17, 18, 19]:
    j = i + 5
    c = CLASSES[i]
    c2 = CLASSES[j]

    y1 = targets_all[:, i]
    y2 = targets_all[:, j]

    ref_auc = disk_auc(y1, preds[:, i])
    auc = disk_auc(y1, preds[:, j])

    # Mean Spearman correlation between the two prediction sets, over channels 1 and 2.
    r = (
        spearmanr(preds[:, i][:, 1], preds[:, j][:, 1]).statistic
        + spearmanr(preds[:, i][:, 2], preds[:, j][:, 2]).statistic
    ) / 2
    print(f'\npred: {c2} \t truth: {c}')
    # print(r)

    # Share of identical labels among rows where both labels are > 0.
    # Reported as NaN when no row qualifies, instead of taking the mean of an empty array.
    both_positive = (y1 > 0) & (y2 > 0)
    if both_positive.any():
        eq = (y1[both_positive] == y2[both_positive]).mean()
    else:
        eq = float("nan")

    print(f'Ref AUC          : {ref_auc :.4f}')
    print(f'Swap AUC         : {auc :.4f}')
    print(f'Equal proportion : {eq:.3f}')
    print(f'Preds correlation: {r:.3f}')


# for j in [5, 6, 7, 8, 9, 15, 16, 17, 18, 19]:
#     i = j + 5
#     c = CLASSES[i]
#     c2 = CLASSES[j]
    
#     ref_auc = disk_auc(df[Config.targets].values[:, i], preds[:, i])
#     auc = disk_auc(df[Config.targets].values[:, i], preds[:, j])
#     r = (
#         spearmanr(preds[:, i][:, 1], preds[:, j][:, 1]).statistic + 
#         spearmanr(preds[:, i][:, 2], preds[:, j][:, 2]).statistic
#     ) / 2
#     print(f'\npred: {c2} \t truth: {c}')
#     # print(r)
#     y1 = df[Config.targets].values[:, i]
#     y2 = df[Config.targets].values[:, j]
    
#     eq = (y1[(y1 > 0) & (y2 > 0)] == y2[(y1 > 0) & (y2 > 0)]).mean()
#     print(f'Ref AUC          : {ref_auc :.4f}')
#     print(f'Swap AUC         : {auc :.4f}')
#     print(f'Equal proportion : {eq:.3f}')
#     print(f'Preds correlation: {r:.3f}')

In [None]:
# Compute the component losses one study at a time, by scoring single-row slices.
y = df[Config.targets].values
study_ids = df["study_id"].values

records = []
for row in tqdm(range(len(df))):
    # Second element of rsna_loss's return value: the mapping of component losses.
    per_study = rsna_loss(y[row:row + 1], preds[row:row + 1])[1]
    per_study.update({"study": study_ids[row]})
    records.append(per_study)

# One row per study; the "study" column is added last.
losses = pd.DataFrame(records)

In [None]:
# Histogram of every per-study loss column, one panel each.
# Select the loss columns by name rather than by position, and size the panel grid from
# their count, so the cell keeps working if the number of loss components changes.
loss_columns = [col for col in losses.columns if col != "study"]

plt.figure(figsize=(20, 5))
for i, c in enumerate(loss_columns):
    plt.subplot(1, len(loss_columns), i + 1)
    sns.histplot(losses[c].values)
    plt.title(c)
plt.show()

In [None]:
losses[losses["scs"] > 2]

In [None]:
# Show the rows of one specific study.
# NOTE(review): presumably an id picked from the high-loss table in the previous cell — confirm.
df[df["study_id"] == 1972129014]

In [None]:
losses[losses["any"] > 2]

- scs_loss	: 0.325
- nfn_loss	: 0.517
- ss_loss	: 0.634
- any_loss	: 0.297

 -> CV Score : 0.443

Done!