**About**: This notebook trains the level-2 RNN models on first-level model predictions and evaluates their cross-validation score.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
cd ../src/

/workspace/kaggle_rsna_lumbar_spine/src


### Imports

In [3]:
import os
import sys
import glob
import json
import torch
import operator
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from sklearn.metrics import *
from collections import Counter
from scipy.stats import spearmanr

warnings.simplefilter(action="ignore", category=UserWarning)
warnings.simplefilter("ignore", FutureWarning)

In [4]:
from util.logger import (
    prepare_log_folder,
    save_config,
    create_logger,
)

from data.dataset import FeatureDataset
from params import *
from data.preparation import *
from util.logger import Config as ConfigInf
from training.main_lvl2 import k_fold
from util.metrics import *

In [5]:
from model_zoo.models_lvl2 import define_model
from training.losses import StudyLoss
from util.metrics import rsna_loss

### Data

In [6]:
# Build the frame returned by `prepare_data_lvl2`, then attach the fold
# assignment if that frame does not already carry a "fold" column.
df = prepare_data_lvl2()

if "fold" not in df.columns:
    folds = pd.read_csv("../input/train_folded_v1.csv")
    # No `on=` is given, so pandas joins on every column the two frames share.
    # NOTE(review): presumably that is just `study_id` — confirm against the CSV.
    df = df.merge(folds, how="left")

In [70]:
# Feature sources handed to FeatureDataset: key = feature name, value = where the
# predictions live (an experiment folder, a .pth file or a .csv file).
# Commented-out entries are earlier experiments kept for reference.
EXP_FOLDERS = {
    # "scs_crop": "../logs/2024-08-29/15/",  # 15
    # "nfn_crop": "../logs/2024-08-29/16/",  # 30
    # "scs_crop_coords":  "../logs/2024-08-29/17/",
    # "nfn_crop_coords":  "../logs/2024-08-29/18/",

    "dh": '../output/oof____cfg_dh_12s4c.pth',  # Darragh preds
    "dh_2": '../output/oof____cfg_dh_12s4c_seed1.pth',  # Darragh preds
    # "ch": '../output/oof_cfg_ch_35.pth',  # Dieter preds

    # "crop_2": "../logs/2024-09-11/1/",
    # "dh": "../output/oof____cfg_dh_19a.pth",  # Darragh preds

    "crop": "../output/2024-08-29_5/",  # 75
    "spinenet": "../output/spinenet_preds.csv",  # 75
}

In [71]:
# Load the config.json stored next to the "crop" experiment outputs.
# The context manager closes the file handle as soon as the JSON is parsed
# instead of leaving it open until garbage collection.
with open(EXP_FOLDERS["crop"] + "config.json", "r") as config_file:
    config = ConfigInf(json.load(config_file))

In [72]:
df.head(1)

Unnamed: 0,study_id,series_id,series_description,spinal_canal_stenosis_l1_l2,spinal_canal_stenosis_l2_l3,spinal_canal_stenosis_l3_l4,spinal_canal_stenosis_l4_l5,spinal_canal_stenosis_l5_s1,left_neural_foraminal_narrowing_l1_l2,left_neural_foraminal_narrowing_l2_l3,...,left_subarticular_stenosis_l2_l3,left_subarticular_stenosis_l3_l4,left_subarticular_stenosis_l4_l5,left_subarticular_stenosis_l5_s1,right_subarticular_stenosis_l1_l2,right_subarticular_stenosis_l2_l3,right_subarticular_stenosis_l3_l4,right_subarticular_stenosis_l4_l5,right_subarticular_stenosis_l5_s1,fold
0,4003253,"[702807833, 1054713880, 2448190387]","[Sagittal T2/STIR, Sagittal T1, Axial T2]",0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [73]:
dataset = FeatureDataset(df, EXP_FOLDERS, targets=CLASSES)

In [74]:
# p = []
# for i in tqdm(range(len(dataset))):
#     p_ = dataset[i][0]["nfn_crop_coords"].view(-1, 3).numpy()
#     # p_ = p_.reshape(5, 2, 3).transpose(1, 0, 2).reshape(10, 3)
#     p.append(p_)
# p = np.array(p)

# y = df[df.columns[8:18]].values

# aucs = []
# for i, c in enumerate(CLASSES[5:15]):
#     auc = disk_auc(y[:, i], p[:, i])
#     print(f'{c} AUC: \t {auc :.4f}')
#     aucs.append(auc)
# print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

In [75]:
# Iterate over every sample once. The loaded items are discarded, so this only
# checks that each index can be read; `fts` and `y` keep the last item afterwards.
for i in tqdm(range(len(dataset))):
    fts, y, _ = dataset[i]
    # for k in fts:
    #     print(k, fts[k].size())
    # break

  0%|          | 0/1975 [00:00<?, ?it/s]

In [76]:
idx = 2
fts, y, _ = dataset[idx]

In [77]:
for k in fts:
    print(k, fts[k].size())

dh torch.Size([25, 3])
dh_2 torch.Size([25, 3])
crop torch.Size([25, 6])
spinenet torch.Size([5, 12])


In [15]:
# plt.figure(figsize=(20, 5))
# plt.subplot(1, 3, 1)
# for i in range(5):
#     plt.plot(fts['ss_aux'].softmax(1)[:, i], label=LEVELS[i])
# plt.legend()

# plt.subplot(1, 3, 2)
# for i in [1, 2]:
#     plt.plot(fts['ss'][:, i], label=f'left_{i}')
# for i in [4, 5]:
#     plt.plot(fts['ss'][:, i], label=f'right_{i - 3}')
# plt.legend()

# # plt.subplot(1, 3, 3)
# # for i in range(5):
# #     plt.plot(fts['ss'][:, 1, i], label=LEVELS[i])
# # plt.legend()

# plt.show()

In [16]:
# for k in fts:
#     print(k, fts[k].shape)

### Model

In [17]:
# model = define_model(
#     name="simple",
#     num_classes=len(CLASSES) * 3,
#     layer_dim=0,
#     ft_dim=64,
#     n_fts=45 + 75,
#     resize=10,
# )

In [18]:
# Add a leading batch axis of size 1 to every feature tensor of the sample.
x = {}
for k in fts:
    x[k] = fts[k].unsqueeze(0)

In [19]:
# pred, _ = model(x)
# pred.size()

In [20]:
# l = StudyLoss()
# l(pred, y.unsqueeze(0))

In [21]:
# rsna_loss(y.unsqueeze(0).numpy(), pred.softmax(2).detach().numpy())

In [22]:
# Stack the `target` column of this run's df_val_0.csv into one 2-D array;
# `eval` turns each stored cell back into a Python object first.
# NOTE(review): `eval` executes whatever the CSV contains — only safe for
# trusted, self-generated files; `ast.literal_eval` is the safer parser.
df_val = pd.read_csv('../logs/2024-09-11/1/df_val_0.csv')
y1 = np.vstack(df_val["target"].apply(eval).values)

In [23]:
# Same extraction for the run stored in ../output/2024-08-29_5/ (this
# overwrites `df_val` from the cell above).
# NOTE(review): `eval` executes whatever the CSV contains — only safe for
# trusted, self-generated files; `ast.literal_eval` is the safer parser.
df_val = pd.read_csv('../output/2024-08-29_5/df_val_0.csv')
y2 = np.vstack(df_val["target"].apply(eval).values)

In [68]:
from params import NOISY_STUDIES
len(NOISY_STUDIES)

100

### Training


In [85]:
class Config:
    """
    Parameters used for training the level-2 model with `k_fold`.

    Everything is a plain class attribute, so the class itself is what gets
    passed to `k_fold` and `save_config`. `ft` and `ft_dim` are computed from
    the keys of `exp_folders` when the class body runs, so they follow
    whichever feature sources are enabled there.
    """
    # General
    seed = 42
    verbose = 1
    device = "cuda"
    save_weights = True
    targets = CLASSES

    # Data
    # key = feature name, value = folder / file holding the predictions.
    # Commented-out entries are earlier experiments kept for reference.
    exp_folders = {
        # "scs_crop": "../logs/2024-08-29/15/",  # 15
        # "nfn_crop": "../logs/2024-08-29/16/",  # 30
        # "scs_crop_coords":  "../logs/2024-08-29/17/",

        # "nfn_crop_coords":  "../logs/2024-09-15/2/",

        # "scs_crop_coords": "../logs/2024-09-12/1/",  # 5f -0.005 scs
        # "scs_crop_coords_2": "../logs/2024-09-12/9/",  # 3f -0.005 scs

        # "dh": '../output/oof____cfg_dh_12s4c.pth',  # Darragh preds
        # "dh_2": "../output/oof____cfg_dh_19a.pth",  # Darragh preds
        # # # "dh": "../output/oof____cfg_dh_15c.pth",  # Darragh preds
        # # # "dh": "../output/oof____cfg_dh_15c_2seed.pth",  # Darragh preds

        # "ch": '../output/oof_cfg_ch_35.pth',  # Dieter preds

        # "crop": "../logs/2024-09-12/21/",  # coatnet side
        # "crop": "../logs/2024-09-13/7/",  # coatnet side
        # "crop_2": "../logs/2024-09-13/1/",  # coatnet side
        "crop": "../logs/2024-09-14/5/",   # coatnet side  <---- best

        "spinenet": "../output/spinenet_preds.csv",

        # "crop": "../logs/2024-09-16/3/",
    }
    n_fts = 0
    resize = 0
    remove_noisy = False

    # k-fold
    k = 4
    folds_file = "../input/train_folded_v1.csv"  # f"../input/folds_{k}.csv"
    selected_folds = [0]  # , 1, 2, 3]

    # Model
    name = "simple"
    dense_dim = 4096
    layer_dim = 0
    # Shared feature count: 6, plus 3 for each of the exact keys "dh" / "ch"
    # (keys such as "dh_2" do not match and add nothing here).
    ft = 6 + 3 * ("dh" in exp_folders) + 3 * ("ch" in exp_folders)
    # One size per target group: 3 extra per key containing the group tag, plus
    # the spinenet share (0 / 8 / 8).
    # NOTE(review): the 6 / 3 / 8 constants presumably mirror the feature widths
    # produced by FeatureDataset — confirm before adding a new source.
    ft_dim = [
        ft + 3 * len([k for k in exp_folders if "scs" in k]) + 0 * ("spinenet" in exp_folders),
        ft + 3 * len([k for k in exp_folders if "nfn" in k]) + 8 * ("spinenet" in exp_folders),
        ft + 3 * len([k for k in exp_folders if "ss" in k]) + 8 * ("spinenet" in exp_folders),
    ]  # scs, nfn, ss

    p = 0.
    num_classes = len(CLASSES) * 3
    num_classes_aux = 0

    # Training
    loss_config = {
        "name": "study",
        "weighted": True,
        "use_any": True,
        "smoothing": 0,
        "activation": "study",
        "aux_loss_weight": 0,
        "name_aux": "",
        "smoothing_aux": 0,
        "activation_aux": "",
    }

    data_config = {
        "batch_size": 128,
        "val_bs": 512,
        "mix": "mixup",
        "mix_proba": 0.,
        "sched": False,
        "mix_alpha": 4.,
        "additive_mix": False,
        "num_classes": num_classes,
        "num_classes_aux": num_classes_aux,
        "num_workers": 8,
    }

    optimizer_config = {
        "name": "AdamW",
        "lr": 7e-5,  # 5e-5
        "warmup_prop": 0.,
        "betas": (0.9, 0.999),
        "max_grad_norm": 1.,
        "weight_decay": 1,
    }

    epochs = 15

    use_fp16 = True
    # `verbose` is set once in the General section above; the second, identical
    # assignment that used to sit here was removed.
    verbose_eval = 20

    fullfit = False
    n_fullfit = 1

    local_rank = 0
    distributed = False
    world_size = 1

In [86]:
# With DEBUG = True the training cell skips its log-folder setup
# (`prepare_log_folder` / `save_config` / `create_logger` are not called).
DEBUG = True
log_folder = None  # passed to k_fold unchanged when DEBUG is True

In [87]:
# Rebuild the frame from scratch for training, this time reading the fold
# assignment from the path configured in `Config.folds_file`.
df = prepare_data_lvl2()

if "fold" not in df.columns:
    folds = pd.read_csv(Config.folds_file)
    # No `on=` is given, so pandas joins on every column the two frames share.
    # NOTE(review): presumably that is just `study_id` — confirm against the CSV.
    df = df.merge(folds, how="left")

# df = df[~df['study_id'].isin([1215498865, 1647904243, 2570933394, 2761048584, 3284652867, 3941522676])].reset_index(drop=True)

In [89]:
# Outside debug mode: create a log folder under LOG_PATH, store the Config as
# config.json inside it and start a logger writing to logs.txt.
# `log_folder + "config.json"` relies on `log_folder` ending with a path
# separator — NOTE(review): confirm `prepare_log_folder` guarantees that.
if not DEBUG:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f"Logging results to {log_folder}")
    config_df = save_config(Config, log_folder + "config.json")  # return value is not used below
    create_logger(directory=log_folder, name="logs.txt")

# Run training on the folds selected in Config; `preds` is what the evaluation
# cells below score against `df[Config.targets]`.
preds = k_fold(Config, df, log_folder=log_folder, run=None)


-------------   Fold 1 / 4  -------------

    -> 1481 training studies
    -> 494 validation studies
    -> 188425 trainable parameters

Epoch 02/15 (step 0020) 	lr=6.2e-05 	 t=1s 	 loss=0.659    scs_loss=0.293    nfn_loss=0.561    ss_loss=0.596    any_loss=0.262	 val_loss=0.428
Epoch 04/15 (step 0040) 	lr=5.3e-05 	 t=1s 	 loss=0.417    scs_loss=0.257    nfn_loss=0.486    ss_loss=0.555    any_loss=0.251	 val_loss=0.387
Epoch 06/15 (step 0060) 	lr=4.5e-05 	 t=1s 	 loss=0.409    scs_loss=0.248    nfn_loss=0.480    ss_loss=0.555    any_loss=0.249	 val_loss=0.383
Epoch 08/15 (step 0080) 	lr=3.6e-05 	 t=1s 	 loss=0.403    scs_loss=0.247    nfn_loss=0.478    ss_loss=0.551    any_loss=0.248	 val_loss=0.381
Epoch 09/15 (step 0100) 	lr=2.8e-05 	 t=1s 	 loss=0.398    scs_loss=0.248    nfn_loss=0.477    ss_loss=0.552    any_loss=0.249	 val_loss=0.381
Epoch 11/15 (step 0120) 	lr=2.0e-05 	 t=1s 	 loss=0.401    scs_loss=0.247    nfn_loss=0.477    ss_loss=0.553    any_loss=0.248	 val_loss=0.381
Epo

- 12 cls : scs_loss=0.260    nfn_loss=0.477    ss_loss=0.552    any_loss=0.268	 val_loss=0.390
- No spinenet : scs_loss=0.247    nfn_loss=0.475    ss_loss=0.549    any_loss=0.248	 val_loss=0.380

### Eval

In [84]:
# Score the predictions returned by k_fold: one loss per entry of `losses`
# plus the overall value reported as the CV score.
avg_loss, losses = rsna_loss(df[Config.targets].values, preds)

for name in losses:
    print(f"- {name}_loss\t: {losses[name]:.3f}")

print(f'\n -> CV Score : {avg_loss :.4f}')

- scs_loss	: 0.262
- nfn_loss	: 0.476
- ss_loss	: 0.539
- any_loss	: 0.256

 -> CV Score : 0.3835


In [61]:
# Re-run of the CV evaluation (code identical to the other evaluation cells in
# this section); the output recorded below comes from a different execution.
avg_loss, losses = rsna_loss(df[Config.targets].values, preds)

for k, v in losses.items():
    print(f"- {k}_loss\t: {v:.3f}")

print(f'\n -> CV Score : {avg_loss :.4f}')

- scs_loss	: 0.270
- nfn_loss	: 0.491
- ss_loss	: 0.564
- any_loss	: 0.266

 -> CV Score : 0.3978


In [59]:
# Re-run of the CV evaluation (code identical to the other evaluation cells in
# this section); the output recorded below comes from a different execution.
avg_loss, losses = rsna_loss(df[Config.targets].values, preds)

for k, v in losses.items():
    print(f"- {k}_loss\t: {v:.3f}")

print(f'\n -> CV Score : {avg_loss :.4f}')

- scs_loss	: 0.274
- nfn_loss	: 0.491
- ss_loss	: 0.565
- any_loss	: 0.275

 -> CV Score : 0.4012


In [108]:
# Re-run of the CV evaluation (code identical to the other evaluation cells in
# this section); the output recorded below comes from a different execution.
avg_loss, losses = rsna_loss(df[Config.targets].values, preds)

for k, v in losses.items():
    print(f"- {k}_loss\t: {v:.3f}")

print(f'\n -> CV Score : {avg_loss :.4f}')

- scs_loss	: 0.261
- nfn_loss	: 0.474
- ss_loss	: 0.539
- any_loss	: 0.257

 -> CV Score : 0.3829


In [29]:
# yy = pkl["target"].contiguous()[order].cpu().numpy().clip(-1, 2)
# for i in range(len(df)):
#     if not (df[Config.targets].values[i] == yy[i]).all():
#         print(i, yy[i].tolist(), df[Config.targets].values[i].tolist())
#         display(df.iloc[[i]])
#         # break

In [75]:
# pkl = torch.load('../output/oof_cfg_ch_35.pth')
# pkl = torch.load('../output/oof____cfg_dh_12s1.pth')

# df = df[~df['study_id'].isin([1215498865, 1647904243, 2570933394, 2761048584, 3284652867, 3941522676])].reset_index(drop=True)

# order = [pkl['study_id'].tolist().index(s) for s in df['study_id'].values]

# avg_loss, losses = rsna_loss(df[Config.targets].values, pkl["logits"].cpu().float().softmax(-1).numpy()[order])

# for k, v in losses.items():
#     print(f"- {k}_loss\t: {v:.3f}")

# print(f'\n -> CV Score : {avg_loss :.4f}')

In [31]:
# aucs = []
# for i, c in enumerate(CLASSES):
#     auc = disk_auc(df[Config.targets].values[:, i], preds[:, i])
#     print(f'{c} AUC: \t {auc :.4f}')
#     aucs.append(auc)
# print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

In [32]:
# aucs = []
# for i, c in enumerate(CLASSES):
#     auc = disk_auc(df[Config.targets].values[:, i], preds_dd[:, i])
#     print(f'{c} AUC: \t {auc :.4f}')
#     aucs.append(auc)
# print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

In [33]:
# preds = preds[df['fold'].values == 0]
# df = df[df['fold'].values == 0]

In [124]:
y.shape, preds.shape

((1975, 25), (1975, 25, 3))

In [29]:
# EXP_FOLDER = "../logs/2024-09-02/27/"

# p = np.load(EXP_FOLDER + "pred_inf_0.npy").reshape(-1, 10, 3)
# df_val = pd.read_csv(EXP_FOLDER + "df_val_0.csv")

y = dataset.targets

errors, aucs = [], []
for i, c in enumerate(CLASSES):
    auc = disk_auc(y[:, i], preds[:, i])

    # One vector per label value 0/1/2: absolute gap between the indicator
    # "label == value" and the prediction for that value.
    errors.extend(np.abs((y[:, i]== j) - preds[:, i, j]) for j in range(3))

    print(f'{c} AUC: \t {auc :.4f}')
    aucs.append(auc)
print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

spinal_canal_stenosis_l1_l2 AUC: 	 0.9899
spinal_canal_stenosis_l2_l3 AUC: 	 0.9730
spinal_canal_stenosis_l3_l4 AUC: 	 0.9658
spinal_canal_stenosis_l4_l5 AUC: 	 0.9553
spinal_canal_stenosis_l5_s1 AUC: 	 0.9426
left_neural_foraminal_narrowing_l1_l2 AUC: 	 0.9555
left_neural_foraminal_narrowing_l2_l3 AUC: 	 0.9487
left_neural_foraminal_narrowing_l3_l4 AUC: 	 0.8970
left_neural_foraminal_narrowing_l4_l5 AUC: 	 0.8717
left_neural_foraminal_narrowing_l5_s1 AUC: 	 0.8619
right_neural_foraminal_narrowing_l1_l2 AUC: 	 0.9706
right_neural_foraminal_narrowing_l2_l3 AUC: 	 0.9337
right_neural_foraminal_narrowing_l3_l4 AUC: 	 0.9130
right_neural_foraminal_narrowing_l4_l5 AUC: 	 0.8834
right_neural_foraminal_narrowing_l5_s1 AUC: 	 0.8687
left_subarticular_stenosis_l1_l2 AUC: 	 0.9626
left_subarticular_stenosis_l2_l3 AUC: 	 0.9293
left_subarticular_stenosis_l3_l4 AUC: 	 0.9117
left_subarticular_stenosis_l4_l5 AUC: 	 0.8629
left_subarticular_stenosis_l5_s1 AUC: 	 0.8716
right_subarticular_stenosis_l1

In [30]:
# Average the per-(class, label value) error vectors into one value per row of df.
# The result gets its own name so `errors` stays a list: overwriting it made the
# cell non-idempotent (a second run reduced it to a scalar that pandas silently
# broadcast into every row of df['error']).
study_errors = np.array(errors).mean(0)
df['error'] = study_errors

In [33]:
df.sort_values('error', ascending=False).study_id.values[:500]

array([ 783154228,  520900899, 1217004843, 3828017267, 3369277408,
       1743493727, 1301627154,  413910863, 4201106871, 1088270559,
       3713534743, 3469376405,  105895264, 1289563234, 2638691430,
        808294521, 3008676218, 1314603564, 4266523380, 1179643011,
       1891482189, 2668759897,  757619082, 1666601651, 4072191052,
        901299313,  704573554, 1548005561, 2966328820,  959290081,
       2621581337, 1459964234, 3996069892, 2238966046, 1106510276,
        189360935,  341051344,  305152236, 4172077685, 3968285352,
       2046176090,   58813022, 2991382385, 3227154093, 3515641631,
       1791596037, 3740680860, 4058604433, 1871675162, 3781188430,
       3337564969, 3201694970,  325485990,  504362668, 1755159626,
       3029953735, 1972129014,  264945797,  497870715, 3507369254,
       4259049254, 3192842688, 1176604093, 1353517692, 3617361428,
        677879566,  796739553,  885894528,  296083289,  618246392,
       2797118205, 2059107661,  976356113, 1697944783, 4279881

In [None]:
n_rows = preds.shape[0]

# Split the class axis into two 5-sized axes, swap them, then merge the first
# of the two into the leading axis: every original row becomes 5 rows.
p = preds.reshape(n_rows, 5, 5, 3).transpose(0, 2, 1, 3).reshape(-1, 5, 3)

# Apply the same regrouping to the targets so rows of `y` and `p` stay aligned.
y = df[Config.targets].values.reshape(n_rows, 5, 5).transpose(0, 2, 1).reshape(-1, 5)

aucs = []
for col, name in enumerate(CLASSES_CROP):
    score = disk_auc(y[:, col], p[:, col])
    print(f'{name} AUC: \t {score :.4f}')
    aucs.append(score)
print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

In [None]:
# Print one AUC per entry of LEVELS_, using column i of the current `y` and `p`.
# NOTE(review): if `y` / `p` come from the CLASSES_CROP reshape cell, axis 1 was
# iterated there with CLASSES_CROP names — confirm LEVELS_ labels the same axis.
for i, c in enumerate(LEVELS_):
    print(c, disk_auc(y[:, i], p[:, i]))

In [None]:
preds[:10]

In [None]:
df_val.head(20)

- scs_loss	: 0.299
- nfn_loss	: 0.500
- ss_loss	: 0.593
- any_loss	: 0.291

 -> CV Score : 0.4206

In [None]:
# For each class i, score the predictions of class i + 5 against the labels of
# class i and compare with the regular AUC, label agreement and rank correlation.
y_true_all = df[Config.targets].values

for i in [5, 6, 7, 8, 9, 15, 16, 17, 18, 19]:
    j = i + 5
    c, c2 = CLASSES[i], CLASSES[j]
    y1, y2 = y_true_all[:, i], y_true_all[:, j]

    ref_auc = disk_auc(y1, preds[:, i])
    auc = disk_auc(y1, preds[:, j])

    # Mean Spearman correlation over prediction columns 1 and 2.
    r_col1 = spearmanr(preds[:, i][:, 1], preds[:, j][:, 1]).statistic
    r_col2 = spearmanr(preds[:, i][:, 2], preds[:, j][:, 2]).statistic
    r = (r_col1 + r_col2) / 2
    print(f'\npred: {c2} \t truth: {c}')

    # Share of identical labels among rows where both labels are > 0.
    both_positive = (y1 > 0) & (y2 > 0)
    eq = (y1[both_positive] == y2[both_positive]).mean()

    print(f'Ref AUC          : {ref_auc :.4f}')
    print(f'Swap AUC         : {auc :.4f}')
    print(f'Equal proportion : {eq:.3f}')
    print(f'Preds correlation: {r:.3f}')


# for j in [5, 6, 7, 8, 9, 15, 16, 17, 18, 19]:
#     i = j + 5
#     c = CLASSES[i]
#     c2 = CLASSES[j]
    
#     ref_auc = disk_auc(df[Config.targets].values[:, i], preds[:, i])
#     auc = disk_auc(df[Config.targets].values[:, i], preds[:, j])
#     r = (
#         spearmanr(preds[:, i][:, 1], preds[:, j][:, 1]).statistic + 
#         spearmanr(preds[:, i][:, 2], preds[:, j][:, 2]).statistic
#     ) / 2
#     print(f'\npred: {c2} \t truth: {c}')
#     # print(r)
#     y1 = df[Config.targets].values[:, i]
#     y2 = df[Config.targets].values[:, j]
    
#     eq = (y1[(y1 > 0) & (y2 > 0)] == y2[(y1 > 0) & (y2 > 0)]).mean()
#     print(f'Ref AUC          : {ref_auc :.4f}')
#     print(f'Swap AUC         : {auc :.4f}')
#     print(f'Equal proportion : {eq:.3f}')
#     print(f'Preds correlation: {r:.3f}')

In [None]:
# Evaluate rsna_loss on one row at a time and collect the per-row loss
# dictionaries, tagged with the study id, into a DataFrame.
y = df[Config.targets].values
study_ids = df["study_id"].values

loss_records = []
for row in tqdm(range(len(df))):
    row_losses = rsna_loss(y[row:row + 1], preds[row:row + 1])[1]
    row_losses.update({"study": study_ids[row]})
    loss_records.append(row_losses)
losses = pd.DataFrame(loss_records)

In [None]:
# One histogram per loss column (every column except the last, "study").
fig = plt.figure(figsize=(20, 5))
loss_columns = losses.columns[:-1]
for panel, column in enumerate(loss_columns, start=1):
    ax = fig.add_subplot(1, 4, panel)
    sns.histplot(losses[column].values, ax=ax)
    ax.set_title(column)
plt.show()

In [None]:
losses[losses["scs"] > 2]

In [None]:
df[df["study_id"] == 1972129014]

In [None]:
losses[losses["any"] > 2]

- scs_loss	: 0.325
- nfn_loss	: 0.517
- ss_loss	: 0.634
- any_loss	: 0.297

 -> CV Score : 0.443

Done ! 