**About** : This notebook is used to train RNN models.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
cd ../src/

/workspace/kaggle_rsna_lumbar_spine/src


### Imports

In [3]:
import os
import sys
import glob
import json
import torch
import operator
import warnings
import numpy as np
import pandas as pd
# import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from sklearn.metrics import *
from collections import Counter
from scipy.stats import spearmanr

warnings.simplefilter(action="ignore", category=UserWarning)
warnings.simplefilter("ignore", FutureWarning)

In [4]:
from util.logger import (
    prepare_log_folder,
    save_config,
    create_logger,
)

from data.dataset import FeatureDataset
from params import *
from data.preparation import *
from util.logger import Config as ConfigInf
from training.main_lvl2 import k_fold
from util.metrics import *

In [5]:
from model_zoo.models_lvl2 import define_model
from training.losses import StudyLoss
from util.metrics import rsna_loss

### Data

In [6]:
# Load the dataframe produced by `prepare_data_lvl2` (one row per study in the
# preview shown further down).
df = prepare_data_lvl2()

# Attach the cross-validation fold assignment when the prepared frame does not
# already carry a "fold" column.
# NOTE(review): the merge has no explicit `on=`, so pandas joins on every column
# the two frames share — presumably only `study_id`; confirm against the CSV.
if "fold" not in df.columns:
    folds = pd.read_csv("../input/train_folded_v1.csv")
    df = df.merge(folds, how="left")

In [7]:
# df = df[df['fold'] == 0].reset_index(drop=True)

In [8]:
# Maps a feature-group name to the experiment output location it comes from.
# The same names are the keys of the feature dict returned by FeatureDataset.
# Commented-out entries are alternatives kept for reference.
EXP_FOLDERS = {
    # "scs_crop": "../logs/2024-08-29/15/",  # 15
    # "nfn_crop": "../logs/2024-08-29/16/",  # 30
    # "scs_crop_coords":  "../logs/2024-08-29/17/",
    
    # "nfn_crop_coords":  "../logs/2024-09-15/2/",

    "scs_crop_coords": "../logs/2024-09-12/1/",  # 5f -0.005 scs
    "scs_crop_coords_2": "../logs/2024-10-02/2/",  # 3f -0.005 scs

    # "dh": '../output/oof____cfg_dh_12y8.pth',  # Darragh preds
    # "dh_2": "../output/oof____cfg_dh_29a2.pth",  # Darragh preds
    # "ch": '../output/oof_cfg_ch_35.pth',  # Dieter preds

    "crop_2": "../logs/2024-09-13/7/",  # coatnet side
    "crop": "../logs/2024-09-14/5/",   # coatnet side  <---- best
    # "crop": "../logs/2024-09-30/7/",   # coatnet side more eps
    
    # "crop_ax": "../logs/2024-09-30/6/", 
    # "crop_2": "../logs/2024-09-30/1/", 
}

In [9]:
# Load the configuration saved alongside the "crop" experiment.
# The context manager closes the file handle as soon as the JSON is parsed
# instead of leaving an open handle around for the garbage collector.
with open(EXP_FOLDERS["crop"] + "config.json", "r") as config_file:
    config = ConfigInf(json.load(config_file))

In [10]:
df.head(1)

Unnamed: 0,study_id,series_id,series_description,spinal_canal_stenosis_l1_l2,spinal_canal_stenosis_l2_l3,spinal_canal_stenosis_l3_l4,spinal_canal_stenosis_l4_l5,spinal_canal_stenosis_l5_s1,left_neural_foraminal_narrowing_l1_l2,left_neural_foraminal_narrowing_l2_l3,...,left_subarticular_stenosis_l2_l3,left_subarticular_stenosis_l3_l4,left_subarticular_stenosis_l4_l5,left_subarticular_stenosis_l5_s1,right_subarticular_stenosis_l1_l2,right_subarticular_stenosis_l2_l3,right_subarticular_stenosis_l3_l4,right_subarticular_stenosis_l4_l5,right_subarticular_stenosis_l5_s1,fold
0,4003253,"[702807833, 1054713880, 2448190387]","[Sagittal T2/STIR, Sagittal T1, Axial T2]",0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [11]:
# Dataset over `df` that reads its features from the EXP_FOLDERS entries and
# uses CLASSES as target columns.
dataset = FeatureDataset(df, EXP_FOLDERS, targets=CLASSES)

In [12]:
dataset[0]

({'scs_crop_coords': tensor([[ 5.0742, -2.6191, -3.7480],
          [ 5.1602, -2.6445, -3.7617],
          [ 5.0586, -2.5293, -3.7500],
          [ 3.9863, -2.0371, -3.1738],
          [ 5.2383, -2.7285, -3.8594]]),
  'scs_crop_coords_2': tensor([[ 5.3789, -1.8828, -2.9570],
          [ 5.4141, -1.8945, -2.9648],
          [ 5.4258, -1.9072, -2.9941],
          [ 5.4180, -1.8135, -3.0840],
          [ 5.4180, -1.8496, -3.0312]]),
  'crop_2': tensor([[ 4.9492, -2.1816, -3.0645,  4.7852, -2.1250, -2.9570],
          [ 4.9961, -2.2402, -3.1016,  4.9922, -2.2617, -3.1035],
          [ 4.9531, -2.1406, -3.0293,  4.8789, -2.1680, -2.9082],
          [ 4.0078, -1.1123, -3.3867,  4.0391, -1.2676, -3.0117],
          [ 4.5430, -2.0527, -2.6582,  4.4297, -2.0195, -2.5059],
          [ 3.7539, -1.6240, -3.9922,  3.7344, -1.6533, -3.9707],
          [ 3.8828, -1.6846, -4.1289,  3.8828, -1.6855, -4.1875],
          [ 3.7207, -1.5234, -4.1562,  3.5000, -1.2744, -4.0938],
          [ 1.0986,  1.0254,

In [13]:
# p = []
# for i in tqdm(range(len(dataset))):
#     p_ = dataset[i][0]["nfn_crop_coords"].view(-1, 3).numpy()
#     # p_ = p_.reshape(5, 2, 3).transpose(1, 0, 2).reshape(10, 3)
#     p.append(p_)
# p = np.array(p)

# y = df[df.columns[8:18]].values

# aucs = []
# for i, c in enumerate(CLASSES[5:15]):
#     auc = disk_auc(y[:, i], p[:, i])
#     print(f'{c} AUC: \t {auc :.4f}')
#     aucs.append(auc)
# print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

In [14]:
# Smoke test: fetch one sample and unpack it (features, targets, unused third item).
# The previous version wrapped this in a tqdm loop that unconditionally broke after
# the first iteration, so indexing directly is equivalent and avoids leaving a
# progress bar stuck at 0%.
fts, y, _ = dataset[0]
# for k in fts:
#     print(k, fts[k].size())

  0%|          | 0/1975 [00:00<?, ?it/s]

In [15]:
# Pick one sample to inspect; the third element of the returned tuple is not used here.
idx = 2
fts, y, _ = dataset[idx]

In [16]:
# Show the tensor size of every feature group of the selected sample.
for group_name, group_tensor in fts.items():
    print(group_name, group_tensor.size())

scs_crop_coords torch.Size([5, 3])
scs_crop_coords_2 torch.Size([5, 3])
crop_2 torch.Size([25, 6])
crop torch.Size([25, 6])


In [17]:
# plt.figure(figsize=(20, 5))
# plt.subplot(1, 3, 1)
# for i in range(5):
#     plt.plot(fts['ss_aux'].softmax(1)[:, i], label=LEVELS[i])
# plt.legend()

# plt.subplot(1, 3, 2)
# for i in [1, 2]:
#     plt.plot(fts['ss'][:, i], label=f'left_{i}')
# for i in [4, 5]:
#     plt.plot(fts['ss'][:, i], label=f'right_{i - 3}')
# plt.legend()

# # plt.subplot(1, 3, 3)
# # for i in range(5):
# #     plt.plot(fts['ss'][:, 1, i], label=LEVELS[i])
# # plt.legend()

# plt.show()

In [18]:
# for k in fts:
#     print(k, fts[k].shape)

### Model

In [19]:
# Instantiate a model outside of the training loop, for manual experiments.
# The output width is three values per target (`len(CLASSES) * 3`).
# NOTE(review): `ft_dim=[18, 18, 18]` is hard-coded and not derived from
# EXP_FOLDERS — confirm it matches the feature groups actually loaded above
# before re-enabling the commented-out forward pass.
model = define_model(
    name="simple",
    num_classes=len(CLASSES) * 3,
    ft_dim=[18, 18, 18],
)

In [20]:
# Add a leading dimension of size 1 to every feature tensor (single-sample batch).
x = {k: fts[k].unsqueeze(0) for k in fts}

In [21]:
# pred, _ = model(x)
# pred.size()

In [22]:
# l = StudyLoss()
# l(pred, y.unsqueeze(0))

In [23]:
# rsna_loss(y.unsqueeze(0).numpy(), pred.softmax(2).detach().numpy())

### Training


In [24]:
class Config:
    """
    Parameters used for training.

    In this notebook the class itself (not an instance) is handed to
    `save_config` and `k_fold`, i.e. it is used as a plain namespace of
    class attributes.
    """
    # General
    seed = 42
    verbose = 1
    device = "cuda"
    save_weights = True
    targets = CLASSES

    # Data
    # Feature-group name -> experiment output location.
    # Commented-out entries are alternatives kept for reference.
    exp_folders = {
        # "scs_crop": "../logs/2024-08-29/15/",  # 15
        # "nfn_crop": "../logs/2024-08-29/16/",  # 30
        # "scs_crop_coords":  "../logs/2024-08-29/17/",

        # "nfn_crop_coords":  "../logs/2024-09-15/2/",

        "scs_crop_coords": "../logs/2024-09-12/1/",  # 5f -0.005 scs
        "scs_crop_coords_2": "../logs/2024-10-02/2/",  # 3f -0.005 scs

        # "dh": '../output/oof____cfg_dh_12y8.pth',  # Darragh preds
        # "dh_2": "../output/oof____cfg_dh_29a2.pth",  # Darragh preds
        # "ch": '../output/oof_cfg_ch_35.pth',  # Dieter preds

        "crop_2": "../logs/2024-09-13/7/",  # coatnet side
        "crop": "../logs/2024-09-14/5/",   # coatnet side  <---- best
        # "crop": "../logs/2024-09-30/7/",   # coatnet side more eps

        # "crop_ax": "../logs/2024-09-30/6/",
        # "crop_2": "../logs/2024-09-30/1/",
    }
    n_fts = 0
    resize = 0
    remove_noisy = False

    # k-fold
    k = 4
    folds_file = "../input/train_folded_v1.csv"  # f"../input/folds_{k}.csv"
    selected_folds = [0, 1, 2, 3]

    # Model
    name = "simple"
    dense_dim = 4096
    layer_dim = 0
    # Width contributed by the feature groups counted for all three heads:
    # 6 when a "crop" entry is present, plus 3 each for "dh" and "ch".
    # NOTE(review): only the exact key "crop" is tested, so "crop_2" does not
    # add to `ft` — confirm this is intended.
    ft = 6 * ("crop" in exp_folders) + 3 * ("dh" in exp_folders) + 3 * ("ch" in exp_folders)
    # Each head adds 3 per folder whose name contains its tag ("scs", "nfn", "ss").
    # With the folders enabled above this evaluates to ft = 6, ft_dim = [12, 6, 6].
    ft_dim = [
        ft + 3 * len([k for k in exp_folders if "scs" in k]),
        ft + 3 * len([k for k in exp_folders if "nfn" in k]),
        ft + 3 * len([k for k in exp_folders if "ss" in k]),
    ]  # scs, nfn, ss

    p = 0.  # NOTE(review): presumably a dropout probability — confirm in define_model
    num_classes = len(CLASSES) * 3  # three outputs per target
    num_classes_aux = 0

    # Training
    loss_config = {
        "name": "study",
        "weighted": True,
        "use_any": True,
        "smoothing": 0,
        "activation": "study",
        "aux_loss_weight": 0,
        "name_aux": "",
        "smoothing_aux": 0,
        "activation_aux": "",
    }

    data_config = {
        "batch_size": 128,
        "val_bs": 512,
        "mix": "mixup",
        "mix_proba": 0.,
        "sched": False,
        "mix_alpha": 4.,
        "additive_mix": False,
        "num_classes": num_classes,
        "num_classes_aux": num_classes_aux,
        "num_workers": 8,
    }

    optimizer_config = {
        "name": "AdamW",
        "lr": 7e-5,  # 5e-5
        "warmup_prop": 0.,
        "betas": (0.9, 0.999),
        "max_grad_norm": 1.,
        "weight_decay": 1,
    }

    epochs = 15

    use_fp16 = True
    # `verbose` is set once in the "General" section; the second, identical
    # assignment that used to live here was redundant and has been removed.
    verbose_eval = 20

    fullfit = True
    n_fullfit = 1

    local_rank = 0
    distributed = False
    world_size = 1

In [25]:
# When True, the training cell skips creating a log folder, saving the config
# and starting the file logger.
DEBUG = False
# Stays None unless the training cell creates a log folder.
log_folder = None

In [26]:
# Rebuild `df` from scratch for training; this overwrites the frame loaded in
# the Data section.
df = prepare_data_lvl2()

# Attach the fold assignment from Config.folds_file if it is missing.
# NOTE(review): no explicit `on=` is given, so the merge uses every column the
# two frames share — presumably only `study_id`; confirm.
if "fold" not in df.columns:
    folds = pd.read_csv(Config.folds_file)
    df = df.merge(folds, how="left")

# Optional exclusion of a fixed list of study ids (disabled).
# df = df[~df['study_id'].isin([1215498865, 1647904243, 2570933394, 2761048584, 3284652867, 3941522676])].reset_index(drop=True)

In [27]:
# Outside DEBUG mode: create a new log folder under LOG_PATH, save the Config
# there as config.json and start a logger writing to logs.txt.
# In DEBUG mode none of this happens and `log_folder` keeps its previous value.
if not DEBUG:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f"Logging results to {log_folder}")
    config_df = save_config(Config, log_folder + "config.json")
    create_logger(directory=log_folder, name="logs.txt")

# Run the k-fold training; `preds` is what the Eval cells score against `df`.
preds = k_fold(Config, df, log_folder=log_folder, run=None)

Logging results to ../logs/2024-10-03/2/

-------------   Fold 1 / 4  -------------

    -> 1481 training studies
    -> 494 validation studies
    -> 147465 trainable parameters

Epoch 02/15 (step 0020) 	lr=6.2e-05 	 t=1s 	 loss=0.592    scs_loss=0.282    nfn_loss=0.496    ss_loss=0.595    any_loss=0.238	 val_loss=0.403
Epoch 04/15 (step 0040) 	lr=5.3e-05 	 t=1s 	 loss=0.408    scs_loss=0.246    nfn_loss=0.474    ss_loss=0.551    any_loss=0.236	 val_loss=0.377
Epoch 06/15 (step 0060) 	lr=4.5e-05 	 t=1s 	 loss=0.397    scs_loss=0.241    nfn_loss=0.473    ss_loss=0.551    any_loss=0.235	 val_loss=0.375
Epoch 08/15 (step 0080) 	lr=3.6e-05 	 t=1s 	 loss=0.396    scs_loss=0.239    nfn_loss=0.473    ss_loss=0.549    any_loss=0.235	 val_loss=0.374
Epoch 09/15 (step 0100) 	lr=2.8e-05 	 t=0s 	 loss=0.395    scs_loss=0.238    nfn_loss=0.472    ss_loss=0.549    any_loss=0.234	 val_loss=0.373
Epoch 11/15 (step 0120) 	lr=2.0e-05 	 t=1s 	 loss=0.391    scs_loss=0.238    nfn_loss=0.472    ss_loss=0.

### Eval

In [33]:
# y = df[Config.targets].values * 0
# y[:600] = 2
# y[-550:] = 1

# avg_loss, losses = rsna_loss(y, preds * 0 + 0.3333333)

# for k, v in losses.items():
#     print(f"- {k}_loss\t: {v:.3f}")

# print(f'\n -> CV Score : {avg_loss :.4f}')

In [34]:
# Overall score of `preds` against the target columns of `df`.
# rsna_loss returns the averaged loss and a dict of named sub-losses.
avg_loss, losses = rsna_loss(df[Config.targets].values, preds)

for loss_name, loss_value in losses.items():
    print(f"- {loss_name}_loss\t: {loss_value:.3f}")

print(f'\n -> CV Score : {avg_loss :.4f}')

- scs_loss	: 0.262
- nfn_loss	: 0.476
- ss_loss	: 0.538
- any_loss	: 0.261

 -> CV Score : 0.3843


In [28]:
# Overall score of `preds` against the target columns of `df`.
# rsna_loss returns the averaged loss and a dict of named sub-losses.
avg_loss, losses = rsna_loss(df[Config.targets].values, preds)

for loss_name, loss_value in losses.items():
    print(f"- {loss_name}_loss\t: {loss_value:.3f}")

print(f'\n -> CV Score : {avg_loss :.4f}')

- scs_loss	: 0.264
- nfn_loss	: 0.477
- ss_loss	: 0.545
- any_loss	: 0.262

 -> CV Score : 0.3871


In [36]:
# Per-fold breakdown of the score.
# - Folds come from Config.selected_folds instead of a hard-coded range(4).
# - Rows are selected by position (np.flatnonzero) rather than by index label, so the
#   lookup into `preds` stays correct even if `df` was filtered without reset_index.
# - Each fold is labelled in the output and empty folds are skipped.
fold_targets = df[Config.targets].values
fold_ids = df['fold'].values

for fold in Config.selected_folds:
    ids = np.flatnonzero(fold_ids == fold)
    if len(ids) == 0:
        continue

    p = preds[ids]
    avg_loss, losses = rsna_loss(fold_targets[ids], p)

    print(f"\nFold {fold}")
    for k, v in losses.items():
        print(f"- {k}_loss\t: {v:.3f}")

    print(f'\n -> CV Score : {avg_loss :.4f}')

TypeError: rsna_loss() got an unexpected keyword argument 'verbose'

In [195]:
p.shape

(493, 25, 3)

In [199]:
# Per-fold score after shifting probability mass towards class index 2 on the
# first five targets, then renormalising those rows so they sum to 1 again.
# NOTE(review): the first five targets are presumably the spinal canal stenosis
# levels (that is the column order shown by df.head) — confirm against CLASSES.
# - Folds come from Config.selected_folds instead of a hard-coded range(4).
# - Rows are selected by position rather than by index label, so the lookup into
#   `preds` stays correct even if `df` was filtered without reset_index.
CLASS_2_BOOST = 0.1  # additive bump applied before renormalisation

fold_targets = df[Config.targets].values
fold_ids = df['fold'].values

for fold in Config.selected_folds:
    ids = np.flatnonzero(fold_ids == fold)
    if len(ids) == 0:
        continue

    # Work on a copy so the experiment never alters `preds` itself.
    p = preds[ids].copy()

    p[:, :5, 2] += CLASS_2_BOOST
    # p[:, :5, 1] -= 0.1
    # p[:, :5, 0] -= 0.2
    p[:, :5] /= p[:, :5].sum(-1, keepdims=True)

    avg_loss, losses = rsna_loss(fold_targets[ids], p)
    print(f"\nFold {fold}")
    for k, v in losses.items():
        print(f"- {k}_loss\t: {v:.3f}")

    print(f'\n -> CV Score : {avg_loss :.4f}\n')

tensor(0.1741) tensor(0.2965)

- scs_loss	: 0.322
- nfn_loss	: 0.475
- ss_loss	: 0.549
- any_loss	: 0.288

 -> CV Score : 0.4081

tensor(0.2227) tensor(0.2889)

- scs_loss	: 0.369
- nfn_loss	: 0.484
- ss_loss	: 0.558
- any_loss	: 0.357

 -> CV Score : 0.4421

tensor(0.1579) tensor(0.2929)

- scs_loss	: 0.316
- nfn_loss	: 0.502
- ss_loss	: 0.578
- any_loss	: 0.272

 -> CV Score : 0.4172

tensor(0.1846) tensor(0.3124)

- scs_loss	: 0.366
- nfn_loss	: 0.500
- ss_loss	: 0.563
- any_loss	: 0.293

 -> CV Score : 0.4305



In [51]:
# Overall score of `preds` against the target columns of `df`.
# rsna_loss returns the averaged loss and a dict of named sub-losses.
avg_loss, losses = rsna_loss(df[Config.targets].values, preds)

for loss_name, loss_value in losses.items():
    print(f"- {loss_name}_loss\t: {loss_value:.3f}")

print(f'\n -> CV Score : {avg_loss :.4f}')

- scs_loss	: 0.262
- nfn_loss	: 0.477
- ss_loss	: 0.539
- any_loss	: 0.259

 -> CV Score : 0.3841


In [29]:
# yy = pkl["target"].contiguous()[order].cpu().numpy().clip(-1, 2)
# for i in range(len(df)):
#     if not (df[Config.targets].values[i] == yy[i]).all():
#         print(i, yy[i].tolist(), df[Config.targets].values[i].tolist())
#         display(df.iloc[[i]])
#         # break

In [75]:
# pkl = torch.load('../output/oof_cfg_ch_35.pth')
# pkl = torch.load('../output/oof____cfg_dh_12s1.pth')

# df = df[~df['study_id'].isin([1215498865, 1647904243, 2570933394, 2761048584, 3284652867, 3941522676])].reset_index(drop=True)

# order = [pkl['study_id'].tolist().index(s) for s in df['study_id'].values]

# avg_loss, losses = rsna_loss(df[Config.targets].values, pkl["logits"].cpu().float().softmax(-1).numpy()[order])

# for k, v in losses.items():
#     print(f"- {k}_loss\t: {v:.3f}")

# print(f'\n -> CV Score : {avg_loss :.4f}')

In [31]:
# aucs = []
# for i, c in enumerate(CLASSES):
#     auc = disk_auc(df[Config.targets].values[:, i], preds[:, i])
#     print(f'{c} AUC: \t {auc :.4f}')
#     aucs.append(auc)
# print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

In [32]:
# aucs = []
# for i, c in enumerate(CLASSES):
#     auc = disk_auc(df[Config.targets].values[:, i], preds_dd[:, i])
#     print(f'{c} AUC: \t {auc :.4f}')
#     aucs.append(auc)
# print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

In [33]:
# preds = preds[df['fold'].values == 0]
# df = df[df['fold'].values == 0]

In [34]:
# AUC check on the saved fold-0 predictions of one experiment folder.
# Requires pred_inf_0.npy and df_val_0.csv to exist inside EXP_FOLDER.
EXP_FOLDER = "../logs/2024-09-02/27/"

# Predictions are reshaped to (n, 10, 3). Note: this overwrites the global `p`.
p = np.load(EXP_FOLDER + "pred_inf_0.npy").reshape(-1, 10, 3)
df_val = pd.read_csv(EXP_FOLDER + "df_val_0.csv")

# Targets are reshaped to (n, 10) to line up with `p`. Note: this overwrites the global `y`.
y = df_val['target'].values.reshape(-1, 10)

# One AUC per column, labelled with CLASSES[5:15].
# NOTE(review): presumably the ten left/right neural foraminal narrowing targets — confirm.
aucs = []
for i, c in enumerate(CLASSES[5:15]):
    auc = disk_auc(y[:, i], p[:, i])
    print(f'{c} AUC: \t {auc :.4f}')
    aucs.append(auc)
print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

FileNotFoundError: [Errno 2] No such file or directory: '../logs/2024-09-02/27/pred_inf_0.npy'

In [None]:
# Regroup the per-study predictions: 75 values per study are viewed as (5, 5, 3),
# the two 5-sized axes are swapped, and the first two axes are flattened so that
# the result has shape (n * 5, 5, 3).
n_studies = preds.shape[0]
p = preds.reshape(n_studies, 5, 5, 3).transpose(0, 2, 1, 3).reshape(-1, 5, 3)

# Same regrouping for the labels: (n, 25) -> (n * 5, 5).
y = df[Config.targets].values.reshape(n_studies, 5, 5).transpose(0, 2, 1).reshape(-1, 5)

# One AUC per column of the regrouped arrays, labelled with CLASSES_CROP.
aucs = []
for col, col_name in enumerate(CLASSES_CROP):
    col_auc = disk_auc(y[:, col], p[:, col])
    print(f'{col_name} AUC: \t {col_auc :.4f}')
    aucs.append(col_auc)
print(f'\n-> Avg AUC: \t {np.mean(aucs) :.4f}')

In [None]:
# Print one AUC per entry of LEVELS_, using column i of the current `y` / `p`.
# NOTE(review): if `y` / `p` come from the regrouping cell right above, their
# columns are indexed like CLASSES_CROP, not like LEVELS_ — confirm the labels
# printed here match the axis being scored.
for i, c in enumerate(LEVELS_):
    print(c, disk_auc(y[:, i], p[:, i]))

In [None]:
preds[:10]

In [None]:
df_val.head(20)

- scs_loss	: 0.299
- nfn_loss	: 0.500
- ss_loss	: 0.593
- any_loss	: 0.291

 -> CV Score : 0.4206

In [None]:
# Swap diagnostics: for each target i, compare it with target j = i + 5.
#   - Ref AUC: predictions for i scored against labels of i.
#   - Swap AUC: predictions for j scored against labels of i.
#   - Equal proportion: share of studies with identical labels among those where
#     both labels are > 0.
#   - Preds correlation: mean Spearman correlation of class-1 and class-2 scores.
# NOTE(review): i / j are presumably the left / right versions of the same
# condition and level (that is the column order shown by df.head) — confirm
# against CLASSES.
swap_targets = df[Config.targets].values  # loop-invariant, computed once

for i in [5, 6, 7, 8, 9, 15, 16, 17, 18, 19]:
    j = i + 5
    c = CLASSES[i]
    c2 = CLASSES[j]

    y1 = swap_targets[:, i]
    y2 = swap_targets[:, j]

    ref_auc = disk_auc(y1, preds[:, i])
    auc = disk_auc(y1, preds[:, j])
    r = (
        spearmanr(preds[:, i, 1], preds[:, j, 1]).statistic +
        spearmanr(preds[:, i, 2], preds[:, j, 2]).statistic
    ) / 2
    print(f'\npred: {c2} \t truth: {c}')

    # Build the mask once; report NaN explicitly instead of taking the mean of
    # an empty selection (which would emit a RuntimeWarning).
    both_positive = (y1 > 0) & (y2 > 0)
    if both_positive.any():
        eq = (y1[both_positive] == y2[both_positive]).mean()
    else:
        eq = float("nan")

    print(f'Ref AUC          : {ref_auc :.4f}')
    print(f'Swap AUC         : {auc :.4f}')
    print(f'Equal proportion : {eq:.3f}')
    print(f'Preds correlation: {r:.3f}')


# for j in [5, 6, 7, 8, 9, 15, 16, 17, 18, 19]:
#     i = j + 5
#     c = CLASSES[i]
#     c2 = CLASSES[j]
    
#     ref_auc = disk_auc(df[Config.targets].values[:, i], preds[:, i])
#     auc = disk_auc(df[Config.targets].values[:, i], preds[:, j])
#     r = (
#         spearmanr(preds[:, i][:, 1], preds[:, j][:, 1]).statistic + 
#         spearmanr(preds[:, i][:, 2], preds[:, j][:, 2]).statistic
#     ) / 2
#     print(f'\npred: {c2} \t truth: {c}')
#     # print(r)
#     y1 = df[Config.targets].values[:, i]
#     y2 = df[Config.targets].values[:, j]
    
#     eq = (y1[(y1 > 0) & (y2 > 0)] == y2[(y1 > 0) & (y2 > 0)]).mean()
#     print(f'Ref AUC          : {ref_auc :.4f}')
#     print(f'Swap AUC         : {auc :.4f}')
#     print(f'Equal proportion : {eq:.3f}')
#     print(f'Preds correlation: {r:.3f}')

In [None]:
# Score every study on its own: one rsna_loss call per single-row slice, keeping
# the dict of sub-losses (second return value) and tagging it with the study id.
# The records are collected into a DataFrame with "study" as the last column.
y = df[Config.targets].values
study_ids = df["study_id"].values
losses = pd.DataFrame(
    [
        {**rsna_loss(y[row:row + 1], preds[row:row + 1])[1], "study": study_ids[row]}
        for row in tqdm(range(len(df)))
    ]
)

In [None]:
# Histogram of the per-study value of every loss column (all columns except the
# trailing "study" id).
# - Uses matplotlib directly: `sns` is not imported in this notebook (the seaborn
#   import is commented out), so the previous sns.histplot call raised NameError.
# - The number of panels follows the number of loss columns instead of assuming 4.
# - Non-finite values are dropped, since automatic binning cannot handle NaN / inf.
loss_columns = list(losses.columns[:-1])

plt.figure(figsize=(20, 5))
for i, c in enumerate(loss_columns):
    plt.subplot(1, len(loss_columns), i + 1)
    values = losses[c].to_numpy(dtype=float)
    plt.hist(values[np.isfinite(values)], bins="auto")
    plt.title(c)
    plt.ylabel("Count")
plt.show()

In [None]:
losses[losses["scs"] > 2]

In [None]:
df[df["study_id"] == 1972129014]

In [None]:
losses[losses["any"] > 2]

- scs_loss	: 0.325
- nfn_loss	: 0.517
- ss_loss	: 0.634
- any_loss	: 0.297

 -> CV Score : 0.443

Done ! 