**About**: This notebook validates trained models: it runs inference, then scores single models, blends of models, and multi-seed ensembles.

In [None]:
# %load_ext nb_black
# Reload imported modules automatically before executing each cell, so that
# edits to the project's source files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src/

## Initialization

### Imports

In [None]:
import os

# Restrict CUDA to physical GPU 1. This is set before torch is imported so the
# mask is guaranteed to be in place before any CUDA initialisation happens.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

import torch

print(torch.__version__)

# With the mask above, index 0 designates the only visible GPU.
# NOTE: `device` holds the device *name* (a string), not a torch.device.
device = torch.cuda.get_device_name(0)
print(device)

In [None]:
import os
import sys
import glob
import json
import timm
import torch
import operator
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.metrics import *
from collections import Counter
from numerize.numerize import numerize

# Wider console output and longer cell contents when displaying DataFrames.
for option_name, option_value in (('display.width', 500), ('max_colwidth', 100)):
    pd.set_option(option_name, option_value)

In [None]:
from params import *
from util.plots import *
from util.metrics import *

from data.dataset import ContrailDataset
from data.transforms import get_transfos
from data.preparation import prepare_data
from util.plots import plot_sample
from util.logger import upload_to_kaggle

from inference_main import kfold_inference, Config

### Data

In [None]:
# Build the dataframe describing the dataset.
# NOTE(review): the exact effect of `use_raw=True` is defined in
# data.preparation.prepare_data - confirm there.
df = prepare_data(
    DATA_PATH,
    processed_folder="false_color/",
    use_raw=True,
)

In [None]:
# Attach the fold assignment only when the dataframe does not already carry it.
has_fold_column = "fold" in df.columns
if not has_fold_column:
    folds = pd.read_csv(DATA_PATH + "folds_4.csv")
    # Default merge: inner join on every column name shared by both frames.
    df = pd.merge(df, folds)

In [None]:
# Validation subset: rows assigned to fold 0, re-indexed from 0.
is_val_fold = df['fold'] == 0
df_val = df.loc[is_val_fold].reset_index(drop=True)

In [None]:
# Persist the record ids of the validation subset (single-column csv, no index).
val_record_ids = df_val[['record_id']]
val_record_ids.to_csv('../output/df_val.csv', index=False)

In [None]:
# Debug switch: when enabled, validation is restricted to the few records below.
DEBUG = False

# Record ids kept in debug mode.
folders = [
    3687499407028137410, 6558861185867890815,
    7355354609194882312, 7547747455642200110,
    5456834089979970017, 1501528175573804219,
    5728069425727341010, 8604370548989406919,
    4746167155668084215, 6094972442472366517,
]

if DEBUG:
    keep_mask = df_val["record_id"].isin(folders)
    df_val = df_val[keep_mask].reset_index(drop=True)

In [None]:
dataset = ContrailDataset(df_val, get_transfos())

# Each sample is an (image, mask, extra) triple; only the leading slice of the
# mask (`mask[:1]`) is kept, converted to numpy and stacked into one array.
# (For a visual sanity check, plot_sample can be called on a few samples.)
gts = np.array(
    [dataset[i][1][:1].numpy() for i in tqdm(range(len(dataset)))]
)

### Inference

In [None]:
USE_FP16 = True  # forwarded to kfold_inference as `use_fp16`
SAVE = not DEBUG  # forwarded to kfold_inference as `save`; disabled in debug mode

In [None]:
# config = Config(json.load(open(EXP_FOLDER + "config.json", "r")))

In [None]:
# Single-model experiments tried so far, annotated "description - score".
# Disabled entries, kept for reference:
#   "../logs/2023-07-03/35/"  # v2-s stride 2          - 0.6845
#   "../logs/2023-07-05/35/"  # convnext-nano cont     - 0.6844
#   "../logs/2023-07-07/8/"   # v2-s stride 1 pl_mask  - 0.6806
#   "../logs/2023-07-07/10/"  # convnext-nano pl_mask  - 0.6853
#   "../logs/2023-07-07/14/"  # v2-s stride 2 pl_mask  - 0.6827
#   "../logs/2023-07-09/2/"   # v2-s stride 2 repro     - 0.6811
EXP_FOLDER_CANDIDATES = [
    "../logs/2023-07-11/13/",  # v2-s lstm frames 01234 stride 1 - 0.6822
    "../logs/2023-07-11/20/",  # v2-s lstm frames 01234 stride 1 - 0.6817
    "../logs/2023-07-11/24/",  # v2-s lstm frames 234   stride 1 - 0.6794
    "../logs/2023-07-11/26/",  # v2-s lstm frames 234   stride 2 - 0.6846
    "../logs/2023-07-11/28/",  # v2-s lstm frames 1234  stride 2 - 0.6833
    "../logs/2023-07-11/37/",  # cvnxt-nano lstm frames 1234 stride 2 - 0.6853
    "../logs/2023-07-11/39/",  # cvnxt-nano lstm frames 1234 stride 2 - 0.6849
    "../logs/2023-07-12/10/",  # v2-s cnn frames 23456 stride 1 - 0.6812
    "../logs/2023-07-12/11/",  # v2-s cnn frames 345 stride 2 - 0.6843
    "../logs/2023-07-12/13/",  # v2-s lstm frames 2345 stride 2 - 0.6866      <------- ?
    "../logs/2023-07-12/14/",  # v2-s lstm cnn frames 2345 stride 2 - 0.6827
    "../logs/2023-07-12/15/",  # v2-s cnn frames 1234 stride 2 - 0.6830
    "../logs/2023-07-12/16/",  # cvnxt-nano cnn frames 1234 stride 2 - 0.6863       <-------
    "../logs/2023-07-12/17/",  # cvnxt-nano cnn frames 345 stride 2 - 0.6843
    "../logs/2023-07-12/21/",  # cvnxt-nano lstm cnn frames 1234 stride 2 - 0.6860
    "../logs/2023-07-12/22/",  # cvnxt-nano lstm frames 2345 stride 2 - 0.6865
    "../logs/2023-07-13/2/",  # cvnxt-nano transfo frames 2345 stride 2 - 0.6837
    "../logs/2023-07-13/3/",  # cvnxt-nano transfo frames 1234 stride 2 - 0.6842
    "../logs/2023-07-13/4/",  # v2-s transfo frames 1234 stride 2 - 0.6829
    "../logs/2023-07-13/5/",  # v2-s transfo frames 2345 stride 2 - 0.6825
]

# Same outcome as reassigning EXP_FOLDER once per entry: the last one is retained.
EXP_FOLDER = EXP_FOLDER_CANDIDATES[-1]

In [None]:
# 5 runs - cvnxt-nano
EXP_FOLDER_CANDIDATES = [
    # 2D
    "../logs/2023-07-17/10/",
    "../logs/2023-07-18/4/",
    "../logs/2023-07-18/7/",
    "../logs/2023-07-18/12/",
    "../logs/2023-07-19/1/",
    # CNN frames 1234
    "../logs/2023-07-18/0/",
    "../logs/2023-07-18/6/",
    "../logs/2023-07-18/10/",
    "../logs/2023-07-19/0/",
    "../logs/2023-07-19/2/",
    # LSTM frames 1234
    "../logs/2023-07-19/10/",
    "../logs/2023-07-19/12/",
    "../logs/2023-07-19/15/",
    "../logs/2023-07-19/17/",
]

# Same outcome as reassigning EXP_FOLDER once per entry: the last one is retained.
EXP_FOLDER = EXP_FOLDER_CANDIDATES[-1]

In [None]:
# 5 runs - v2s
EXP_FOLDER_CANDIDATES = [
    # 2D
    "../logs/2023-07-19/22/",
    "../logs/2023-07-19/21/",
    "../logs/2023-07-19/20/",
    "../logs/2023-07-19/18/",
    "../logs/2023-07-20/1/",
    # LSTM frames 2345
    "../logs/2023-07-19/25/",
    "../logs/2023-07-19/24/",
    "../logs/2023-07-19/23/",
    "../logs/2023-07-19/19/",
    "../logs/2023-07-20/2/",
    # LSTM frames 01234567 stride 1
    "../logs/2023-07-22/8/",
    "../logs/2023-07-22/10/",
    "../logs/2023-07-22/16/",
    "../logs/2023-07-23/3/",
    "../logs/2023-07-23/7/",
    # v2s ext stride 2
    "../logs/2023-07-29/1/",
    "../logs/2023-07-29/0/",
    "../logs/2023-07-28/10/",
    "../logs/2023-07-28/9/",
    "../logs/2023-07-29/13/",
    "../logs/2023-07-30/0/",
    "../logs/2023-07-30/6/",
]

# Same outcome as reassigning EXP_FOLDER once per entry: the last one is retained.
EXP_FOLDER = EXP_FOLDER_CANDIDATES[-1]


In [None]:
# Debug runs only see the reduced validation subset; otherwise the whole dataframe is used.
inference_df = df_val if DEBUG else df
preds = kfold_inference(
    inference_df,
    EXP_FOLDER,
    use_fp16=USE_FP16,
    save=SAVE,
)

In [None]:
# preds = []
# os.makedirs(EXP_FOLDER + "pl_masks/", exist_ok=True)

# for fold in [1, 2, 3]:
#     print(f"\n- Fold {fold + 1}")
#     df_ = df[df['fold'] == fold].reset_index(drop=True)
    
#     pred_val = np.load(EXP_FOLDER + f"pred_val_{fold}.npy")
    
#     for i in tqdm(range(len(df_))):
#         mask = pred_val[i][0].astype(np.float32)
#         record_id = df_['record_id'][i]
        
#         np.save(EXP_FOLDER + "pl_masks/" + str(record_id) + ".npy", mask)

In [None]:
# preds_gpu = torch.from_numpy(preds[0]).cuda()
# truths_gpu = torch.from_numpy(gts).cuda()

# dices = {}
# for th in tqdm(np.round(np.arange(0.2, 0.6, 0.01), 2)):
#     dices[th] = dice_score_torch((preds_gpu > th), truths_gpu)
    
# th, dice = max(dices.items(), key=operator.itemgetter(1))
# print(f'-> CV dice :  {dice:.4f}   (th={th:.2f})')

In [None]:
# preds_gpu = torch.from_numpy(preds[0]).cuda()
# truths_gpu = torch.from_numpy(gts).cuda()

# dices = {}
# for th in tqdm(np.round(np.arange(0.2, 0.6, 0.01), 2)):
#     dices[th] = dice_score_torch((preds_gpu > th), truths_gpu)
    
# th, dice = max(dices.items(), key=operator.itemgetter(1))
# print(f'-> CV dice :  {dice:.4f}   (th={th:.2f})')

### Blends

In [None]:
# Experiments considered for blending. Disabled candidates are kept for reference.
EXP_FOLDERS = [
# #     "../logs/2023-07-03/35/",  # v2-s stride 2          - 0.6845
# #     "../logs/2023-07-05/35/",  # convnext-nano cont     - 0.6844

#     "../logs/2023-07-11/13/",  # v2-s lstm frames 01234 stride 1 - 0.6822
# #     "../logs/2023-07-11/20/",  # v2-s lstm frames 01234 stride 1 - 0.6817
# #     "../logs/2023-07-12/10/",  # v2-s cnn frames 23456 stride 1 - 0.6812

#     "../logs/2023-07-11/26/",  # v2-s lstm frames 234   stride 2 - 0.6846
# #     "../logs/2023-07-11/28/",  # v2-s lstm frames 1234  stride 2 - 0.6833
#     "../logs/2023-07-11/37/",  # cvnxt-nano lstm frames 1234 stride 2 - 0.6853
#     "../logs/2023-07-11/39/",  # cvnxt-nano lstm frames 1234 stride 2 - 0.6849

#     "../logs/2023-07-12/11/",  # v2-s cnn frames 345 stride 2 - 0.6843
#     "../logs/2023-07-12/13/",  # v2-s lstm frames 2345 stride 2 - 0.6866
#     "../logs/2023-07-12/16/", # cvnxt-nano cnn frames 1234 stride 2 - 0.6863
# #     "../logs/2023-07-12/17/", # cvnxt-nano cnn frames 345 stride 2 - 0.6843
#     "../logs/2023-07-12/21/", # cvnxt-nano lstm cnn frames 1234 stride 2 - 0.6860
#     "../logs/2023-07-12/22/", # cvnxt-nano lstm frames 2345 stride 2 - 0.6865

#     "../logs/2023-07-13/2/", # cvnxt-nano transfo frames 2345 stride 2 - 0.6837
#     "../logs/2023-07-13/3/", # cvnxt-nano transfo frames 1234 stride 2 - 0.6842
#     "../logs/2023-07-13/4/", # v2-s transfo frames 1234 stride 2 - 0.68

    "../logs/2023-07-18/0/",
    "../logs/2023-07-19/10/",
    "../logs/2023-07-19/12/",
]

# Load the `pred_val_0.npy` predictions of every experiment and move them to the GPU.
PREDS = {}
for exp_folder in tqdm(EXP_FOLDERS):
    PREDS[exp_folder] = torch.from_numpy(np.load(exp_folder + "pred_val_0.npy")).cuda()

In [None]:
truths_gpu = torch.from_numpy(gts).cuda()

# Blends scoring at or below this dice are neither reported nor counted.
MIN_DICE = 0.6995

# The thresholds do not depend on the blend: build them once.
thresholds = np.round(np.arange(0.4, 0.5, 0.01), 2)

used = []
seen_blends = set()
n_folders = len(EXP_FOLDERS)

# Enumerate every blend of 1 to 3 distinct experiments.
# (Add a fourth nested loop to search blends of up to 4 experiments.)
for i1 in tqdm(range(n_folders)):
    for i2 in range(i1, n_folders):
        for i3 in range(i2, n_folders):
            files = tuple(sorted({EXP_FOLDERS[i1], EXP_FOLDERS[i2], EXP_FOLDERS[i3]}))

            # Triples such as (a, a, b) and (a, b, b) collapse to the same blend
            # {a, b}: score it once so it is not reported and counted twice.
            if files in seen_blends:
                continue
            seen_blends.add(files)

            # Unweighted average of the experiments in the blend.
            preds_gpu = torch.stack([PREDS[f] for f in files], 0).mean(0)

            dices = {t: dice_score_torch((preds_gpu > t), truths_gpu) for t in thresholds}
            th, dice = max(dices.items(), key=operator.itemgetter(1))

            if dice > MIN_DICE:
                print("\t".join(files), f'\t-  CV dice :  {dice:.4f}   (th={th:.2f})')
                used += files

In [None]:
# Number of retained blends each experiment took part in.
count = Counter(sorted(used))

unused_folders = [folder for folder in EXP_FOLDERS if folder not in count]
print('Not used :', unused_folders)
count

In [None]:
# Candidate 5-model blend; the trailing figures are the scores noted by the author.
# NOTE(review): PREDS must contain these folders for the averaging cell further
# down to work, and this list is overwritten if the next EXP_FOLDERS cells are run.
EXP_FOLDERS = [  # 0.7016
    "../logs/2023-07-11/13/",  # v2-s lstm frames 01234 stride 1 - 0.6822
    "../logs/2023-07-11/26/",  # v2-s lstm frames 234 stride 2 - 0.6846
    "../logs/2023-07-12/13/",  # v2-s lstm frames 2345 stride 2 - 0.6866
    "../logs/2023-07-12/16/",  # cvnxt-nano cnn frames 1234 stride 2 - 0.6863
    "../logs/2023-07-12/21/",  # cvnxt-nano cnn+lstm frames 1234 stride 2 - 0.6860
]

In [None]:
# Candidate 2-model blend; the trailing figures are the scores noted by the author.
# NOTE(review): overwritten if the next EXP_FOLDERS cell is run.
EXP_FOLDERS = [  # 0.6997
    "../logs/2023-07-11/26/",  # v2-s lstm frames 234   stride 2 - 0.6846
    "../logs/2023-07-12/16/", # cvnxt-nano cnn frames 1234 stride 2 - 0.6863
]

In [None]:
# Blend currently evaluated: the same three experiments that are enabled in the
# candidate list at the top of this section.
EXP_FOLDERS = [
    "../logs/2023-07-18/0/",
    "../logs/2023-07-19/10/",
    "../logs/2023-07-19/12/",
]

In [None]:
# try:
#     preds = np.average([
#         np.load(f + "pred_val_0.npy") for f in EXP_FOLDERS
#     ], axis=0, weights=WEIGHTS)
#     print('Weighted avg !')
# except:

# EXP_FOLDERS may have been redefined since PREDS was built: load whatever is
# missing instead of failing with a KeyError in the averaging below.
for f in EXP_FOLDERS:
    if f not in PREDS:
        PREDS[f] = torch.from_numpy(np.load(f + "pred_val_0.npy")).cuda()

# Unweighted average of the selected experiments' predictions.
preds = torch.stack([PREDS[f] for f in EXP_FOLDERS], 0).mean(0)

In [None]:
truths_gpu = torch.from_numpy(gts).cuda()

# Dice of the blended prediction for thresholds from 0.2 upwards, in steps of 0.01.
thresholds = np.round(np.arange(0.2, 0.6, 0.01), 2)
dices = {t: dice_score_torch((preds > t), truths_gpu) for t in thresholds}

# Keep the threshold reaching the highest dice.
th, dice = max(dices.items(), key=operator.itemgetter(1))
print(f'-> CV dice :  {dice:.4f}   (th={th:.2f})')

In [None]:
# Dice as a function of the threshold, zoomed around the best threshold.
fig, ax = plt.subplots()
ax.plot(np.array(list(dices.keys())).astype(float), dices.values())
ax.axvline(th, c="salmon")
ax.set_xlim(th - 0.1, th + 0.1)
ax.set_ylim(dice - 0.01, dice + 0.002)
ax.set_title(f'dice={dice:.3f}, th={th:.2f}')
plt.show()

In [None]:
# upload_to_kaggle(EXP_FOLDERS, "../output/dataset_v1/", "Contrail Weights v1", update_folders=True)

### Multiple seeds

In [None]:
# Runs of the same configuration to compare and average.
# Disabled groups are kept for reference; figures are the scores noted by the author.
# 5 runs - cvnxt-nano cnn frames 1234 stride 2 - 0.6863
EXP_FOLDERS = [
    # cvnxt-nano
#     # 2D - 0.6855
#     "../logs/2023-07-17/10/",
#     "../logs/2023-07-18/4/" ,
#     "../logs/2023-07-18/7/" ,
#     "../logs/2023-07-18/12/" ,
#     "../logs/2023-07-19/1/" ,

# #     # CNN 1234 - 0.6900
#     "../logs/2023-07-18/0/" ,
#     "../logs/2023-07-18/6/" ,
#     "../logs/2023-07-18/10/" ,
#     "../logs/2023-07-19/0/" ,
#     "../logs/2023-07-19/2/",

#     # LSTM 1234 - 0.6890
#     "../logs/2023-07-19/10/",
#     "../logs/2023-07-19/12/",
#     "../logs/2023-07-19/15/",
#     "../logs/2023-07-19/17/",

    # v2s
#     # 2D
#     "../logs/2023-07-19/22/",
#     "../logs/2023-07-19/21/",
#     "../logs/2023-07-19/20/",
#     "../logs/2023-07-19/18/",
#     "../logs/2023-07-20/1/",

#     # LSTM 2345  - 0.6905
#     "../logs/2023-07-19/25/",
#     "../logs/2023-07-19/24/",
#     "../logs/2023-07-19/23/",
#     "../logs/2023-07-19/19/",
#     "../logs/2023-07-20/2/",

#     # LSTM 01234567  - 0.6860
#     "../logs/2023-07-22/8/",
#     "../logs/2023-07-22/10/",
#     "../logs/2023-07-22/16/",
#     "../logs/2023-07-23/3/",
#     "../logs/2023-07-23/7/" ,

    # Ext data 100 epochs
    "../logs/2023-07-29/1/",
#     "../logs/2023-07-29/0/",
#     "../logs/2023-07-28/10/",   # 200 ep
    "../logs/2023-07-28/9/",
    "../logs/2023-07-29/13/",
    "../logs/2023-07-30/0/",
    "../logs/2023-07-30/6/",
]

# Load the `pred_val_0.npy` predictions of every run and move them to the GPU.
PREDS = {}
for exp_folder in tqdm(EXP_FOLDERS):
    PREDS[exp_folder] = torch.from_numpy(np.load(exp_folder + "pred_val_0.npy")).cuda()

In [None]:
# upload_to_kaggle(EXP_FOLDERS, "../output/dataset_v1/", "Contrail Weights v1", update_folders=True)

In [None]:
def _best_threshold(preds, truths, thresholds):
    """
    Scores `preds > th` against `truths` for every threshold and keeps the best one.

    Args:
        preds: Predictions to binarise.
        truths: Ground truths, passed as-is to dice_score_torch.
        thresholds: Iterable of thresholds to try.

    Returns:
        tuple: (best threshold, its dice, dict mapping each threshold to its dice).
    """
    dices = {t: dice_score_torch((preds > t), truths) for t in thresholds}
    th, dice = max(dices.items(), key=operator.itemgetter(1))
    return th, dice, dices


truths_gpu = torch.from_numpy(gts).cuda()
thresholds = np.round(np.arange(0.2, 0.6, 0.01), 2)
scores = []

# Score of each run taken separately.
for exp_folder in EXP_FOLDERS:
    th, dice, dices = _best_threshold(PREDS[exp_folder], truths_gpu, thresholds)
    print(f'-> {exp_folder}\t-  CV dice :  {dice:.4f}   (th={th:.2f})')
    # Plain floats so np.mean / np.std work whatever scalar type dice_score_torch returns.
    scores.append(float(dice))

print(f'\n  --> Avg of CV : {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

# Score of the unweighted average of all runs.
# `dices`, `th` and `dice` are left describing this blend for the plot cell below.
preds = torch.stack([PREDS[f] for f in EXP_FOLDERS], 0).mean(0)
th, dice, dices = _best_threshold(preds, truths_gpu, thresholds)
print(f'  --> CV of avg : {dice:.4f}   (th={th:.2f})')

In [None]:
# Dice as a function of the threshold, zoomed around the best threshold.
fig, ax = plt.subplots()
ax.plot(np.array(list(dices.keys())).astype(float), dices.values())
ax.axvline(th, c="salmon")
ax.set_xlim(th - 0.1, th + 0.1)
ax.set_ylim(dice - 0.01, dice + 0.002)
ax.set_title(f'dice={dice:.3f}, th={th:.2f}')
plt.show()

### Plot errors
- TODO

Done ! 