**About**: This notebook validates trained models: it blends out-of-fold predictions, analyses prediction errors, and runs k-fold inference.

In [None]:
# %load_ext nb_black
# Enable IPython's autoreload extension in mode 2 so that edits to the
# project modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src/

## Initialization

### Imports

In [None]:
import os
import torch

# Restrict this process to physical GPU 1 (set before the first CUDA call).
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

print(torch.__version__)
# Last expression of the cell: displays the name of visible device 0.
torch.cuda.get_device_name(0)

In [None]:
import os
import re
import sys
import glob
import json
import time
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.metrics import *

# Silence Future/User warnings for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)

# Let wide dataframes display up to 500 columns.
pd.set_option("display.max_columns", 500)

In [None]:
from params import *

from data.preparation import *
from data.dataset import SignDataset

from utils.metrics import *
from utils.plots import *
from utils.logger import Config

from inference.main import kfold_inference_val

### Blend

In [None]:
# Build the base dataframe used by the blend cells (they read df['target']).
# The empty string is passed in the slot where later cells pass a
# processed-data folder name.
df = prepare_data(DATA_PATH, "")

In [None]:
# EXP_FOLDER = "../logs/2023-04-17/42/"  # 0.7265 / x5 0.7273 / MTx10 0.7274
# EXP_FOLDER = "../logs/2023-04-18/10/"  # 0.7245 / x5 0.7247 / MTx10 0.7249
# EXP_FOLDER = "../logs/2023-04-19/6/"  # 0.7245 / x10 0.7254 / MTx10 0.7254
# EXP_FOLDER = "../logs/2023-04-20/18/"  # 0.7247 / x10 0.7254 / MTx10 0.7256
# EXP_FOLDER = "../logs/2023-04-20/19/"  # 0.7250 / x10 0.7257 / MTx10 0.7254
# EXP_FOLDER = "../logs/2023-04-20/21/"  # 0.7243 / x10 0.7246 / MTx10 0.7244

# NOTE: FILES is reassigned several times in this cell and only the last
# assignment (further down) is consumed by the averaging code. The lists
# below are overwritten before being used - apparently kept as a log of
# earlier blends together with their recorded scores.
FILES = [  #  CV 0.7279 - LB 0.78
    "../logs/2023-04-14/37/pred_oof_dist.npy",  # 0.7225 / Dist 0.7196 (-0.003)
    "../logs/2023-04-15/7/pred_oof_dist.npy",  # 0.7228 / Dist 0.7176 (-0.005)
]

FILES = [  # CV 0.7340 - LB 0.78
    "../logs/2023-04-20/19/pred_oof_inf.npy",  # 0.7257
    "../logs/2023-04-20/18/pred_oof_mt.npy",  # 0.7254
]

FILES = [  # CV 0.7400 - LB 0.78
    "../logs/2023-04-22/7/pred_oof_mt.npy",  # 0.7310
    "../logs/2023-04-23/28/pred_oof_dist.npy",  # 0.7348
]

FILES = [  # CV 0.7439  - 0.78
    "../logs/2023-04-25/71/pred_oof_dist.npy",  # 0.7329 / torch_12
    "../logs/2023-04-28/7/pred_oof_dist.npy",  # 0.7359 / torch_19
]

FILES = [  # CV 0.7438 - 0.79
    "../logs/2023-04-28/5/pred_oof_dist_soup.npy",  # 0.7329 / torch_12
    "../logs/2023-04-28/7/pred_oof_dist_soup.npy",  # 0.7359 / torch_19
]

# FILES = [  # CV 0.7459
#     "../logs/2023-04-28/5/pred_oof_dist_soup.npy",  # 0.7329 / torch_12
#     "../logs/2023-04-28/6/pred_oof_dist_soup.npy",  # 0.7338 / torch_15 s
#     "../logs/2023-04-28/7/pred_oof_dist_soup.npy",  # 0.7359 / torch_19
# ]


# Effective FILES list for this cell: it overrides every assignment above.
# Only the uncommented entries are loaded and averaged.
FILES = [  # CV 0.7400 - LB 0.78
#     "../logs/2023-04-22/7/pred_oof_mt.npy",  # 0.7310
#     "../logs/2023-04-23/28/pred_oof_dist.npy",  # 0.7348
#     "../logs/2023-04-25/71/pred_oof_dist.npy",  # 0.7352


#     "../logs/2023-04-23/28/pred_oof_dist.npy",  # DISTx10 0.7348  torch_16

#     "../logs/2023-04-26/0/pred_oof_dist.npy",  # DISTx10 0.7359   torch_16
#     "../logs/2023-04-26/3/pred_oof_dist.npy",  # DIST 0.7355      torch_16
    
#     "../logs/2023-04-27/14/pred_oof_dist.npy",  # 0.7328 /  torch_15
#     "../logs/2023-04-27/15/pred_oof_dist.npy",  # 0.7304 /  torch_12
#     "../logs/2023-04-27/17/pred_oof_dist.npy",  # 0.7329 /  torch_18  -> SUB

#     "../logs/2023-04-27/19/pred_oof_dist_soup.npy",  # 0.7341 /  torch_15  -> SUB
#     "../logs/2023-04-27/19/pred_oof_dist.npy",  # 0.7334 /  torch_15  -> SUB
#     "../logs/2023-04-27/20/pred_oof_dist.npy",  # 0.7328 /  torch_12

#     "../logs/2023-04-28/4/pred_oof_dist_soup.npy",  # 0.7351 / torch_15
#     "../logs/2023-04-28/4/pred_oof_dist.npy",  # 0.7336 / torch_15
    
#      "../logs/2023-04-28/5/pred_oof_dist_soup.npy",  # 0.7329 / torch_12
#     "../logs/2023-04-28/5/pred_oof_dist.npy",  # 0.7321 / torch_12
    
#     "../logs/2023-04-28/6/pred_oof_dist_soup.npy",  # 0.7338 / torch_15 s
#     "../logs/2023-04-28/6/pred_oof_dist.npy",  # 0.7336 / torch_15 s
    
#     "../logs/2023-04-28/7/pred_oof_dist_soup.npy",  # 0.7359 / torch_19
    "../logs/2023-04-28/7/pred_oof_dist.npy",  # 0.7356 / torch_19
]

# Per-file blend weights. They are applied only when there is exactly one
# weight per entry of FILES; otherwise the cell uses a plain mean.
WEIGHTS = [1, 1, 1]

if len(FILES) != len(WEIGHTS):
    # Length mismatch: WEIGHTS is ignored.
    print('- Simple average')
    pred_oof = np.mean([np.load(path) for path in FILES], axis=0)
else:
    pred_oof = np.average(
        [np.load(path) for path in FILES], axis=0, weights=WEIGHTS
    )
    print('- Weighted average')

# Predicted class = argmax over the last axis of the blended predictions.
df['pred'] = pred_oof.argmax(axis=-1)

score = accuracy(df['target'], pred_oof)
print(f"\n-> CV acc : {score:.4f}")

In [None]:
# Candidate OOF prediction files for the pairwise blend search at the end of
# this cell. Only the uncommented entries are loaded; the trailing numbers
# are presumably each model's recorded CV score.
FILES = [  # CV 0.7400 - LB 0.78
    # BIG
#     "../logs/2023-04-23/28/pred_oof_dist.npy",  # 0.7348

#     "../logs/2023-04-23/28/pred_oof_dist.npy",  # DISTx10 0.7348  torch_16
    "../logs/2023-04-25/71/pred_oof_dist.npy", # DIST 0.7352  torch_12
    "../logs/2023-04-26/0/pred_oof_dist.npy",  # DIST 0.7359  torch_16
    "../logs/2023-04-26/3/pred_oof_dist.npy",  # DIST 0.7355  torch_16
    
    # SMALLER
    
    "../logs/2023-04-27/14/pred_oof_dist.npy",  # 0.7328 /  torch_15
    "../logs/2023-04-27/17/pred_oof_dist.npy",  # 0.7329 /  torch_18

#     "../logs/2023-04-27/19/pred_oof_dist_soup.npy",  # 0.7341 /  torch_15
    "../logs/2023-04-27/19/pred_oof_dist.npy",  # 0.7334 /  torch_15

    "../logs/2023-04-27/20/pred_oof_dist.npy",  # 0.7328 /  torch_12

#     "../logs/2023-04-28/4/pred_oof_dist_soup.npy",  # 0.7351 / torch_15
    "../logs/2023-04-28/4/pred_oof_dist.npy",  # 0.7336 / torch_15
    
#      "../logs/2023-04-28/5/pred_oof_dist_soup.npy",  # 0.7329 / torch_12
    "../logs/2023-04-28/5/pred_oof_dist.npy",  # 0.7321 / torch_12
    
#     "../logs/2023-04-28/6/pred_oof_dist_soup.npy",  # 0.7338 / torch_15 s
    "../logs/2023-04-28/6/pred_oof_dist.npy",  # 0.7336 / torch_15 s
    
#     "../logs/2023-04-28/7/pred_oof_dist_soup.npy",  # 0.7359 / torch_19
    "../logs/2023-04-28/7/pred_oof_dist.npy",  # 0.7356 / torch_19
]

# Load every candidate prediction array once, in the same order as FILES.
FILES_ = [np.load(f) for f in tqdm(FILES)]

# Score the equal-weight average of every unordered pair of candidates and
# report the pairs whose accuracy beats the threshold. A file paired with
# itself is included, which yields that model's own score.
already_found = []
for i, f1 in enumerate(tqdm(FILES_)):
    # Start at i: the average is symmetric, so the pair (j, i) with j < i has
    # already been scored as (i, j). This roughly halves the evaluations
    # without changing what gets printed.
    for j in range(i, len(FILES_)):
        f2 = FILES_[j]

        pred_oof = (f1 + f2) / 2
        score = accuracy(df['target'], pred_oof)
        if score > 0.7435:
            found = " ".join(sorted([FILES[i], FILES[j]]))

            # Still required when FILES lists the same path more than once.
            if found not in already_found:
                already_found.append(found)
                print(f"\n-> CV acc : {score:.4f}  - {found}")

In [None]:
# Candidate OOF prediction files for the three-model blend search at the end
# of this cell. Only the uncommented entries are loaded; the trailing numbers
# are presumably each model's recorded CV score.
FILES = [  # CV 0.7400 - LB 0.78
    # BIG
#     "../logs/2023-04-23/28/pred_oof_dist.npy",  # 0.7348
#     "../logs/2023-04-25/71/pred_oof_dist.npy",  # 0.7353
#     "../logs/2023-04-23/28/pred_oof_dist.npy",  # DISTx10 0.7348  torch_16
#     "../logs/2023-04-25/71/pred_oof_dist.npy", # DISTx10  0.7353  torch_12
#     "../logs/2023-04-26/0/pred_oof_dist.npy",  # DISTx10 0.7359   torch_16
#     "../logs/2023-04-26/3/pred_oof_dist.npy",  # DIST 0.7355      torch_16
    
    # SMALLER
    
    "../logs/2023-04-27/14/pred_oof_dist.npy",  # 0.7328 /  torch_15
#     "../logs/2023-04-27/17/pred_oof_dist.npy",  # 0.7329 /  torch_18

    "../logs/2023-04-27/19/pred_oof_dist_soup.npy",  # 0.7341 /  torch_15
#     "../logs/2023-04-27/19/pred_oof_dist.npy",  # 0.7334 /  torch_15

    "../logs/2023-04-27/20/pred_oof_dist.npy",  # 0.7328 /  torch_12

    "../logs/2023-04-28/4/pred_oof_dist_soup.npy",  # 0.7351 / torch_15
#     "../logs/2023-04-28/4/pred_oof_dist.npy",  # 0.7336 / torch_15
    
     "../logs/2023-04-28/5/pred_oof_dist_soup.npy",  # 0.7329 / torch_12
#     "../logs/2023-04-28/5/pred_oof_dist.npy",  # 0.7321 / torch_12
    
    "../logs/2023-04-28/6/pred_oof_dist_soup.npy",  # 0.7338 / torch_15 s
#     "../logs/2023-04-28/6/pred_oof_dist.npy",  # 0.7336 / torch_15 s
    
    "../logs/2023-04-28/7/pred_oof_dist_soup.npy",  # 0.7359 / torch_19
#     "../logs/2023-04-28/7/pred_oof_dist.npy",  # 0.7356 / torch_19
]

# Load every candidate prediction array once, in the same order as FILES.
FILES_ = [np.load(f) for f in tqdm(FILES)]

# for i, f1 in enumerate(tqdm(FILES_)):
#     for j, f2 in enumerate(FILES_):
#         pred_oof = (f1 + f2) / 2
#         score = accuracy(df['target'], pred_oof)
#         if score > 0.743:
#             print(f"\n-> CV acc : {score:.4f}  - {FILES[i]} - {FILES[j]}")

# Score the equal-weight average of every unordered triple of candidates
# (repeats allowed, so a file can count two or three times) and report the
# triples whose accuracy beats the threshold.
already_found = []
n_files = len(FILES_)
for i, f1 in enumerate(tqdm(FILES_)):
    # Only visit i <= j <= k: any permutation of a triple gives the same
    # blend, so the other orderings are redundant. This cuts n**3
    # evaluations down to roughly n**3 / 6. The ordering kept is the first
    # one the full triple loop would visit, so the summation order of the
    # arrays is the same as for that first visit.
    for j in range(i, n_files):
        f2 = FILES_[j]
        for k in range(j, n_files):
            f3 = FILES_[k]
            pred_oof = (f1 + f2 + f3) / 3
            score = accuracy(df['target'], pred_oof)
            if score > 0.746:
                found = " ".join(sorted([FILES[i], FILES[j], FILES[k]]))

                # Still required when FILES lists the same path more than once.
                if found not in already_found:
                    already_found.append(found)
                    print(f"\n-> CV acc : {score:.4f}  - {found}")

## Experiments

In [None]:
# Experiment folder analysed in this section. Only the last assignment takes
# effect; the earlier ones are overwritten before use. The trailing numbers
# are presumably the recorded CV scores.
EXP_FOLDER = "../logs/2023-04-21/31/"  # 0.7267
# EXP_FOLDER = "../logs/2023-04-22/2/"  # 0.7310
EXP_FOLDER = "../logs/2023-04-23/28/"  # 0.7348 dist

EXP_FOLDER = "../logs/2023-04-25/66/"

In [None]:
# Load the configuration saved with the experiment. The context manager
# closes the file handle, which the bare open() call left dangling.
with open(EXP_FOLDER + "config.json", "r") as config_file:
    config = Config(json.load(config_file))

In [None]:
# NOTE(review): the first result is overwritten immediately by the second
# call, so the dataframe actually used comes from the hard-coded "torch_15/"
# folder rather than from config.processed_folder.
df = prepare_data(DATA_PATH, config.processed_folder)
df = prepare_data(DATA_PATH, "torch_15/")

In [None]:
# Attach the fold assignment from the folds file when the dataframe does not
# already carry a "fold" column.
if "fold" not in df:
    folds = pd.read_csv(config.folds_file)
    df = pd.merge(df, folds, how="left", on=["participant_id", "sequence_id"])

In [None]:
# NOTE(review): both loads below are commented out, so this cell does not
# define pred_oof itself. It reuses whatever value an earlier cell left in
# the kernel (e.g. the last blend computed by the search loops), while df
# has just been rebuilt. Re-enable one of the loads to make the cell
# self-contained.
# pred_oof = np.load(EXP_FOLDER + "pred_oof.npy")
# pred_oof = np.load(EXP_FOLDER + "pred_oof_dist.npy")

# Predicted class = argmax over the last axis of the predictions.
df['pred'] = pred_oof.argmax(-1)

score = accuracy(df['target'], pred_oof)
print(f"-> CV acc : {score:.4f}")

In [None]:
# Wrap the dataframe in the project dataset, with max_len=None.
# NOTE(review): presumably max_len=None disables length truncation - confirm in data.dataset.
dataset = SignDataset(df, max_len=None)
# dataset.fill_buffer(tqdm_enabled=True)

In [None]:
# lens = [len(v) for v in tqdm(dataset.buffer.values())]
# # np.save('../output/lens.npy', np.array(lens))
# # lens = np.load('../output/lens.npy')
# df['len'] = lens

# sns.countplot(x=lens)

In [None]:
# Flag every row whose predicted class differs from its target.
df['error'] = df['target'].ne(df['pred'])

In [None]:
# Predicted classes (argmax over the last axis) from three saved OOF
# prediction files, stored under the suffixes 40 / 80 / 20.
df['pred_40'] = np.load("../logs/2023-04-27/14/pred_oof_dist.npy").argmax(axis=-1)  # 0.7328 /  torch_15
df['pred_80'] = np.load("../logs/2023-04-27/17/pred_oof_dist.npy").argmax(axis=-1)  # 0.7329 /  torch_18
df['pred_20'] = np.load("../logs/2023-04-27/20/pred_oof_dist.npy").argmax(axis=-1)  # 0.7328 /  torch_12

# Per-row error flags for each of the three variants.
df['error_40'] = df['target'].ne(df['pred_40'])
df['error_80'] = df['target'].ne(df['pred_80'])
df['error_20'] = df['target'].ne(df['pred_20'])

# Displayed result: mean error rate of the 20 / 40 / 80 variants, in that order.
tuple(df[col].mean() for col in ('error_20', 'error_40', 'error_80'))

In [None]:
# dfg = df.groupby('participant_id').agg(['mean', 'count', 'sum'])[['error']]
# dfg.sort_values(('error',  'mean'))

In [None]:
# Fraction of rows whose 'len' value exceeds 80.
# NOTE(review): the only visible assignment of df['len'] is in a commented-out
# cell above, so this presumably relies on prepare_data providing it - confirm.
df['len'].gt(80).mean()

In [None]:
# df['len'] = np.clip(df['len'] // 10 * 10 + 10, 0, 200)
# Cap 'len' at 100 so every larger value falls into the last bar of the plot.
df['len'] = df['len'].clip(lower=0, upper=100)
plt.figure(figsize=(10, 5))
sns.countplot(x=df['len'])

In [None]:
# Mean and sum of the three error flags for each (clipped) length value.
# The error columns are selected BEFORE aggregating: aggregating the whole
# frame first also tries to reduce every other column, which is wasted work
# and raises on non-numeric columns in recent pandas versions. The resulting
# table has the same columns as before.
dfg = df.groupby('len')[['error_20', 'error_40', 'error_80']].agg(['mean', 'sum'])
# dfg.sort_values(('error',  'mean'))

In [None]:
# Display the per-length error table computed in the previous cell.
dfg

In [None]:
plt.figure(figsize=(15, 10))
# One scatter series per variant: mean error rate against the length value.
for tag in ("20", "40", "80"):
    plt.scatter(dfg.index, dfg[(f'error_{tag}', 'mean')], label=tag, marker="x")
plt.legend()
plt.grid()
# plt.yscale('log')
plt.show()

In [None]:
max_len = 80

# For each sequence size below 200, compute the stride (div) and the size
# that remains after applying it.
szs, divs = [], []
for sz in range(200):
    # Stride is 1 up to max_len; above it, the integer part of sz / max_len + 1.
    div = int(sz / max_len + 1) if sz > max_len else 1
    divs.append(div)
    szs.append(sz // div)


# plt.plot(szs, label="size")
plt.plot(divs, label="stride")
plt.legend()
plt.grid()
plt.show()

In [None]:
# dfg = df.groupby('len').agg(['mean', 'count', 'sum'])[['error']]
# dfg.sort_values(('error',  'mean'))

In [None]:
# Mean error rate per sign, worst first, transposed into a single wide row.
# 'error' is selected BEFORE taking the mean: averaging the whole frame first
# also tries to average every other column, which raises on non-numeric
# columns in recent pandas versions.
dfg = df.groupby('sign')[['error']].mean().sort_values('error', ascending=False).T
dfg

In [None]:
# Read the sign -> prediction-index map and keep only the sign names, in file
# order. The context manager closes the file handle, which the bare open()
# call left dangling.
# NOTE(review): later cells index this list by class id, which assumes the
# JSON keys are stored in prediction-index order - confirm.
with open(DATA_PATH + "sign_to_prediction_index_map.json", "r") as classes_file:
    classes = list(json.load(classes_file))

In [None]:
# Raw (un-normalised) confusion matrix; the loop in the next cell reads it as
# cm[ground-truth, prediction].
# NOTE(review): confusion_matrix arrives through a star import (presumably
# sklearn.metrics, unless utils.metrics shadows it) - confirm.
cm = confusion_matrix(df['target'], df['pred'], normalize=None)

In [None]:
# List every off-diagonal confusion that occurs more than 50 times, together
# with the row total for that ground-truth class.
for i in tqdm(range(len(classes))):  # ground-truth class index
    for j in range(len(classes)):  # predicted class index
        n = cm[i, j]
        if i == j or n <= 50:
            continue
        s = f"{classes[i]} predicted as {classes[j]} :".ljust(32)
        print(f"{s} {n} / {cm[i].sum()}")

In [None]:
# plt.figure(figsize=(50, 50))
# plot_confusion_matrix(df['pred'], df['target'], display_labels=None)

### Inference (validation)

In [None]:
# Experiment folder used for validation inference. Only the last uncommented
# assignment takes effect; the others are overwritten or disabled. The
# trailing comments are presumably recorded scores and processed-data folders.
EXP_FOLDER = "../logs/2023-04-23/27/"   # 0.7302 / DIST 0.7324 / DISTx10 0.7327
# EXP_FOLDER = "../logs/2023-04-23/28/"   # 0.7295 / DIST 0.7341 / DISTx10 0.7348

# EXP_FOLDER = "../logs/2023-04-25/66/"  # 0.7321 / x10  0.7315  / MTx10 0.7319
EXP_FOLDER = "../logs/2023-04-25/71/" # 0.7322 / DIST 0.7352 / DISTx10 0.7353
EXP_FOLDER = "../logs/2023-04-26/0/"  # 0.7317 / DIST 0.7353 / DISTx10 0.7359
EXP_FOLDER = "../logs/2023-04-26/3/"  # 0.7338 / DIST 0.7355 / DISTx10 0.7351

# # Smaller dist
# EXP_FOLDER = "../logs/2023-04-27/14/" #  0.7319 / DIST 0.7328 /   torch_15
# EXP_FOLDER = "../logs/2023-04-27/15/" #  0.7307 / DIST 0.7304 /   torch_12
# EXP_FOLDER = "../logs/2023-04-27/17/" #  0.7322 / DIST 0.7329 /   torch_18
# EXP_FOLDER = "../logs/2023-04-27/19/" #  0.7328 / DIST 0.7334 / DISTx10 0.7341   torch_15
# # EXP_FOLDER = "../logs/2023-04-27/20/" #  0.7316 / DIST 0.7328 /   torch_12

# # EXP_FOLDER = "../logs/2023-04-28/4/"  #  0.7343 / DIST 0.7336 / DISTx10 0.7351    torch_15
# EXP_FOLDER = "../logs/2023-04-28/5/"  #  0.7321 / DIST 0.7321 / DISTx10 0.7329    torch_12
# EXP_FOLDER = "../logs/2023-04-28/6/"  #  0.7338 / DIST 0.7335 / DISTx10 0.7338    torch_15 s
# EXP_FOLDER = "../logs/2023-04-28/7/"  #  0.7342 / DIST 0.7356 / DISTx10 0.7359    torch_19

In [None]:
# Load the configuration saved with the experiment. The context manager
# closes the file handle, which the bare open() call left dangling.
with open(EXP_FOLDER + "config.json", "r") as config_file:
    config = Config(json.load(config_file))
df = prepare_data(DATA_PATH, config.processed_folder)

# Attach the fold assignment when the dataframe does not already carry it.
if "fold" not in df.columns:
    folds = pd.read_csv(config.folds_file)
    df = df.merge(folds, how="left", on=["participant_id", "sequence_id"])

# Prefer the full out-of-fold predictions and fall back to the fold-0
# validation predictions only when the OOF file does not exist. Catching
# FileNotFoundError specifically (instead of a bare except) keeps genuine
# problems - shape mismatches, scoring errors, keyboard interrupts - from
# being silently reported as a fold-0 score.
try:
    pred_oof = np.load(EXP_FOLDER + "pred_oof.npy")
    score_label = "CV"
except FileNotFoundError:
    df = df[df['fold'] == 0].reset_index(drop=True)
    pred_oof = np.load(EXP_FOLDER + "pred_val_0.npy")
    score_label = "Fold 0"

# Predicted class = argmax over the last axis of the predictions.
df['pred'] = pred_oof.argmax(-1)

score = accuracy(df['target'], pred_oof)
print(f"-> {score_label} acc : {score:.4f}")

In [None]:
# Re-run k-fold validation inference for EXP_FOLDER and keep the returned
# predictions in pred_oof.
# NOTE(review): save=True presumably writes the predictions under EXP_FOLDER,
# and distilled / use_mt / n_soup presumably select which checkpoints are
# used - confirm in inference.main.
pred_oof = kfold_inference_val(
    df,
    EXP_FOLDER,
    debug=False,
    save=True,
    use_fp16=True,
    use_mt=False,
    distilled=True,
    n_soup=1,
)

### Inference (train)

In [None]:
# Experiment folder used for the train-mode inference in this section.
EXP_FOLDER = "../logs/2023-04-11/27/"

In [None]:
# Load the configuration saved with the experiment. The context manager
# closes the file handle, which the bare open() call left dangling.
with open(EXP_FOLDER + "config.json", "r") as config_file:
    config = Config(json.load(config_file))
df = prepare_data(DATA_PATH, config.processed_folder)

# Attach the fold assignment when the dataframe does not already carry it.
if "fold" not in df.columns:
    folds = pd.read_csv(config.folds_file)
    df = df.merge(folds, how="left", on=["participant_id", "sequence_id"])

In [None]:
# Run k-fold inference with train=True and without saving (save=False); the
# result is written to disk explicitly in the next cell.
# NOTE(review): later cells index the result as pred_oof[0], so with
# train=True this presumably returns one prediction array per fold model -
# confirm in inference.main.
pred_oof = kfold_inference_val(
    df,
    EXP_FOLDER,
    debug=False,
    save=False,
    use_fp16=True,
    train=True
)

In [None]:
# Persist the train-mode predictions inside the experiment folder.
np.save(EXP_FOLDER + "pred_oof_train.npy", pred_oof)

#### Checks

In [None]:
# Predicted classes from the first entry of pred_oof (argmax over the last axis).
df['pred_0'] = pred_oof[0].argmax(-1)
# Keep only the rows that are NOT in fold 0.
df_val = df.loc[df['fold'].ne(0)].reset_index(drop=True)

In [None]:
# Rows of df_val where the pred_0 prediction disagrees with the target.
df_err = df_val.loc[df_val['target'].ne(df_val['pred_0'])].reset_index(drop=True)

In [None]:
# Dataset over the misclassified rows only, built with max_len=None and train=False.
# NOTE(review): train=False presumably disables training-time augmentation - confirm in data.dataset.
dataset = SignDataset(df_err, max_len=None, train=False)

In [None]:
# Print and plot the first 12 misclassified samples (indices 0 to 11).
for i in tqdm(range(len(dataset))):
#     i = 92284
    data = dataset[i]

#     for k in data.keys():
#         print(k, data[k].size())

    print(
        df_err.loc[i, 'sequence_id'],
        "- pred :",
        classes[df_err.loc[i, 'pred_0']],
        " - truth :",
        df_err.loc[i, 'sign'],
    )
    plot_sample_with_edges(data, n_frames=4, figsize=(10, 10), show_text=False)

    # Stop once sample 11 has been shown.
    if i >= 11:
        break

Done!