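**About**: This notebook validates trained models: it scores saved predictions, breaks the errors down by participant and by sign, and re-runs inference on validation and training data.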

In [None]:
# Optional cell auto-formatter extension, left disabled:
# %load_ext nb_black
# Enable autoreload in mode 2 so edited modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src/

## Initialization

### Imports

In [None]:
import os

# Restrict this process to GPU index 1. The variable is set before torch is imported
# so the device mask is guaranteed to be in place before CUDA gets initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch

print(torch.__version__)

# After masking, the selected GPU is exposed as device 0. Guard the query so the cell
# reports the situation instead of raising when no CUDA device is visible.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device available"

In [None]:
import os
import re
import sys
import glob
import json
import time
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.metrics import *

# Silence FutureWarning and UserWarning for the whole session.
for _ignored in (FutureWarning, UserWarning):
    warnings.simplefilter("ignore", category=_ignored)

# Let pandas render up to 500 columns before truncating wide frames.
pd.set_option("display.max_columns", 500)

In [None]:
from params import *

from data.preparation import *
from data.dataset import SignDataset

from utils.metrics import *
from utils.plots import *
from utils.logger import Config

from inference.main import kfold_inference_val

## Experiments

In [None]:
# Experiment folder analysed by the cells below.
# Only one assignment is active; previously inspected experiments are kept commented
# out for reference so it is unambiguous which run is being validated.
# EXP_FOLDER = "../logs/2023-03-16/42/"
# EXP_FOLDER = "../logs/2023-03-25/6/"
# EXP_FOLDER = "../logs/2023-03-29/5/"
# EXP_FOLDER = "../logs/2023-03-30/3/"
# EXP_FOLDER = "../logs/2023-04-06/10/"
# EXP_FOLDER = "../logs/2023-04-07/12/"
# EXP_FOLDER = "../logs/2023-04-09/0/"
EXP_FOLDER = "../logs/2023-04-13/22/"

In [None]:
# Load the configuration saved alongside the experiment.
# A context manager closes the file handle instead of leaving it to the garbage collector.
with open(EXP_FOLDER + "config.json", "r") as config_file:
    config = Config(json.load(config_file))

In [None]:
# Build `df` with the project helper, pointing it at the processed-data folder
# recorded in the experiment config.
# NOTE(review): presumably one row per sequence with `target` / `sign` columns,
# as the cells below rely on — confirm in data/preparation.py.
df = prepare_data(DATA_PATH, config.processed_folder)

In [None]:
# EXP_FOLDERS = [
#     "../logs/2023-04-12/0/",
#     "../logs/2023-04-11/29/",
#     "../logs/2023-04-12/4/",
#     "../logs/2023-04-12/6/",
# ]

# pred_oof = np.mean([np.load(e + "pred_oof.npy") for e in EXP_FOLDERS], 0)
# df['pred'] = pred_oof.argmax(-1)

# score = accuracy(df['target'], pred_oof)
# print(f"-> CV acc : {score:.4f}")

In [None]:
# EXP_FOLDERS = [
#     "../logs/2023-04-12/2/",
#     "../logs/2023-04-12/0/",
# #     "../logs/2023-04-11/31/",
#     "../logs/2023-04-11/29/",
# #     "../logs/2023-04-11/28/",
# #     "../logs/2023-04-11/26/",
# ]

# pred_oof = np.mean([np.load(e + "pred_oof.npy") for e in EXP_FOLDERS], 0)
# df['pred'] = pred_oof.argmax(-1)

# score = accuracy(df['target'], pred_oof)
# print(f"-> CV acc : {score:.4f}")

In [None]:
# Attach the fold assignment of each sequence when `prepare_data` did not provide it.
if not ("fold" in df.columns):
    folds = pd.read_csv(config.folds_file)
    df = df.merge(
        folds,
        on=["participant_id", "sequence_id"],
        how="left",
    )

In [None]:
# Predictions saved with the experiment; the argmax over the last axis is stored
# as the predicted class index of each row.
pred_oof = np.load(f"{EXP_FOLDER}pred_oof.npy")
df["pred"] = pred_oof.argmax(axis=-1)

# `accuracy` is a project metric and receives the raw prediction array.
score = accuracy(df["target"], pred_oof)
print(f"-> CV acc : {score:.4f}")

In [None]:
# df = df[df['participant_id'] != 29302].reset_index(drop=True)
# score = accuracy(df['target'], df['pred'])
# print(f"-> CV acc : {score:.4f}")

In [None]:
# Boolean flag: True where the predicted class differs from the label.
df["error"] = df["target"].ne(df["pred"])

In [None]:
# Error rate (mean), number of samples (count) and number of errors (sum) per participant.
# `error` is selected *before* aggregating: aggregating every column first is wasteful and
# raises a TypeError on non-numeric columns with pandas >= 2.0. The resulting columns are
# unchanged: ("error", "mean"), ("error", "count"), ("error", "sum").
dfg = df.groupby("participant_id")[["error"]].agg(["mean", "count", "sum"])

In [None]:
# Show participants ordered from the lowest to the highest error rate.
dfg.sort_values(by=("error", "mean"))

In [None]:
# Mean error rate per sign, worst signs first, transposed into a single wide row for display.
# `error` is selected before averaging so that unrelated (possibly non-numeric) columns are
# never aggregated; averaging them raises a TypeError with pandas >= 2.0.
dfg = (
    df.groupby("sign")[["error"]]
    .mean()
    .sort_values("error", ascending=False)
    .T
)
dfg

In [None]:
# Class names, in the order in which the JSON mapping lists its keys.
# The file is read inside a context manager so the handle is closed, and `classes`
# is only ever bound to the final list (not first to the raw dict).
# NOTE(review): later cells index this list with class ids, which assumes the mapping
# is stored in index order — confirm against the data file.
with open(DATA_PATH + "sign_to_prediction_index_map.json", "r") as mapping_file:
    classes = list(json.load(mapping_file).keys())

In [None]:
# Raw-count confusion matrix (rows: ground truth, columns: prediction).
# `labels` pins the matrix to one row/column per known class, so it can always be indexed
# with positions of `classes` even if a class is missing from both targets and predictions.
cm = confusion_matrix(
    df["target"],
    df["pred"],
    labels=list(range(len(classes))),
    normalize=None,
)

In [None]:
# Print every off-diagonal confusion that occurs more than 50 times,
# together with the total number of samples of the ground-truth class.
n_classes = len(classes)
for i in tqdm(range(n_classes)):  # i: ground-truth class
    row_total = cm[i].sum()
    for j in range(n_classes):  # j: predicted class
        n = cm[i, j]
        if i == j or n <= 50:
            continue
        s = f"{classes[i]} predicted as {classes[j]} :".ljust(32)
        print(f"{s} {n} / {row_total}")

In [None]:
# plt.figure(figsize=(50, 50))
# plot_confusion_matrix(df['pred'], df['target'], display_labels=None)

### Exploration

In [None]:
# path_to_glove_file = "../input/glove/glove.6B.50d.txt"

# embeddings_index = {}
# with open(path_to_glove_file, "rb") as f:
#     for line in tqdm(f):
#         word, coefs = line.split(maxsplit=1)
#         coefs = np.fromstring(coefs, "f", sep=" ")
#         embeddings_index[word.decode("utf-8")] = coefs

# print("Found %s word vectors." % len(embeddings_index))

In [None]:
# import spacy
# nlp = spacy.load('en_core_web_md')

# rep = {
#     "callonphone": 'phonecall',
#     "frenchfries": 'fries',
#     "glasswindow": 'window',
#     "hesheit": 'he',
#     "minemy": 'my',
#     "weus": 'we',
#     "haveto": "have",
#     "owie": "bruise",
# }

# embed = []
# for c in tqdm(classes):
#     c = rep.get(c, c)
# #     vec = nlp(c).vector
#     vec = embeddings_index[c.lower()]
#     embed.append(vec)
# embed = np.array(embed)

# # np.save('../output/embed.npy', embed)

In [None]:
# dists = np.sqrt((embed[:, None] - embed[None]) ** 2).mean(-1)

# # dists = (embed[:, None] * embed[None]).sum(-1) / ((embed[None] ** 2).sum(-1) * (embed[:, None] ** 2).sum(-1))

In [None]:
# for i in tqdm(range(len(classes))):  # gt
#     order = np.argsort(dists[i])
#     print(np.array(classes)[order[:5]])
# #     break

### Inference on validation data

In [None]:
# Experiment whose validation predictions are checked in this section.
EXP_FOLDER = "../logs/2023-04-13/22/"

In [None]:
# Reload the config and the data for the experiment selected above.
with open(EXP_FOLDER + "config.json", "r") as config_file:
    config = Config(json.load(config_file))
df = prepare_data(DATA_PATH, config.processed_folder)

# Attach fold assignments when `prepare_data` did not already provide them.
if "fold" not in df.columns:
    folds = pd.read_csv(config.folds_file)
    df = df.merge(folds, how="left", on=["participant_id", "sequence_id"])

# Prefer the predictions saved for all folds; fall back to fold 0 only when that file is
# missing. Only FileNotFoundError is caught: a bare `except` would also hide real problems
# (e.g. a prediction array whose length does not match `df`) and silently report a
# fold-0 score instead.
try:
    pred_oof = np.load(EXP_FOLDER + "pred_oof.npy")
    score_name = "CV"
except FileNotFoundError:
    df = df[df["fold"] == 0].reset_index(drop=True)
    pred_oof = np.load(EXP_FOLDER + "pred_val_0.npy")
    score_name = "Fold 0"

df["pred"] = pred_oof.argmax(-1)

score = accuracy(df["target"], pred_oof)
print(f"-> {score_name} acc : {score:.4f}")

In [None]:
# Recompute predictions with the project inference entry point; this overwrites the
# `pred_oof` loaded from disk above. Nothing is written back (`save=False`).
# NOTE(review): `use_fp16` / `use_mt` presumably toggle half precision and a
# multi-threaded code path — confirm in inference/main.py.
val_inference_options = dict(debug=False, save=False, use_fp16=True, use_mt=False)
pred_oof = kfold_inference_val(df, EXP_FOLDER, **val_inference_options)

In [None]:
# pred = (pred_oof + np.load(EXP_FOLDER + "pred_val_0.npy")) / 2
# accuracy(df['target'], pred)

# df_val['pred_flip'] = pred.argmax(-1)
# df_val['error_flip'] = (df_val['pred_flip'] != df_val['target'])

# dfg = df_val.groupby('sign')[["error", "error_flip"]].mean()
# dfg["delta"] = dfg["error"] - dfg["error_flip"]

# dfg.sort_values('delta').T

# [classes.index(c) for c in dfg[dfg['delta'] < 0.01].index]

# [classes.index(c) for c in dfg[dfg['delta'] < -0.05].index]

### Inference on training data

In [None]:
# Experiment used for the training-data inference in this section.
EXP_FOLDER = "../logs/2023-04-11/27/"

In [None]:
# Reload the config and the data for the experiment selected above.
# The context manager closes the config file instead of leaking the handle.
with open(EXP_FOLDER + "config.json", "r") as config_file:
    config = Config(json.load(config_file))
df = prepare_data(DATA_PATH, config.processed_folder)

# Attach fold assignments when `prepare_data` did not already provide them.
if "fold" not in df.columns:
    folds = pd.read_csv(config.folds_file)
    df = df.merge(folds, how="left", on=["participant_id", "sequence_id"])

In [None]:
# Same inference entry point as for validation, this time called with `train=True`.
# NOTE(review): with `train=True` the result is indexed per entry below (`pred_oof[0]`),
# presumably one prediction array per fold model — confirm in inference/main.py.
train_inference_options = dict(debug=False, save=False, use_fp16=True, train=True)
pred_oof = kfold_inference_val(df, EXP_FOLDER, **train_inference_options)

In [None]:
# Store the predictions computed above inside the experiment folder.
np.save(f"{EXP_FOLDER}pred_oof_train.npy", pred_oof)

#### Checks

In [None]:
# Class predictions taken from the first entry of `pred_oof`.
first_entry_preds = pred_oof[0]
df["pred_0"] = first_entry_preds.argmax(-1)

# Keep every row that is NOT assigned to fold 0.
# NOTE(review): despite the `df_val` name these are presumably the rows the fold-0
# model was trained on — confirm.
outside_fold_0 = df["fold"] != 0
df_val = df.loc[outside_fold_0].reset_index(drop=True)

In [None]:
# Rows of `df_val` where `pred_0` disagrees with the label, re-indexed from 0.
is_misclassified = df_val["target"] != df_val["pred_0"]
df_err = df_val.loc[is_misclassified].reset_index(drop=True)

In [None]:
# Wrap the misclassified rows in the project dataset class so samples can be plotted below.
# NOTE(review): `max_len=None` / `train=False` presumably disable length cropping and
# training-time augmentation — confirm in data/dataset.py.
dataset = SignDataset(df_err, max_len=None, train=False)

In [None]:
# Visual check of the first misclassified samples: print the sequence id with the
# predicted and true sign, then plot a few frames of the sample.
# The loop stops after index 11, i.e. at most 12 samples are shown.
for i in tqdm(range(len(dataset))):
    data = dataset[i]

    sequence_id = df_err.at[i, "sequence_id"]
    predicted_sign = classes[df_err.at[i, "pred_0"]]
    true_sign = df_err.at[i, "sign"]

    print(sequence_id, "- pred :", predicted_sign, " - truth :", true_sign)
    plot_sample_with_edges(data, n_frames=4, figsize=(10, 10), show_text=False)

    if i > 10:
        break

Done!