**About**: This notebook runs inference with trained models and analyzes their predictions.

In [None]:
# %load_ext nb_black
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src/

## Initialization

### Imports

In [None]:
import os
import torch

print(torch.__version__)
# Expose only GPU "1" to this process, so device index 0 below is that GPU.
# NOTE(review): this only takes effect if CUDA has not been initialized yet — confirm nothing touches CUDA earlier.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"
torch.cuda.get_device_name(0)

In [None]:
import os
import re
# import cv2
import sys
import glob
import json
import time
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.metrics import *

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)
pd.set_option('display.max_columns', 500)

In [None]:
from utils.logger import Config, upload_to_kaggle

from params import *
from data.dataset import SignDataset
from data.preparation import *

from utils.metrics import *
from utils.plots import plot_sample
from utils.plots import *

### Openhands

In [None]:
# pip install omegaconf torchmetrics pytorch_lightning pytorchvideo hydra-core natsort

In [None]:
model = SLGCN()

In [None]:
# Load the data prepared in the OpenHands folder and keep a single sign.
df = prepare_data(DATA_PATH, processed_folder="../input/openhands/")
df = df[df['sign'] == "shower"]

dataset = SignDataset(df, max_len=30, train=False)

BS = 256

# Draw BS random sample indices (with replacement); use range(BS) for a fixed batch instead.
sample_ids = np.random.randint(len(dataset), size=BS)
batch = [dataset[idx] for idx in sample_ids]

# Collate: stack each field of the samples along a new leading batch axis.
x = {
    key: torch.stack([sample[key] for sample in batch])  # .cuda()
    for key in batch[0]
}

In [None]:
# Inference only: disable autograd so no computation graph is kept for the batch.
with torch.no_grad():
    y, out_torch = model(x)

In [None]:
y.size()

In [None]:
pred = y.detach().numpy().argmax(-1)
gt = x['target'].numpy().flatten()

In [None]:
# `classes` is needed here, but the notebook otherwise only builds it much later
# (experiment analysis section), so this cell failed on a fresh kernel.
# Assumes the JSON keys are listed in label-index order — TODO confirm.
with open(DATA_PATH + "sign_to_prediction_index_map.json", "r") as f:
    classes = list(json.load(f).keys())

# Map integer targets to their sign names.
gt_classes = np.array(classes)[gt.astype(int)]

In [None]:
# Gloss names read from the WLASL metadata file.
# NOTE(review): presumably the model's output indices follow the order of this file — confirm.
wsasl_classes = pd.read_json('../input/wlasl/WLASL_v0.3.json')['gloss'].values
# Map predicted indices to gloss names.
pred_classes = wsasl_classes[pred.astype(int)]

In [None]:
(pred_classes == gt_classes).sum()

In [None]:
# One type id per landmark: 7 of type 1, then 10 of type 2, then 10 of type 3.
landmark_types = torch.tensor([1] * 7 + [2] * 10 + [3] * 10).unsqueeze(0)

# Plot every sample whose predicted class name matches the ground truth.
correct_ids = np.where(pred_classes == gt_classes)[0]
for idx in correct_ids:
    d = out_torch[idx]
    # Repeat the type row d.size(1) times (presumably once per frame — TODO confirm).
    types = landmark_types.repeat(d.size(1), 1)
    data = dict(x=d[0], y=d[1], type=types)

    plot_sample(data, n_frames=9, figsize=(10, 10))

In [None]:
# Plot the first sample of `out_torch`.
d = out_torch[0]
data = {
    "x": d[0],
    "y": d[1],
    # One type id per landmark (7 of type 1, 10 of type 2, 10 of type 3), repeated d.size(1) times.
    "type": torch.tensor([1] * 7 + [2] * 10 + [3] * 10).unsqueeze(0).repeat(d.size(1), 1)
}

plot_sample(data, n_frames=9, figsize=(10, 10))

#### Reference

In [None]:
# Build the reference model from its saved configuration file.
# NOTE(review): `OpenHandModel` and `omegaconf` are not imported explicitly in the visible cells —
# presumably they come in through one of the star imports; confirm on a fresh kernel.
model = OpenHandModel(omegaconf.OmegaConf.load("../input/weights/st_gcn/config.yaml"))

In [None]:
model.init_from_checkpoint_if_available(verbose=1)

In [None]:
preds, truths = model.test_inference()

In [None]:
(preds.argmax(-1) == truths).mean()

In [None]:
dataloader = model.datamodule.test_dataloader()
batch = next(iter(dataloader))

In [None]:
# Pick one sample of the reference dataloader batch.
idx = 15
out_torch = batch['frames'][idx]

# Name of its label, and whether that gloss also exists in our class list.
gt = wsasl_classes[batch['labels'][idx]]
print(gt, gt in classes)

# One type id per landmark (7 / 10 / 10), repeated out_torch.size(1) times.
landmark_types = torch.tensor([1] * 7 + [2] * 10 + [3] * 10).unsqueeze(0)
data = dict(
    x=out_torch[0],
    y=out_torch[1],
    type=landmark_types.repeat(out_torch.size(1), 1),
)

plot_sample(data, n_frames=9, figsize=(10, 10))

## Experiments

In [None]:
# EXP_FOLDER = "../logs/2023-03-16/42/"
EXP_FOLDER = "../logs/2023-03-25/6/"

In [None]:
# Read the experiment configuration; the context manager closes the file handle,
# which the bare open() call left dangling.
with open(EXP_FOLDER + "config.json", "r") as f:
    config = Config(json.load(f))

In [None]:
df = prepare_data(DATA_PATH, config.processed_folder)

In [None]:
# Attach fold assignments when the prepared dataframe does not already carry them.
# The guard keeps the cell safe to re-run (no duplicated fold columns).
if "fold" not in df.columns:
    folds = pd.read_csv(config.folds_file)
    # Left join keeps every row of df; rows are matched on (participant_id, sequence_id).
    df = df.merge(folds, how="left", on=["participant_id", "sequence_id"])

In [None]:
len(df)

In [None]:
# Load the predictions saved by the experiment and score them against the targets.
# NOTE(review): assumes pred_oof rows are aligned with df rows — confirm.
pred_oof = np.load(EXP_FOLDER + "pred_oof.npy")
score = accuracy(df['target'], pred_oof)
print(f"-> CV acc : {score:.4f}")

In [None]:
df['pred'] = pred_oof.argmax(-1)

In [None]:
df['error'] = (df['target'] != df['pred'])

In [None]:
# Per-participant error rate, sample count and number of errors.
# Select the column *before* aggregating: aggregating the whole frame also tries to
# average non-numeric columns such as `sign`, which raises in pandas >= 2.0.
dfg = df.groupby('participant_id')[['error']].agg(['mean', 'count', 'sum'])

In [None]:
dfg.sort_values(('error',  'mean'))

In [None]:
# Error rate per sign, worst first, transposed so the signs read as columns.
# Select `error` before averaging so non-numeric columns are never aggregated
# (averaging them raises in pandas >= 2.0).
dfg = df.groupby('sign')[['error']].mean().sort_values('error', ascending=False).T
dfg

In [None]:
# Sign names taken from the keys of the sign -> index map.
# Assumes the keys are listed in label-index order — TODO confirm.
# The context manager closes the file handle, which the bare open() call left dangling.
with open(DATA_PATH + "sign_to_prediction_index_map.json", "r") as f:
    classes = list(json.load(f).keys())

In [None]:
# Raw confusion counts (rows: truth, columns: prediction).
# Passing `labels` pins the matrix to len(classes) x len(classes), so cm[i, j] stays
# aligned with classes[i] / classes[j] even if a class never appears in targets or predictions.
cm = confusion_matrix(
    df['target'], df['pred'], labels=np.arange(len(classes)), normalize=None
)

In [None]:
(df['target'] == 0).sum()

In [None]:
65000 / len(df)

In [None]:
path_to_glove_file = "../input/glove.6B.50d.txt"

# Parse the GloVe text file: each line is a token followed by its space-separated coefficients.
embeddings_index = {}
with open(path_to_glove_file, "rb") as f:
    for raw_line in tqdm(f):
        token, values = raw_line.split(maxsplit=1)
        embeddings_index[token.decode("utf-8")] = np.fromstring(values, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# NOTE(review): the spaCy model is loaded here but, in the visible cells, `nlp` is only
# referenced by the commented-out line below — consider dropping the load if unused.
import spacy
nlp = spacy.load('en_core_web_md')

# Replacements for class names that are looked up under a different word in the embedding index.
rep = {
    "callonphone": 'phonecall',
    "frenchfries": 'fries',
    "glasswindow": 'window',
    "hesheit": 'he',
    "minemy": 'my',
    "weus": 'we',
    "haveto": "have",
    "owie": "bruise",
}

# One embedding vector per class, in the order of `classes`.
embed = []
for c in tqdm(classes):
    c = rep.get(c, c)
#     vec = nlp(c).vector
    # Raises KeyError if the (replaced, lower-cased) class name is missing from the index.
    vec = embeddings_index[c.lower()]
    embed.append(vec)
embed = np.array(embed)

In [None]:
np.save('../output/embed.npy', embed)

In [None]:
# Pairwise distance between class embeddings: root-mean-square difference over the
# embedding axis. The square root has to be applied after the reduction; applied
# element-wise first it merely undoes the square and yields a mean absolute difference.
dists = np.sqrt(((embed[:, None] - embed[None]) ** 2).mean(-1))

# dists = (embed[:, None] * embed[None]).sum(-1) / ((embed[None] ** 2).sum(-1) * (embed[:, None] ** 2).sum(-1))

In [None]:
# For every class, print the 5 classes with the smallest embedding distance
# (sorted by increasing distance).
class_names = np.array(classes)
for i in tqdm(range(len(classes))):  # gt
    order = np.argsort(dists[i])
    closest = order[:5]
    print(class_names[closest])

In [None]:
order

In [None]:
# List every off-diagonal confusion that occurs more than 50 times.
for i in tqdm(range(len(classes))):  # gt
    row_total = cm[i].sum()
    for j in range(len(classes)):
        n = cm[i, j]
        if i == j or n <= 50:
            continue
        s = f"{classes[i]} predicted as {classes[j]} :".ljust(32)
        print(f"{s} {n} / {row_total}")

In [None]:
cm[0]

In [None]:
def plot_confusion_matrix(
    y_pred,
    y_true,
    cm=None,
    normalize="true",
    display_labels=None,
    cmap="viridis",
):
    """
    Plots a confusion matrix, computing it first unless one is provided.

    Args:
        y_pred (numpy array): Predictions. Unused when cm is given.
        y_true (numpy array): Truths. Unused when cm is given.
        cm (numpy array or None, optional): Precomputed square confusion matrix. Defaults to None.
        normalize (str or None, optional): Normalization mode forwarded to confusion_matrix.
            None keeps raw counts. Defaults to "true".
        display_labels (list of strings or None, optional): Axis labels. Defaults to None.
        cmap (str, optional): Colormap name. Defaults to "viridis".
    """
    if cm is None:
        cm = confusion_matrix(y_true, y_pred, normalize=normalize)
    cm = np.asarray(cm)
#     cm = cm[::-1, :]

    # Raw counts are printed as integers, normalized rates with one decimal.
    # A precomputed integer matrix holds counts whatever `normalize` says.
    show_counts = normalize is None or np.issubdtype(cm.dtype, np.integer)
    value_format = ".0f" if show_counts else ".1f"

    # Display colormap
    n_classes = cm.shape[0]
    im_ = plt.imshow(cm, interpolation="nearest", cmap=cmap)

    # Display values
    # Colors at both ends of the colormap, used for text so it contrasts with the cell.
    cmap_min, cmap_max = im_.cmap(0.0), im_.cmap(1.0)
    thresh = (cm.max() + cm.min()) / 2.0
    for i in tqdm(range(n_classes)):
        for j in range(n_classes):
            # Cells at or below 0.1 are left blank to keep large matrices readable.
            if cm[i, j] > 0.1:
                color = cmap_max if cm[i, j] < thresh else cmap_min
                text = format(cm[i, j], value_format)
                plt.text(j, i, text, ha="center", va="center", color=color)

    # Display legend
    plt.xlim(-0.5, n_classes - 0.5)
    # Increasing y-limits put row 0 at the bottom (imshow draws it at the top by default).
    plt.ylim(-0.5, n_classes - 0.5)
    if display_labels is not None:
        plt.xticks(np.arange(n_classes), display_labels)
        plt.yticks(np.arange(n_classes), display_labels)

    plt.ylabel("True label", fontsize=12)
    plt.xlabel("Predicted label", fontsize=12)

In [None]:
plt.figure(figsize=(50, 50))
plot_confusion_matrix(df['pred'], df['target'], display_labels=None)

Done!