**About** : This notebook runs inference with the trained models, ensembles them, and exports the ensemble to ONNX, TensorFlow and TFLite.

In [None]:
# %load_ext nb_black
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src/

## Initialization

### Imports

In [None]:
import os
import torch

print(torch.__version__)

# Restrict this process to physical GPU 2. The variable has to be set before the
# first CUDA call, which is why it precedes the device query below.
os.environ['CUDA_VISIBLE_DEVICES'] = "2"

# Guarded: querying device 0 raises when the GPU selected above is not visible,
# which would stop a "Run All" even though the models below are run on CPU.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

In [None]:
import os
import re
import cv2
import sys
import glob
import json
import time
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.metrics import *

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

In [None]:
from utils.logger import Config, upload_to_kaggle

from params import *
from data.dataset import SignDataset
from data.preparation import *

from model_zoo.models import define_model
from utils.metrics import *
from utils.torch import load_model_weights
from utils.plots import plot_sample
# from utils.plots import plot_confusion_matrix

# from inference.predict import predict, predict_tta
# from inference.main import *

## Experiments

In [None]:
# Experiment log folders to ensemble; the trailing comment on each entry is the
# author's label for that run.
# NOTE(review): the next cell reassigns both EXP_FOLDERS and EXP_FOLDER, so this
# selection has no effect when the notebook is run top to bottom.
EXP_FOLDERS = [
    "../logs/2023-03-29/5/",  # 384
#     "../logs/2023-03-30/3/",  # 512
    "../logs/2023-04-04/2/",  # 3 layers + interp
#     "../logs/2023-04-04/3/"   # 4 layers + interp
]

# Reference run: the first folder of the list.
EXP_FOLDER = EXP_FOLDERS[0]

In [None]:
# Experiment log folders whose out-of-fold predictions are averaged below.
# This assignment replaces the list defined in the previous cell.
EXP_FOLDERS = [
    "../logs/2023-04-12/2/",
    "../logs/2023-04-12/0/",
    "../logs/2023-04-11/31/",
    "../logs/2023-04-11/29/",
]

# Reference run: its config.json is the one loaded into `config` in the next cell.
EXP_FOLDER = EXP_FOLDERS[0]

In [None]:
# Load the hyper-parameters of the reference run. The context manager closes the
# file handle, which the previous `json.load(open(...))` form left open.
with open(EXP_FOLDER + "config.json", "r") as config_file:
    config = Config(json.load(config_file))

In [None]:
# Build the dataset dataframe (later cells read its `path` and `target` columns).
df = prepare_data(DATA_PATH, config.processed_folder)

In [None]:
# Attach the fold assignment only when the dataframe does not already carry one.
if "fold" not in df.columns:
    folds = pd.read_csv(config.folds_file)
    # Left merge: every row of df is kept, matched on participant and sequence ids.
    df = df.merge(folds, how="left", on=["participant_id", "sequence_id"])

In [None]:
# Blend the out-of-fold predictions of every selected run with a plain average.
oof_per_run = [np.load(folder + "pred_oof.npy") for folder in EXP_FOLDERS]
pred_oof = np.mean(oof_per_run, axis=0)

# Predicted class = arg-max over the last axis of the blended predictions.
df['pred'] = pred_oof.argmax(axis=-1)

score = accuracy(df['target'], pred_oof)
print(f"-> CV acc : {score:.4f}")

## Inference

### Preprocessing

In [None]:
ROWS_PER_FRAME = 543  # number of landmarks per frame


def load_relevant_data_subset(pq_path):
    """
    Reads a landmark parquet file and returns it with its coordinates as an array.

    Args:
        pq_path (str): Path to the parquet file.

    Returns:
        pandas DataFrame: The parquet file content.
        np array [n_frames x ROWS_PER_FRAME x 3]: x, y, z coordinates as float32.
    """
    frame_table = pd.read_parquet(pq_path)

    # Rows are grouped by frame, ROWS_PER_FRAME consecutive rows each.
    n_frames = len(frame_table) // ROWS_PER_FRAME

    coords = frame_table[['x', 'y', 'z']].values
    coords = coords.reshape(n_frames, ROWS_PER_FRAME, 3)
    return frame_table, coords.astype(np.float32)

In [None]:
# Row indices (within the ROWS_PER_FRAME rows of a frame) of the landmarks that are
# kept as-is, one list per group. The order of the groups matters: it defines the
# type id of each group through MAPPING below.
KEPT_LANDMARKS = [
    [468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488],  # left hand
    [522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542],  # right hand
    [10, 54, 67, 132, 150, 152, 162, 172, 176, 234, 284, 297, 361, 379, 389, 397, 400, 454],  # silhouette
    [13, 37, 40, 61, 78, 81, 84, 87, 88, 91, 191, 267, 270, 291, 308, 311, 314, 317, 318, 321, 415],  # lips
    [500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511], # arms
    [205, 425],  # cheeks
]
# Type id of each group, starting at 1 (1 = left hand, ..., 6 = cheeks); 0 is not used.
MAPPING = [i + 1 for i in range(len(KEPT_LANDMARKS))]

# Groups of landmarks that are each reduced to a single averaged point by
# Preprocessing.forward and appended after the kept landmarks.
TO_AVG = [
    [466, 387, 385, 398, 263, 390, 374, 381, 362],  # left_eye
    [246, 160, 158, 173, 33, 163, 145, 154, 133],  # presumably right_eye (unlabeled in the original) - TODO confirm
    [383, 293, 296, 285],  # left_eyebrow
    [156, 63, 66, 55],  # right_eyebrow
    [1, 2, 98, 327, 168],  # nose
]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Preprocessing(nn.Module):
    """
    Preprocessing of one raw landmark sequence, written with tensor operations only.

    It drops the frames without hand data, subsamples long sequences, selects and
    averages landmarks, normalizes the coordinates and appends the type and landmark
    ids. The module defines no trainable parameter: all tensors are plain attributes.
    """
    def __init__(self, type_embed, max_len=50, model_max_len=50):
        """
        Constructor.

        Args:
            type_embed (np array [n_landmarks]): Type id of each output landmark.
            max_len (int, optional): Length used to compute the subsampling step. Defaults to 50.
            model_max_len (int, optional): Maximum number of frames returned. Defaults to 50.
        """
        super(Preprocessing, self).__init__()

        # Type ids tiled over 1000 rows; forward keeps the first n_frames rows.
        self.type_embed = torch.from_numpy(type_embed[None, :].astype(np.float32))
        self.type_embed = self.type_embed.repeat(1000, 1)

        # Landmark ids 1..120, tiled the same way; forward keeps the first n_landmarks columns.
        self.landmark_embed = torch.tensor(np.arange(120)).float().unsqueeze(0) + 1
        self.landmark_embed = self.landmark_embed.repeat(1000, 1)
        
        # Kept landmark indices in group order (the sorted variant is disabled).
#         self.ids = torch.from_numpy(np.sort(np.concatenate(KEPT_LANDMARKS)))
        self.ids = torch.from_numpy(np.concatenate(KEPT_LANDMARKS))

        # One index tensor per group of landmarks to average.
        self.to_avg = [torch.tensor(avg) for avg in TO_AVG]

        # Indices of the two hand blocks, used to detect frames without hands.
        self.hands = torch.tensor(
            [468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488] + 
            [522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542]
        )
        
        # 1-based frame counter (1..1000).
        self.frames = torch.tensor(np.arange(1000) + 1)
        
        # Lengths are stored as 1-element tensors so they can be used in tensor expressions.
        self.max_len = torch.tensor([max_len])
        self.model_max_len = torch.tensor([model_max_len])

    def filter_sign(self, x):
        """
        Removes the frames where every hand coordinate is missing, then subsamples
        the sequence with a regular step when it is longer than max_len.

        Args:
            x (torch tensor [n_frames x n_points x 3]): Raw sequence.

        Returns:
            torch tensor [n_kept_frames x n_points x 3]: Filtered sequence.
        """
        # Proportion of NaN among the hand x-coordinates of each frame;
        # a frame is dropped only when that proportion is 1.
        hands = x[:, self.hands, 0]
        nan_prop = torch.isnan(hands).float().mean(-1)            
        x = x[torch.where(nan_prop < 1)[0]]

        # Number of remaining frames, read from the 1-based counter.
        # NOTE(review): .max() on an empty slice would fail if no frame is kept - confirm inputs.
        length = self.frames[:x.size(0)].max().unsqueeze(0)
        sz = torch.cat([length, self.max_len]).max()
        
        # Step is 1 when the sequence fits in max_len, else int(sz / max_len + 1).
        # Expressed without Python branching, presumably to stay traceable for export.
        divisor = (((sz - self.max_len) > 0) * (sz / self.max_len) + 1).int()
        # Keep the frames whose 1-based index is a multiple of the step.
        ids = (self.frames[:x.size(0)] % divisor) == 0
        return x[ids]
    
    def forward(self, x):
        """
        Preprocesses a raw sequence.

        Args:
            x (torch tensor [n_frames x n_points x 3]): Raw sequence.

        Returns:
            torch tensor [n_kept_frames x 5 x n_landmarks]: Per landmark, the type id,
                the 3 normalized coordinates and the landmark id.
        """
        x = self.filter_sign(x)
        n_frames = x.shape[0]     
        
        # Each TO_AVG group becomes one extra point: the mean of its landmarks.
        avg_ids = []
        for ids in self.to_avg:
            avg_id = x[:, ids].mean(1, keepdims=True)
            avg_ids.append(avg_id)

        # Kept landmarks first, averaged points last.
        x = torch.cat([x[:, self.ids]] + avg_ids, 1)

        type_embed = self.type_embed[:n_frames]
        landmark_embed = self.landmark_embed[:n_frames, :x.shape[1]]
        
        # Normalize & fill nans
        # Statistics are computed per coordinate over all non-NaN values of the sequence.
        # NOTE(review): the view(-1, 3) assumes a missing point has all 3 coordinates NaN - confirm.
        nonan = x[~torch.isnan(x)].view(-1, x.shape[-1])
        x = x - nonan.mean(0)[None, None, :]
        x = x / nonan.std(0, unbiased=False)[None, None, :]
        x[torch.isnan(x)] = 0

        # Concat
        # Last axis becomes [type, x, y, z, landmark id]; it is then moved to axis 1.
        x = torch.cat([
            type_embed.unsqueeze(-1), x, landmark_embed.unsqueeze(-1)
        ], -1).transpose(1, 2)
        
        # Keep at most model_max_len frames.
        x = x[:self.model_max_len]
        
        return x

In [None]:
# Type id of every landmark produced by the preprocessing, in output order.
landmarks = np.concatenate(KEPT_LANDMARKS)

type_chunks = []
for subset, idx in zip(KEPT_LANDMARKS, MAPPING):
    print(subset, idx)
    # One float id per landmark of the group.
    type_chunks.append(np.full(len(subset), idx, dtype=np.float64))

# The averaged points (one per TO_AVG group) come last and reuse the id of the
# last kept group, i.e. the value `idx` holds once the loop is over.
type_chunks.append(np.array([idx] * len(TO_AVG)))

type_embed = np.concatenate(type_chunks)

print("\nn_landmarks :", len(type_embed))

### Model

In [None]:
import onnx
import onnx_tf
import tensorflow as tf
import onnxruntime as rt
import tflite_runtime.interpreter as tflite

from onnx_tf.backend import prepare

In [None]:
from torch.nn import LayerNorm
from transformers.models.deberta_v2.modeling_deberta_v2 import StableDropout

class DebertaV2Output(nn.Module):
    """
    Output block plugged into the encoder layers of the model.

    It projects from config.intermediate_size to config.output_size, so the output
    width may differ from the input width. The residual input is added before the
    projection, then dropout and layer normalization are applied.
    """
    def __init__(self, config):
        """
        Constructor.

        Args:
            config: Encoder config providing intermediate_size, output_size,
                layer_norm_eps and hidden_dropout_prob.
        """
        super().__init__()
        self.config = config
        self.dense = nn.Linear(config.intermediate_size, config.output_size)
        self.LayerNorm = LayerNorm(config.output_size, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """
        Forward function.

        Args:
            hidden_states (torch tensor): Features coming from the layer.
            input_tensor (torch tensor): Residual input, same shape as hidden_states.

        Returns:
            torch tensor: Normalized features, last dimension of size config.output_size.
        """
        residual_sum = hidden_states + input_tensor
        projected = self.dropout(self.dense(residual_sum))
        return self.LayerNorm(projected)

In [None]:
import torch
import torch.nn as nn
from transformers import AutoConfig
from model_zoo.deberta import DebertaV2Encoder
from model_zoo.utils import add_shift
# from transformers.models.deberta_v2.modeling_deberta_v2 import DebertaV2Encoder
    
class Model(nn.Module):
    """
    Landmark sequence classifier.

    Each landmark is embedded from its type id, landmark id and coordinates. Per-frame
    MLPs summarize the hands, lips, face and full set of landmarks, and up to three
    single-layer encoder stages model the frame sequence before a linear classifier.
    """
    def __init__(
        self,
        type_embed,
        embed_dim=256,
        dense_dim=384,
        transfo_dim=768,
        transfo_layers=3,
        transfo_heads=1,
        num_classes=250,
        drop_rate=0,
        n_landmarks=100,
        max_len=50,
    ):
        """
        Constructor.

        Args:
            type_embed (np array): Not used by the constructor; only referenced by the
                disabled in-model preprocessing line at the end of this method.
            embed_dim (int, optional): Embedding / per-landmark feature size. Defaults to 256.
            dense_dim (int, optional): Output size of each landmark-group MLP. Defaults to 384.
            transfo_dim (int, optional): Requested encoder width; the first stage is
                narrower when several stages are used (see `delta` below). Defaults to 768.
            transfo_layers (int, optional): Number of encoder stages; at most 3 are built. Defaults to 3.
            transfo_heads (int, optional): Number of attention heads. Defaults to 1.
            num_classes (int, optional): Number of classes. Defaults to 250.
            drop_rate (float, optional): Dropout rate of the MLPs and of the first encoder
                stage; it is doubled for each following stage. Defaults to 0.
            n_landmarks (int, optional): Number of landmarks per frame. Defaults to 100.
            max_len (int, optional): Value written to the encoder config as
                max_relative_positions, position_buckets and max_len. Defaults to 50.
        """
        super().__init__()
        self.num_classes = num_classes
        self.num_classes_aux = 0
        self.transfo_heads = transfo_heads

        # Embeddings of the type ids (9 entries) and landmark ids (101 entries); id 0 is padding.
        self.type_embed = nn.Embedding(9, embed_dim, padding_idx=0)
        self.landmark_embed = nn.Embedding(101, embed_dim, padding_idx=0)
        self.type_norm = nn.LayerNorm(embed_dim)
        self.landmark_norm = nn.LayerNorm(embed_dim)

        # Input size 9: presumably add_shift expands the 3 coordinates to 9 features - TODO confirm.
        self.pos_dense = nn.Linear(9, embed_dim)
        # Fuses the type, landmark and position features of each landmark.
        self.dense = nn.Linear(3 * embed_dim, embed_dim)
        
        # Per-frame MLPs over the flattened features of a landmark group
        # (21 landmarks for each hand and the lips, 25 for the face, all of them for `full`).
        self.left_hand_mlp = nn.Sequential(
            nn.Linear(embed_dim * 21, dense_dim), 
            nn.BatchNorm1d(dense_dim),
            nn.Dropout(p=drop_rate),
            nn.LeakyReLU(),
        )

        self.right_hand_mlp = nn.Sequential(
            nn.Linear(embed_dim * 21 , dense_dim),
            nn.BatchNorm1d(dense_dim),
            nn.Dropout(p=drop_rate),
            nn.LeakyReLU(),
        )

        self.lips_mlp = nn.Sequential(
            nn.Linear(embed_dim * 21, dense_dim),
            nn.BatchNorm1d(dense_dim),
            nn.Dropout(p=drop_rate),
            nn.LeakyReLU(),
        )
        
        self.face_mlp = nn.Sequential(
            nn.Linear(embed_dim * 25, dense_dim),
            nn.BatchNorm1d(dense_dim),
            nn.Dropout(p=drop_rate),
            nn.LeakyReLU(),
        )
        
        self.full_mlp = nn.Sequential(
            nn.Linear(embed_dim * n_landmarks, dense_dim),
            nn.BatchNorm1d(dense_dim),
            nn.Dropout(p=drop_rate),
            nn.LeakyReLU(),
        )
        
        # Each stage after the first one widens the features by `delta`, so the first
        # stage starts delta * (transfo_layers - 1) below the requested width.
        delta = 128 if transfo_dim < 1000 else 256
        transfo_dim -= delta * (transfo_layers - 1)

        # Merges the 4 group summaries (full, hands, lips, face) into the first stage width.
        self.landmark_mlp = nn.Sequential(
            nn.Linear(dense_dim * 4, transfo_dim),
            nn.BatchNorm1d(transfo_dim),
            nn.Dropout(p=drop_rate),
            nn.LeakyReLU(),
        )
        
        # The pretrained config is only used as a template: sizes, depth, heads,
        # dropout, activation and position settings are all overwritten below.
        name = "microsoft/deberta-v3-base"
        
        config = AutoConfig.from_pretrained(name, output_hidden_states=True)
        config.hidden_size = transfo_dim
        config.intermediate_size = transfo_dim
        config.output_size = transfo_dim
        if transfo_layers >= 2:
            # The first stage outputs the width expected by the second one.
            config.output_size = transfo_dim + delta
        config.num_hidden_layers = 1
        config.num_attention_heads = transfo_heads
        config.attention_probs_dropout_prob = drop_rate
        config.hidden_dropout_prob = drop_rate
        config.hidden_act = "relu"
        config.max_relative_positions = max_len
        config.position_buckets = max_len
        config.max_len = max_len

        # Stage 1; its output block is replaced so that it can change the feature width.
        self.frame_transformer_1 = DebertaV2Encoder(config)
        self.frame_transformer_1.layer[0].output = DebertaV2Output(config)

        # Stage 2 (optional): wider by delta, dropout doubled. The same config object is
        # mutated in place between stages.
        self.frame_transformer_2 = None
        if transfo_layers >= 2:
            config.hidden_size += delta
            config.intermediate_size += delta
            if transfo_layers >= 3:
                config.output_size += delta
            config.attention_probs_dropout_prob *= 2
            config.hidden_dropout_prob *= 2
            self.frame_transformer_2 = DebertaV2Encoder(config)
            self.frame_transformer_2.layer[0].output = DebertaV2Output(config)

        # Stage 3 (optional): wider by delta again, dropout doubled again; its output
        # width is left unchanged.
        self.frame_transformer_3 = None
        if transfo_layers >= 3:
            config.hidden_size += delta
            config.intermediate_size += delta
            config.attention_probs_dropout_prob *= 2
            config.hidden_dropout_prob *= 2
            self.frame_transformer_3 = DebertaV2Encoder(config)
            self.frame_transformer_3.layer[0].output = DebertaV2Output(config)

        # Classifier on top of the output width of the last stage built.
        self.logits = nn.Linear(config.output_size, num_classes)
        
#         self.preprocess = Preprocessing(type_embed, max_len=min(50, max_len), model_max_len=max_len)

    
    def forward(self, x):
        """
        Forward function.

        Args:
            x (torch tensor [batch_size x n_frames x 5 x n_landmarks]): Preprocessed batch.
                Along axis 2: index 0 is the type id, 1 to 3 the coordinates and 4 the
                landmark id (layout implied by the slicing below).

        Returns:
            torch tensor [batch_size x num_classes]: logits.
        """
#         x = self.preprocess(x).unsqueeze(0)

        # NOTE(review): with the layout above, the third size is the channel axis; the
        # local `n_landmarks` is not used afterwards.
        bs, n_frames, n_landmarks, _ = x.size()    
#         mask = ((x != 0).sum(-1).sum(-1) != 0).float()

        # Embed the type ids (channel 0) and landmark ids (channel 4).
        x_type = self.type_norm(self.type_embed(x[:, :, 0].long()))
        x_landmark = self.landmark_norm(self.landmark_embed(x[:, :, 4].long()))
        
        # Coordinates (channels 1 to 3), moved to the last axis.
        x_pos = x[:, :, 1:4].transpose(2, 3).contiguous()
        
        x_pos = add_shift(x_pos)
        x_pos = self.pos_dense(x_pos)

        # One feature vector per landmark.
        fts = self.dense(torch.cat([x_type, x_landmark, x_pos], -1))

        n_fts = fts.size(-1)
        # Type ids of the first frame of each sample, repeated for every frame and
        # flattened so that they align with fts.view(-1, n_fts).
        embed = x[:, 0, 0].unsqueeze(1).repeat(1, n_frames, 1).view(-1).long()

        # Type 1: first landmark group (left hand in the order of the landmark lists).
        left_hand_fts = fts.view(-1, n_fts)[embed == 1].view(bs, n_frames, -1, n_fts)
        left_hand_fts = self.left_hand_mlp(left_hand_fts.view(bs * n_frames, -1))

        # Type 2: second landmark group (right hand).
        right_hand_fts = fts.view(-1, n_fts)[embed == 2].view(bs, n_frames, -1, n_fts)
        right_hand_fts = self.right_hand_mlp(right_hand_fts.view(bs * n_frames, -1))
        
        # Element-wise max over the two hand summaries.
        hand_fts = torch.stack([left_hand_fts, right_hand_fts], -1).max(-1).values

        # Type 4: lips.
        lips_fts = fts.view(-1, n_fts)[embed == 4].view(bs, n_frames, -1, n_fts)
        lips_fts = self.lips_mlp(lips_fts.view(bs * n_frames, -1))

        # Types 3 and 6 together form the face group consumed by face_mlp.
        face_fts = fts.view(-1, n_fts)[(embed == 3) | (embed == 6)].view(bs, n_frames, -1, n_fts)
        face_fts = self.face_mlp(face_fts.view(bs * n_frames, -1))

#         fts = fts.view(-1, n_fts).view(bs, n_frames, -1, n_fts)
        # All landmarks of a frame, flattened.
        fts = fts.view(bs * n_frames, -1)
    
        fts = self.full_mlp(fts)

        fts = torch.cat([fts, hand_fts, lips_fts, face_fts], -1)

        # One feature vector per frame, reshaped back to sequences.
        fts = self.landmark_mlp(fts)
        fts = fts.view(bs, n_frames, -1)

        fts = self.frame_transformer_1(fts).last_hidden_state
        if self.frame_transformer_2 is not None:
            fts = self.frame_transformer_2(fts).last_hidden_state
        if self.frame_transformer_3 is not None:
            fts = self.frame_transformer_3(fts).last_hidden_state

        # Mean pooling over the frames.
        fts = fts.mean(1)
        logits = self.logits(fts)
        return logits

In [None]:
# Preprocessing module for the ensemble; both length limits come from the current `config`.
prepro = Preprocessing(type_embed, max_len=config.max_len, model_max_len=config.max_len)

In [None]:
# Runs that make up the exported ensemble. This replaces the EXP_FOLDERS list
# assigned in the "Expes" section.
EXP_FOLDERS = [
    "../logs/2023-04-12/0/",
    "../logs/2023-04-11/29/",
    "../logs/2023-04-09/0/",
]

# EXP_FOLDERS = [
#     "../logs/2023-04-12/2/",
#     "../logs/2023-04-12/0/",
#     "../logs/2023-04-11/31/",
#     "../logs/2023-04-11/29/",
# ]

EXP_FOLDER = EXP_FOLDERS[0]

# NOTE(review): called without arguments here, while the earlier cell passes
# DATA_PATH and config.processed_folder - confirm the defaults are equivalent.
df = prepare_data()

if "fold" not in df.columns:
    # NOTE(review): hard-coded folds file; the earlier cell reads config.folds_file.
    folds = pd.read_csv("../input/folds_4.csv")
    df = df.merge(folds, how="left", on=["participant_id", "sequence_id"])
    
# Out-of-fold score of the ensemble: plain average of the runs' predictions.
pred_oof = np.mean([np.load(e + "pred_oof.npy") for e in EXP_FOLDERS], 0)
df['pred'] = pred_oof.argmax(-1)

score = accuracy(df['target'], pred_oof)
print(f"-> CV acc : {score:.4f}")

In [None]:
models = []

# Rebuild every run of the ensemble from its own config and load its weights.
# After the loop, `config` and `model` hold the values of the last run.
for exp_folder in EXP_FOLDERS:
    # Context manager so the config file handle is closed after reading.
    with open(exp_folder + "config.json", "r") as config_file:
        config = Config(json.load(config_file))

    model = Model(
        type_embed,
        embed_dim=config.embed_dim,
        dense_dim=config.dense_dim,
        transfo_dim=config.transfo_dim,
        transfo_layers=config.transfo_layers,
        transfo_heads=config.transfo_heads,
        drop_rate=config.drop_rate,
        num_classes=config.num_classes,
        max_len=config.max_len,
    ).cpu().eval()

    # The "fullfit" checkpoint of the run is the one exported.
    model = load_model_weights(model, exp_folder + f"{config.name}_fullfit_0.pt")

    models.append(model)

In [None]:
class EnsembleModel(nn.Module):
    """
    Preprocessing followed by an average of the class probabilities of several models.
    """
    def __init__(self, prepro, models):
        """
        Constructor.

        Args:
            prepro (nn.Module): Preprocessing applied to the raw input.
            models (list of nn.Module): Models to ensemble.
        """
        super().__init__()
        self.prepro = prepro
        self.models = nn.ModuleList(models)

    def forward(self, x):
        """
        Forward function.

        Args:
            x (torch tensor): Raw input of one sample, as expected by the preprocessing.

        Returns:
            torch tensor: Probabilities averaged over the models, with a leading batch axis of size 1.
        """
        # The preprocessing handles one sample; the models expect a batch axis.
        batch = self.prepro(x).unsqueeze(0)

        probas = []
        for member in self.models:
            probas.append(member(batch).softmax(-1))

        return torch.stack(probas, -1).mean(-1)

In [None]:
# Ensemble of the loaded models with the preprocessing included, on CPU and in eval mode.
model = EnsembleModel(prepro, models).cpu().eval()

In [None]:
preds = []
times = []

# Only the first 100 samples are scored; the commented loop runs the full dataset.
# Gradient tracking is disabled: this is pure inference, and building autograd
# graphs would add work to the forward pass that is being timed.
# for i in tqdm(range(len(df['path']))):
with torch.no_grad():
    for i in tqdm(range(100)):
        path = df['path'][i]

        # Only the coordinate array is needed here, not the parquet dataframe.
        data = load_relevant_data_subset(path)[1]

        x = torch.from_numpy(data)

        # Timed section: forward pass plus conversion of the output to numpy.
        t0 = time.time()
        y = model(x)
        preds.append(y.detach().cpu().numpy().flatten())
        t1 = time.time()

        times.append((t1 - t0) * 1000)  # milliseconds

In [None]:
# rel_pos = torch.from_numpy(np.load('rel_pos.npy').astype(np.int32))
# rel_pos.size()

In [None]:
# Mean latency per sample of the PyTorch ensemble; `times` is in milliseconds.
print(f'Runtime : {np.mean(times) :.1f}ms')

In [None]:
# Turn the list of flattened per-sample predictions into one array, one row per sample.
preds = np.stack(preds)

In [None]:
# Accuracy on the scored subset: targets are truncated to the number of predictions.
accuracy(df['target'].head(len(preds)), preds)

In [None]:
# accuracy(df['target'].head(len(preds)), pred_val[:len(preds)])

In [None]:
# Highest averaged probability for the last sample of the loop above (sanity check).
y.max()

#### ONNX

In [None]:
ENS_NAME = "1b2s"

# Every exported file of this ensemble is written to this folder.
OUT_FOLDER = f"../output/ens/{ENS_NAME}/"
os.makedirs(OUT_FOLDER, exist_ok=True)

In [None]:
def convert_to_onnx(model, config, onnx_file="model.onnx"):
    """
    Exports a model to ONNX, with a variable-length first input axis.

    Args:
        model (nn.Module): Model to export.
        config: Not used by the function.
        onnx_file (str, optional): Output path. Defaults to "model.onnx".
    """
    # Dummy input used to run the model during export: 100 frames of 543 points.
    dummy_input = torch.zeros((100, 543, 3))

    export_options = {
        "export_params": True,                      # store the trained weights in the file
        "opset_version": 12,                        # ONNX opset to export to
        "do_constant_folding": True,                # fold constants for optimization
        "input_names": ['inputs'],                  # name of the model input
        "output_names": ['outputs'],                # name of the model output
        "dynamic_axes": {'inputs': {0: 'length'}},  # axis 0 of the input is variable
        "verbose": True,
    }

    torch.onnx.export(model, dummy_input, onnx_file, **export_options)

In [None]:
# Export the ensemble (preprocessing included) to OUT_FOLDER.
convert_to_onnx(model, config, OUT_FOLDER + "model.onnx")

In [None]:
# Run the exported graph with onnxruntime on CPU as a sanity check.
# NOTE: `data` is whatever sample was loaded last by the inference loop above.
m = rt.InferenceSession(OUT_FOLDER + "model.onnx", providers=['CPUExecutionProvider'])
onnx_pred = m.run(["outputs"], {"inputs": data})
# Highest output value, to compare by eye with `y.max()` from the PyTorch model.
onnx_pred[0].max()

#### TensorFlow

In [None]:
# Reload the exported ONNX file and wrap it with the onnx-tf backend.
onnx_model = onnx.load(OUT_FOLDER + "model.onnx")
tf_rep = prepare(onnx_model)

#### TFLite

In [None]:
# Write the TensorFlow version of the model to the folder the TFLite converter reads from.
tf_rep.export_graph(OUT_FOLDER + "model_tf")

In [None]:
# Convert the exported TensorFlow model to TFLite, with the default optimizations
# and float16 as supported type.
saved_model_dir = OUT_FOLDER + "model_tf"
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)

converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.float16]

tflite_model = converter.convert()

# The converted model is written as raw bytes.
tflite_path = OUT_FOLDER + 'model.tflite'
with open(tflite_path, 'wb') as f:
    f.write(tflite_model)

In [None]:
# Load the converted model with the TFLite runtime.
interpreter = tflite.Interpreter(OUT_FOLDER + "model.tflite")

# Callable wrapper around the "serving_default" signature of the model.
prediction_fn = interpreter.get_signature_runner("serving_default")

# Sanity check on the last loaded sample (`data` comes from an earlier cell).
output = prediction_fn(inputs=data)
output['outputs'].max()

In [None]:
preds = []
times = []

# Same check as for the PyTorch ensemble, on the TFLite model: only the first
# 100 samples are scored; the commented loop runs the full dataset.
# for i in tqdm(range(len(df['path']))):
for i in tqdm(range(100)):
    path = df['path'][i]

    # Only the coordinate array is needed here, not the parquet dataframe.
    data = load_relevant_data_subset(path)[1]

    # Timed section: the TFLite call only.
    t0 = time.time()
    output = prediction_fn(inputs=data)
    t1 = time.time()

    preds.append(output['outputs'])
    times.append((t1 - t0) * 1000)  # milliseconds

In [None]:
# `preds` is a list of per-sample interpreter outputs. It is stacked into one
# 2-D array (one row per sample) before scoring, which is the form the PyTorch
# predictions have when they are passed to `accuracy` earlier in the notebook.
accuracy(df['target'].head(len(preds)), np.stack(preds).reshape(len(preds), -1))

In [None]:
# Mean latency per sample of the TFLite model, in milliseconds.
mean_runtime = np.mean(times)
print(f'-> Runtime : {mean_runtime :.1f}ms')

# Warn when the mean latency exceeds the 100 ms limit.
if mean_runtime > 100:
    print("\n WARNING ! Runtime must be < 100 ms !")

### Size & upload

In [None]:
# File size of the TFLite model, converted from bytes by dividing by 1024^2.
size = os.path.getsize(OUT_FOLDER + 'model.tflite') / np.power(1024, 2)
print(f"-> Model size : {size:.3f} Mo")

# Stops the notebook before the upload when the model exceeds the 40 Mo limit.
assert size < 40, "Model size must be < 40 Mo !"

In [None]:
# Upload the export folder with the project helper.
# NOTE(review): the second argument is an absolute, machine-specific path - confirm it exists before running.
upload_to_kaggle([OUT_FOLDER], "/workspace/datasets/islr_weights_1/", "ISLR Models", update_folders=False)

In [None]:
# !ls /workspace/datasets/islr_weights_1/

Done!