<a href="https://www.kaggle.com/code/regisvargas/warning-beginner-s-notebook?scriptVersionId=130451859" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

**Notebooks I'm using as references:** 

[LB 0.67] one pytorch transformer solution<br>
Z by HP Data Science Global Ambassador<br>
https://www.kaggle.com/code/hengck23/lb-0-67-one-pytorch-transformer-solution


ASLFR EDA + Preprocessing Dataset<br>
MARK WIJKHUIZEN<br>
https://www.kaggle.com/code/markwijkhuizen/aslfr-eda-preprocessing-dataset

**Discussions I'm using as references:**

CV Leaderboard<br>
MARK WIJKHUIZEN<br>
https://www.kaggle.com/competitions/asl-fingerspelling/discussion/411060

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sn
import tensorflow as tf

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, GroupShuffleSplit 

import glob
import sys
import os
import math
import gc
import sys
import sklearn
import time
import json

# TQDM Progress Bar With Pandas Apply Function
tqdm.pandas()



In [2]:
# If Notebook Is Run By Committing or In Interactive Mode For Development
IS_INTERACTIVE = os.environ['KAGGLE_KERNEL_RUN_TYPE'] == 'Interactive'
# Describe Statistics Percentiles
PERCENTILES = [0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99, 0.999]
# Global Random Seed
SEED = 42
# Number of Frames to resize recording to
N_TARGET_FRAMES = 256
# Global debug flag, takes subset of train
DEBUG = False

In [3]:
train = pd.read_csv('/kaggle/input/asl-fingerspelling/train.csv').head(5000)

In [4]:
# Get complete file path to file
def get_file_path(path):
    return f'/kaggle/input/asl-fingerspelling/{path}'

train['file_path'] = train['path'].apply(get_file_path)

In [5]:
def get_idxs(df, words_pos, words_neg=[], ret_names=True):
    idxs = []
    names = []
    for col_idx, col in enumerate(df.columns):
        # Check if column name contains all words
        if all([w in col for w in words_pos]) and all([w not in col for w in words_neg]):
            idxs.append(col_idx)
            names.append(col)
    # Convert to Numpy arrays
    idxs = np.array(idxs)
    names = np.array(names)
    # Returns either both column indices and names
    if ret_names:
        return idxs, names
    # Or only columns indices
    else:
        return idxs

In [6]:
# Read First Parquet File
example_parquet_df = pd.read_parquet(train['file_path'][0])

# Each parquet file contains 1000 recordings
print(f'# Unique Recording: {example_parquet_df.index.nunique()}')
# Display DataFrame layout
display(example_parquet_df.head())

# Unique Recording: 1000


Unnamed: 0_level_0,frame,x_face_0,x_face_1,x_face_2,x_face_3,x_face_4,x_face_5,x_face_6,x_face_7,x_face_8,...,z_right_hand_11,z_right_hand_12,z_right_hand_13,z_right_hand_14,z_right_hand_15,z_right_hand_16,z_right_hand_17,z_right_hand_18,z_right_hand_19,z_right_hand_20
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1816796431,0,0.710588,0.699951,0.705657,0.691768,0.699669,0.70198,0.709724,0.610405,0.71266,...,-0.245855,-0.269148,-0.129743,-0.251501,-0.278687,-0.26653,-0.152852,-0.257519,-0.275822,-0.266876
1816796431,1,0.709525,0.697582,0.703713,0.691016,0.697576,0.700467,0.709796,0.61654,0.713729,...,,,,,,,,,,
1816796431,2,0.711059,0.700858,0.706272,0.693285,0.700825,0.703319,0.711549,0.615606,0.715143,...,,,,,,,,,,
1816796431,3,0.712799,0.702518,0.70784,0.694899,0.702445,0.704794,0.712483,0.625044,0.715677,...,-0.37077,-0.408097,-0.185217,-0.325494,-0.343373,-0.328294,-0.203126,-0.315719,-0.326104,-0.314282
1816796431,4,0.712349,0.705451,0.709918,0.696006,0.70518,0.706928,0.712685,0.614356,0.714875,...,,,,,,,,,,


In [7]:
# Landmark Indices for Left/Right hand without z axis in raw data
LEFT_HAND_IDXS0, LEFT_HAND_NAMES0 = get_idxs(example_parquet_df, ['left_hand'], ['z'])
RIGHT_HAND_IDXS0, RIGHT_HAND_NAMES0 = get_idxs(example_parquet_df, ['right_hand'], ['z'])
COLUMNS = np.concatenate((LEFT_HAND_NAMES0, RIGHT_HAND_NAMES0))
N_COLS0 = len(COLUMNS)
# Only X/Y axes are used
N_DIMS0 = 2

print(f'N_COLS0: {N_COLS0}')

N_COLS0: 84


In [8]:
N_SAMPLES = len(train)
N_COLS0 = len(COLUMNS)

In [9]:
"""
    Tensorflow layer to process data in TFLite
    Data needs to be processed in the model itself, so we can not use Python
""" 
class PreprocessLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(PreprocessLayer, self).__init__()
        self.normalisation_correction = tf.constant(
                    # Add 0.50 to x coordinates of left hand (original right hand) and substract 0.50 of right hand (original left hand)
                     [0.50 if 'x' in name else 0.00 for name in LEFT_HAND_NAMES0],
                dtype=tf.float32,
            )
    
    @tf.function(
        input_signature=(tf.TensorSpec(shape=[None,N_COLS0], dtype=tf.float32),),
    )
    def call(self, data0):
        # Number of Frames in Video
        N_FRAMES0 = tf.shape(data0)[0]
        
        # Find dominant hand
        left_hand_sum = tf.math.reduce_sum(tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS, axis=1)), 0, 1))
        right_hand_sum = tf.math.reduce_sum(tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS, axis=1)), 0, 1))
        left_dominant = left_hand_sum >= right_hand_sum
        
        # Count non NaN Hand values in each frame
        if left_dominant:
            frames_hands_non_nan_sum = tf.math.reduce_sum(
                    tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS, axis=1)), 0, 1),
                    axis=[1],
                )
        else:
            frames_hands_non_nan_sum = tf.math.reduce_sum(
                    tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS, axis=1)), 0, 1),
                    axis=[1],
                )
        # Frames With Coordinates for hand
        non_empty_frames_idxs = tf.where(frames_hands_non_nan_sum > 0)
        non_empty_frames_idxs = tf.squeeze(non_empty_frames_idxs, axis=1)
        # Filter data on frames with coordinates for hand
        data = tf.gather(data0, non_empty_frames_idxs, axis=0)
        
        # Cast Indices in float32 to be compatible with Tensorflow Lite
        non_empty_frames_idxs = tf.cast(non_empty_frames_idxs, tf.float32)
        # Normalize to start with 0
        non_empty_frames_idxs -= tf.reduce_min(non_empty_frames_idxs)
        
        # Number of Frames in Filtered Video
        N_FRAMES = tf.shape(data)[0]
        
        # Gather Relevant Landmark Columns
        if left_dominant:
            data = tf.gather(data, LEFT_HAND_IDXS, axis=1)
        else:
            data = tf.gather(data, RIGHT_HAND_IDXS, axis=1)
            data = (
                    self.normalisation_correction + (
                        (data - self.normalisation_correction) * tf.where(self.normalisation_correction != 0, -1.0, 1.0))
                )
            
        # Fill NaN Values With 0
        data = tf.where(tf.math.is_nan(data), 0.0, data)
        # Resize Video
        data = tf.image.resize(
            data[:,:,tf.newaxis],
            [N_TARGET_FRAMES, N_COLS],
            method=tf.image.ResizeMethod.BILINEAR,
            antialias=False,
        )
        data = tf.squeeze(data, axis=[2])
        # Resize Non Empty Frame Indices
        non_empty_frames_idxs = tf.image.resize(
            non_empty_frames_idxs[:,tf.newaxis, tf.newaxis],
            [N_TARGET_FRAMES, 1],
            method=tf.image.ResizeMethod.BILINEAR,
            antialias=False,
        )
        non_empty_frames_idxs = tf.squeeze(non_empty_frames_idxs, axis=[1,2])
        
        return data, non_empty_frames_idxs

    
preprocess_layer = PreprocessLayer()

In [10]:
# Landmark Indices in subset of dataframe with only COLUMNS selected
LEFT_HAND_IDXS = np.argwhere(np.isin(COLUMNS, LEFT_HAND_NAMES0)).squeeze()
RIGHT_HAND_IDXS = np.argwhere(np.isin(COLUMNS, RIGHT_HAND_NAMES0)).squeeze()
N_COLS = LEFT_HAND_IDXS.size
# Only X/Y axes are used
N_DIMS = 2

print(f'N_COLS: {N_COLS}')

N_COLS: 42


In [11]:
# Split Phrase To Char Tuple
train['phrase_char'] = train['phrase'].apply(tuple)
# Character Length of Phrase
train['phrase_char_len'] = train['phrase_char'].apply(len)

# Maximum Input Length
MAX_PHRASE_LENGTH = train['phrase_char_len'].max()
print(f'MAX_PHRASE_LENGTH: {MAX_PHRASE_LENGTH}')

MAX_PHRASE_LENGTH: 31


In [12]:
# Use Set to keep track of unique characters in phrases
UNIQUE_CHARACTERS = set()

for phrase in tqdm(train['phrase_char']):
    for c in phrase:
        UNIQUE_CHARACTERS.add(c)
        
# Sorted Unique Character
UNIQUE_CHARACTERS = np.array(sorted(UNIQUE_CHARACTERS))
# Number of Unique Characters
N_UNIQUE_CHARACTERS = len(UNIQUE_CHARACTERS)
print(f'N_UNIQUE_CHARACTERS: {N_UNIQUE_CHARACTERS}')

  0%|          | 0/5000 [00:00<?, ?it/s]

N_UNIQUE_CHARACTERS: 51


In [13]:
# Target Arrays Processed Input Videos
X = np.zeros([N_SAMPLES, N_TARGET_FRAMES, N_COLS], dtype=np.float32)
# Frame Indices
NON_EMPTY_FRAME_IDXS = np.zeros([N_SAMPLES, N_TARGET_FRAMES], dtype=np.uint16)
# Ordinally Encoded Target With value 59 for pad token
y = np.full(shape=[N_SAMPLES, MAX_PHRASE_LENGTH], fill_value=N_UNIQUE_CHARACTERS, dtype=np.int8)

# Train DataFrame indexed by sequence_id to convenientlyy lookup recording data
train_squence_id = train.set_index('sequence_id')

In [14]:
# Read Character to Ordinal Encoding Mapping
with open('/kaggle/input/asl-fingerspelling/character_to_prediction_index.json') as json_file:
    CHAR2ORD = json.load(json_file)
    
# Character to Ordinal Encoding Mapping   
display(pd.Series(CHAR2ORD).to_frame('Ordinal Encoding'))

Unnamed: 0,Ordinal Encoding
,0
!,1
#,2
$,3
%,4
&,5
',6
(,7
),8
*,9


In [15]:
# Number of Unique Characters
N_UNIQUE_CHARACTERS = len(CHAR2ORD)
print(f'N_UNIQUE_CHARACTERS: {N_UNIQUE_CHARACTERS}')

N_UNIQUE_CHARACTERS: 59


In [16]:
# All Unique Parquet Files
UNIQUE_FILE_PATHS = pd.Series(train['file_path'].unique())
# Counter to keep track of sample
row = 0

# Fill Arrays
for idx, file_path in enumerate(tqdm(UNIQUE_FILE_PATHS)):
    df = pd.read_parquet(file_path)
    for group, group_df in df.groupby('sequence_id'):
        # Get Processed Frames and non empty frame indices
        data, non_empty_frames_idxs = preprocess_layer(group_df[COLUMNS].values)
        X[row] = data
        NON_EMPTY_FRAME_IDXS[row] = non_empty_frames_idxs
        # Add Target By Ordinally Encoding Characters
        phrase_char = train_squence_id.loc[group, 'phrase_char']
        for col, char in enumerate(phrase_char):
            y[row, col] = CHAR2ORD.get(char)
            
        row += 1

  0%|          | 0/5 [00:00<?, ?it/s]

In [17]:
# Example target, note the phrase is padded with the pad token 59
print(f'Example Target: {y[0]}')

Example Target: [18  0 34 49 36 36 42 39 46 52 50 36 51 51 51 51 51 51 51 51 51 51 51 51
 51 51 51 51 51 51 51]


In [18]:
# Save X/y
np.save('X.npy', X)
np.save('y.npy', y)
np.save('NON_EMPTY_FRAME_IDXS.npy', NON_EMPTY_FRAME_IDXS)
# Save Validation
splitter = GroupShuffleSplit(test_size=0.10, n_splits=2, random_state=SEED)
PARTICIPANT_IDS = train['participant_id'].values
train_idxs, val_idxs = next(splitter.split(X, y, groups=PARTICIPANT_IDS))

# Save Train
np.save('X_train.npy', X[train_idxs])
np.save('y_train.npy', y[train_idxs])
np.save('NON_EMPTY_FRAME_IDXS_TRAIN.npy', NON_EMPTY_FRAME_IDXS[train_idxs])
# Save Validation
np.save('X_val.npy', X[val_idxs])
np.save('y_val.npy', y[val_idxs])
np.save('NON_EMPTY_FRAME_IDXS_VAL.npy', NON_EMPTY_FRAME_IDXS[val_idxs])
# Verify Train/Val is correctly split by participan id
print(f'Patient ID Intersection Train/Val: {set(PARTICIPANT_IDS[train_idxs]).intersection(PARTICIPANT_IDS[val_idxs])}')
# Train/Val Sizes
print(f'# Train Samples: {len(train_idxs)}, # Val Samples: {len(val_idxs)}')

Patient ID Intersection Train/Val: set()
# Train Samples: 4452, # Val Samples: 548


In [19]:
X_train, X_test, y_train, y_test = X[train_idxs], X[val_idxs], y[train_idxs], y[val_idxs]

So far, I've only adjusted the code developed by MARK WIJKHUIZEN, the link to MARK WIJKHUIZEN's work is found at the beginning of this notebook, now it seems to me that the work would be a prediction.

**Honestly, I don't have a clue on how to proceed... If anyone can help me out, I'd appreciate it.**