In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import pyarrow.parquet as pq

In [2]:
MAX_SEQ_LEN = 135 #135 for 95th percentile , or 90 for 90th percentile
#pad sequences with length less than max_seq_length with value -1
#truncate sequences with length more than max_seq_length to max_seq_length
PAD_VALUE = -1

In [3]:
#dataset4 is the dataset obtained from step 2
dataset6 = []
for dirname, _, filenames in os.walk('dataset6/asl-signs/train_landmark_files'):
    for filename in filenames:
        if filename.endswith('.parquet'):
            dataset6.append(os.path.join(dirname, filename))
dataset6

['dataset6/asl-signs/train_landmark_files/4718/1160474191.parquet',
 'dataset6/asl-signs/train_landmark_files/4718/1187990396.parquet',
 'dataset6/asl-signs/train_landmark_files/4718/2604668083.parquet',
 'dataset6/asl-signs/train_landmark_files/4718/2266974533.parquet',
 'dataset6/asl-signs/train_landmark_files/4718/3835935597.parquet',
 'dataset6/asl-signs/train_landmark_files/4718/2057287272.parquet',
 'dataset6/asl-signs/train_landmark_files/4718/3488774387.parquet',
 'dataset6/asl-signs/train_landmark_files/4718/3210689405.parquet',
 'dataset6/asl-signs/train_landmark_files/4718/2905175954.parquet',
 'dataset6/asl-signs/train_landmark_files/4718/1996140943.parquet',
 'dataset6/asl-signs/train_landmark_files/4718/2516296285.parquet',
 'dataset6/asl-signs/train_landmark_files/4718/2462592703.parquet',
 'dataset6/asl-signs/train_landmark_files/4718/1977311846.parquet',
 'dataset6/asl-signs/train_landmark_files/4718/592543759.parquet',
 'dataset6/asl-signs/train_landmark_files/4718/38

In [7]:
truncate_methods = ['starting', 'ending', 'middle', 'random sampling', 'uniform sampling']

In [8]:
def pad_end(data, max_length, pad_value):
    rows_to_add = max_length - len(data)
    padded_data = {}
    for column in data.columns:
        padded_data[column] = [pad_value] * rows_to_add
    return data.append(pd.DataFrame(padded_data))

In [9]:
import random

def truncate_start(data, max_length):
    return data.iloc[:max_length, :]

def truncate_end(data, max_length):
    return data.iloc[-max_length:,:]

def truncate_middle(data, max_length):
    if len(data)<=max_length:
        return data
    else:
        mid = len(data)//2
        start = mid - (max_length // 2)
        end = start + max_length
        return data.iloc[start:end,:]
    
def truncate_random_sampling(data, max_length):
    if len(data) <= max_length:
        return data
    else:
        start = random.randint(0, len(data) - max_length)
        end = start + max_length
        return data.iloc[start:end,:]

def truncate_uniform(data, max_length, pad_value):
    uniform_samples = pd.DataFrame(columns=data.columns)

    for i in range(0, len(data), round(len(data)/MAX_SEQ_LEN)):
        uniform_samples = uniform_samples.append(data.iloc[i])

    return pad_end(uniform_samples, max_length, pad_value)

In [17]:
def pad_or_truncate_data(data, max_length, pad_value):
    if len(data)>max_length:
        return truncate_uniform(data, max_length, pad_value)
    elif len(data)<max_length:
        return pad_end(data, max_length, pad_value)
    else:
        return data

SAVE PADDED AND TRUNCATED DATA

In [18]:
input_dir = 'dataset6/asl-signs/train_landmark_files'
output_dir = 'dataset4/asl-signs/train_landmark_files'
for directory in tqdm(dataset6):
    # extract the common part of the file path
    path_parts = directory.split("/")
    frame_num = path_parts[-2]
    file_name = path_parts[-1]
    input_file_path = os.path.join(input_dir, frame_num, file_name)
    output_file_path = os.path.join(output_dir, frame_num, file_name)
    
    # read the parquet file into a pandas dataframe
    df = pd.read_parquet(directory)
    
    # preprocess the data
    new_data = pad_or_truncate_data(df, MAX_SEQ_LEN, pad_value = PAD_VALUE)
    
    # save the data to a new directory
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    new_data.to_parquet(output_file_path)

100%|██████████| 94477/94477 [1:46:36<00:00, 14.77it/s]  


This preprocessed data can be used for training all the three models: Transformer, LSTM, RNN