## Preprocessing the Dataset
This notebook processes the entirety of the IPN Hand Dataset(30Gb) into hand crafted preprocesed features extracted using the skeleton from Mediapipe Hands Skeleton Model. It will be divided into 5 batches to avoid Memory Errors due to RAM limitations

In [2]:
import cv2
import mediapipe as mp
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import time
%matplotlib inline

#### Mediapipe Hands Skeleton Model

In [3]:
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
hands = mp_hands.Hands(
    max_num_hands=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5)

#### Dividing Annotation List into 5 batches

In [3]:
# Load the Main Annotation File
base_path = 'Dataset'
main_annotation_file = 'Annotations/Annot_List.txt'

with open(main_annotation_file,'r') as file:
    lines = file.readlines()

#removing header
header =lines[0]
data = lines[1:]

#read batch directories
batch_dirs = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]

# Process each batch directory
for x in batch_dirs:
    batch_path = os.path.join(base_path, x, 'frames')
    video_names = [d for d in os.listdir(batch_path) if os.path.isdir(os.path.join(batch_path, d))]
    video_names_set = set(video_names)
    print(f"{x}:{len(video_names)} Videos Found")

    #look up all the annotations in data found in video_names
    batch_annotations = [line for line in data if line.split(',')[0] in video_names_set]

    #writing to output file
    batch_annotation_file = f"Annotations/{x}_Annot_List.txt"
    with open(batch_annotation_file, 'w') as file:
        file.write(header)  # Write the header
        file.writelines(batch_annotations)  # Write the filtered data
print("Annotation Processing Complete")


Batch1:40 Videos Found
Batch2:40 Videos Found
Batch3:40 Videos Found
Batch4:40 Videos Found
Batch5:40 Videos Found
Annotation Processing Complete


#### Extracting Label, Start Frames, and End Frames for All Batches

In [4]:
Batches_Dataset =[]
for i in range(1,6):
    batch_annotation = f"Annotations/Batch{i}_Annot_List.txt"
    
    with open(batch_annotation,'r') as file:
        lines = file.readlines()
    
    #removing header
    data = lines[1:]
    batch_dataset = {}

    for x in data:
        temp = x.strip().split(',')
        filename = temp[0]
        label = temp[1]
        start_frame = int(temp[3])
        end_frame = int(temp[4])
        
        #append data
        if filename in batch_dataset:
            batch_dataset[filename][0].append(label)
            batch_dataset[filename][1].append(start_frame)
            batch_dataset[filename][2].append(end_frame)
        else:
            batch_dataset[filename] = [[label], [start_frame], [end_frame]]
    
    print(f"Batch{i}:{batch_dataset}")
    Batches_Dataset.append(batch_dataset)

Batch1:{'1CM1_4_R_#229': [['D0X', 'G11', 'B0B', 'G04', 'B0B', 'G05', 'B0B', 'G03', 'B0A', 'D0X', 'G02', 'B0A', 'D0X', 'G08', 'B0A', 'G06', 'B0B', 'G10', 'B0B', 'D0X', 'G09', 'B0A', 'G07', 'B0A', 'G01', 'D0X'], [1, 18, 56, 285, 309, 503, 545, 858, 900, 1123, 1433, 1458, 1708, 2043, 2078, 2351, 2392, 2604, 2647, 2795, 2994, 3030, 3233, 3278, 3646, 3677], [17, 55, 284, 308, 502, 544, 857, 899, 1122, 1432, 1457, 1707, 2042, 2077, 2350, 2391, 2603, 2646, 2794, 2993, 3029, 3232, 3277, 3645, 3676, 3751]], '1CM1_4_R_#230': [['D0X', 'G10', 'B0B', 'G11', 'B0A', 'G03', 'B0B', 'G09', 'B0A', 'G08', 'B0B', 'D0X', 'G07', 'B0B', 'D0X', 'G02', 'B0B', 'D0X', 'G04', 'B0A', 'G01', 'B0A', 'G06', 'B0A', 'G05', 'D0X'], [1, 25, 66, 348, 382, 580, 617, 861, 902, 1150, 1209, 1543, 1795, 1838, 2117, 2415, 2445, 2726, 2808, 2865, 2974, 3007, 3224, 3277, 3584, 3633], [24, 65, 347, 381, 579, 616, 860, 901, 1149, 1208, 1542, 1794, 1837, 2116, 2414, 2444, 2725, 2807, 2864, 2973, 3006, 3223, 3276, 3583, 3632, 3684]], 

#### Folder Images Helper Function

In [5]:
import os

def image_path_func(folder_path):
    image_path_list =[]
    # Iterate over all files in the folder
    for filename in os.listdir(folder_path):
        # Check if the file is an image file
        if filename.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
            # Construct the full path to the image file
            
            image_path = os.path.join(folder_path, filename)
            image_path_list.append(image_path)
    return image_path_list

### Features Helper Function

In [6]:
def get_features(idx,res=None):
        joint = np.zeros((21,4))

        #populating joint matrix with x,y,z, and visibility
        if(res != None):
                for j, lm in enumerate(res.landmark):
                        joint[j] = [lm.x, lm.y, lm.z, lm.visibility]

        #Compute limb angles between joints
        v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19], :3] #Parent Joint
        v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], :3] #Child Joint
        v = v2 - v1 #20x3
        #Normalize
        v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]
        #Get angle in between vectors using dot product
        angle = np.arccos(np.einsum('nt,nt->n',
        v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:], 
        v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:])) # [15,]

        angle = np.degrees(angle) # Convert radian to degree
                
        angle_label = np.array([angle], dtype=np.float32)
        angle_label = np.append(angle_label, idx)
        #d is a flattened (x,y,z,angle) from landmark 0 to landmark 20 and the label which is the index
        d = np.concatenate([joint.flatten(), angle_label])
        return(d)

#### Extracting Raw Skeletal Features: Limb Angles, Joint Velocity

In [7]:
GESTURES = ['G01','G02','G03','G04','G05','G06','G07','G08','G09','G10','G11','B0A','B0B']

for z in range(0,5):

    for y in GESTURES:
        os.makedirs(os.path.join(f'Dataset/Batch{z+1}/raw_skeleton', y), exist_ok=True)

    batch_data = Batches_Dataset[z]
    counter2=0
    for x in batch_data:
        counter2 += 1
        counter = 0
        folder_path = f"Dataset/Batch{z+1}/frames/{x}"
        labels = batch_data[x][0]
        start_frames = batch_data[x][1]
        end_frames = batch_data[x][2]
        image_path_list = image_path_func(folder_path)
        
        for label in labels:
            counter+=1
            if(label in GESTURES):
                start = start_frames[labels.index(label)]
                end = end_frames[labels.index(label)]
                idx = GESTURES.index(label)
                data = []
                #process from start to end of the gesture sequence
                for i in range(start,end):
                    image_path = image_path_list[i]
                    image = cv2.imread(image_path)
                    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    result = hands.process(image_rgb)

                    if result.multi_hand_landmarks is not None:
                        for res in result.multi_hand_landmarks:
                            data.append(get_features(idx,res))
    
            data =np.array(data)

            print(f"Batch {z+1}:Folder {counter2}:({x},{label}):Processed: {(i/end_frames[-1])*100}%    Shape:{data.shape}")
            np.save(f'Dataset/Batch{z+1}/raw_skeleton/{label}/raw_{label}_{x}_{counter}', data)


Batch 1:Folder 1:(1CM1_4_R_#229,D0X):Processed: 0.13329778725673155%    Shape:(1173,)
Batch 1:Folder 1:(1CM1_4_R_#229,G11):Processed: 1.4396161023727005%    Shape:(37, 100)
Batch 1:Folder 1:(1CM1_4_R_#229,B0B):Processed: 7.544654758731005%    Shape:(227, 100)
Batch 1:Folder 1:(1CM1_4_R_#229,G04):Processed: 8.184484137563317%    Shape:(23, 100)
Batch 1:Folder 1:(1CM1_4_R_#229,B0B):Processed: 7.544654758731005%    Shape:(227, 100)
Batch 1:Folder 1:(1CM1_4_R_#229,G05):Processed: 14.476139696081045%    Shape:(41, 100)
Batch 1:Folder 1:(1CM1_4_R_#229,B0B):Processed: 7.544654758731005%    Shape:(227, 100)
Batch 1:Folder 1:(1CM1_4_R_#229,G03):Processed: 23.940282591308986%    Shape:(41, 100)
Batch 1:Folder 1:(1CM1_4_R_#229,B0A):Processed: 29.88536390295921%    Shape:(222, 100)
Batch 1:Folder 1:(1CM1_4_R_#229,D0X):Processed: 29.88536390295921%    Shape:(222, 100)
Batch 1:Folder 1:(1CM1_4_R_#229,G02):Processed: 38.816315649160224%    Shape:(23, 100)
Batch 1:Folder 1:(1CM1_4_R_#229,B0A):Processe

#### Concatenating Each and Reshaping to Sequences

In [4]:
for z in range(0,5):
    folders_path = f'Dataset/Batch{z+1}/raw_skeleton'
    # Iterate over each folder (G02 to G11)
    print("CONCATENATING")
    for folder_name in os.listdir(folders_path):
        folder_path = os.path.join(folders_path, folder_name)
        
        folder_arrays = []
        # Iterate over each file in the folder
        for index, file_name in enumerate(os.listdir(folder_path)):
            
            file_path = os.path.join(folder_path, file_name)
            
            # Load the array from the file
            array = np.load(file_path)
            
            # Append the array to folder_arrays if it's not empty
            if array.size != 0:
                folder_arrays.append(array)
        
        # Concatenate arrays from the current folder and save
        if folder_arrays:
            concatenated_array = np.concatenate(folder_arrays, axis=0)
            np.save(os.path.join(f'Dataset/Batch{z+1}/proc_skeleton', f'{folder_name}_full.npy'), concatenated_array)
            print(f'{folder_name} done processing')
        else:
            print(f'No arrays to concatenate in folder {folder_name}')

    print("SEQUENCING")
    folders_path = f'Dataset/Batch{z+1}/proc_skeleton'
    seq_length = 65
    for file_name in os.listdir(folders_path):
        file_path = os.path.join(folders_path, file_name)
        data = np.load(file_path)

        full_seq_data = []
        for seq in range(len(data) - seq_length):
            full_seq_data.append(data[seq:seq + seq_length])

        full_seq_data = np.array(full_seq_data)
        print(full_seq_data.shape)
        np.save(os.path.join(f'Dataset/Batch{z+1}/seq_skeleton',f'seq_{file_name}'), full_seq_data)

CONCATENATING
B0A done processing
B0B done processing
G01 done processing
G02 done processing
G03 done processing
G04 done processing
G05 done processing
G06 done processing
G07 done processing
G08 done processing
G09 done processing
G10 done processing
G11 done processing
SEQUENCING
(42593, 65, 100)
(40981, 65, 100)
(1604, 65, 100)
(1839, 65, 100)
(2092, 65, 100)
(2131, 65, 100)
(2240, 65, 100)
(2105, 65, 100)
(2421, 65, 100)
(2169, 65, 100)
(2012, 65, 100)
(2071, 65, 100)
(2232, 65, 100)
CONCATENATING
B0A done processing
B0B done processing
G01 done processing
G02 done processing
G03 done processing
G04 done processing
G05 done processing
G06 done processing
G07 done processing
G08 done processing
G09 done processing
G10 done processing
G11 done processing
SEQUENCING
(48796, 65, 100)
(48734, 65, 100)
(1681, 65, 100)
(1734, 65, 100)
(1954, 65, 100)
(1962, 65, 100)
(2103, 65, 100)
(2015, 65, 100)
(2338, 65, 100)
(2038, 65, 100)
(2035, 65, 100)
(2087, 65, 100)
(2097, 65, 100)
CONCATENAT