## Preprocessing the Dataset Created
- Normalizing the dataset
- Converting the labels into one-hot encoded form

In [32]:

from tensorflow.keras.utils import to_categorical
import numpy as np
import os
from tqdm import tqdm
tqdm.pandas()
from pymongo import MongoClient
import pandas as pd

In [33]:
from Create_dataset import actions , no_of_videos , no_of_frames , DATA_PATH

In [34]:
# File Path to store the landmark data

DATA_PATH = os.path.join("LANDMARK_DATA")

# Defining the actions 

actions = np.array(['hello' , 'thankyou' , 'iloveyou' , 'sorry' ,'A' , 'B' , 'C' , 'D' , 'E' , 'F'])
# Defining the number of videos per action

no_of_videos = 100

# Defining the number of frames per video

no_of_frames = 30

In [35]:
for num , label in enumerate(actions):
 print(num , label)

0 hello
1 thankyou
2 iloveyou
3 sorry
4 A
5 B
6 C
7 D
8 E
9 F


In [36]:
actions

array(['hello', 'thankyou', 'iloveyou', 'sorry', 'A', 'B', 'C', 'D', 'E',
       'F'], dtype='<U8')

In [37]:
# Creating Label Map 

label_map = {label:num for num , label in enumerate(actions)}

In [38]:
label_map

{'hello': 0,
 'thankyou': 1,
 'iloveyou': 2,
 'sorry': 3,
 'A': 4,
 'B': 5,
 'C': 6,
 'D': 7,
 'E': 8,
 'F': 9}

In [39]:
 # importing the keypoint data stored as numpy arrays locally

video_data , labels = [] , []

for action in actions:
  for video in range(no_of_videos):

   window = []

   for frame_num in range(no_of_frames):
     res = np.load(os.path.join(DATA_PATH , action , str(video) , "{}.npy".format(frame_num)))
     window.append(res)
   
   video_data.append(window)
   labels.append(label_map[action])

NOTE : The key points extracted from mediapipe are already normalized between 0-1

In [40]:
np.array(video_data).shape

(1000, 30, 1662)

In [41]:
video_data

[[array([ 0.76913083,  0.36508131, -1.72161901, ...,  0.        ,
          0.        ,  0.        ]),
  array([ 0.58526409,  0.36181429, -1.094607  , ...,  0.        ,
          0.        ,  0.        ]),
  array([ 0.56363392,  0.35014695, -1.03521442, ...,  0.        ,
          0.        ,  0.        ]),
  array([ 0.5798704 ,  0.35038581, -1.04793489, ...,  0.        ,
          0.        ,  0.        ]),
  array([ 0.58473909,  0.35049546, -1.12020528, ...,  0.        ,
          0.        ,  0.        ]),
  array([ 0.58352721,  0.35064778, -1.16638708, ...,  0.        ,
          0.        ,  0.        ]),
  array([ 0.57935637,  0.35047883, -1.14591491, ...,  0.        ,
          0.        ,  0.        ]),
  array([ 0.57646435,  0.35072193, -1.12760258, ...,  0.        ,
          0.        ,  0.        ]),
  array([ 0.57990861,  0.35209304, -1.07343411, ...,  0.        ,
          0.        ,  0.        ]),
  array([ 0.58016616,  0.3522177 , -1.05635047, ...,  0.        ,
       

In [42]:
np.array(labels).shape

(1000,)

In [43]:
X = np.array(video_data)

In [44]:
X.shape

(1000, 30, 1662)

In [45]:
X

array([[[ 0.76913083,  0.36508131, -1.72161901, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.58526409,  0.36181429, -1.094607  , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.56363392,  0.35014695, -1.03521442, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.48076433,  0.39766535, -0.78489423, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.48214206,  0.4095569 , -0.83610785, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.48313302,  0.41441044, -0.85652077, ...,  0.        ,
          0.        ,  0.        ]],

       [[ 0.48280469,  0.41313297, -0.9219079 , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.48468822,  0.40871626, -1.33471262, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.48701519,  0.40479678, -1.40227401, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.51010078,  0.34224483, -0.99797851, ...,  

In [46]:
y = to_categorical(labels).astype(int)

In [47]:
y.shape

(1000, 10)

In [48]:
len(X)

1000

In [49]:
preprocessed_df = pd.DataFrame({"Landmarks" : video_data , "Labels" : labels})

In [50]:
preprocessed_df.head()

Unnamed: 0,Landmarks,Labels
0,"[[0.7691308259963989, 0.3650813102722168, -1.7...",0
1,"[[0.48280468583106995, 0.4131329655647278, -0....",0
2,"[[0.5053147077560425, 0.3372049331665039, -0.9...",0
3,"[[0.44065794348716736, 0.3381093442440033, -1....",0
4,"[[0.4760499894618988, 0.32905495166778564, -1....",0


In [51]:
for x in (preprocessed_df["Landmarks"][0]):
 print(type(x))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [52]:
# Converting each frame which is array form to list form for each video to insert into MongoDB as mongoDB requires data to be in the form of lists
preprocessed_df['Landmarks'] = preprocessed_df["Landmarks"].progress_apply(lambda landmarks: [arr.tolist() if isinstance(arr , np.ndarray) else arr for arr in landmarks] )

100%|██████████| 1000/1000 [00:11<00:00, 88.69it/s]


In [53]:
type(preprocessed_df['Landmarks'][0][0])

list

In [54]:
records = preprocessed_df.to_dict(orient='records')

In [55]:
type(records[1]['Landmarks'])

list

In [56]:
# Estabishing MongoDB connection

connection = MongoClient('localhost' , 27017)
db = connection['mydb']
collection = db['Preprocessed_Landmark_Data']

In [57]:
collection.insert_many(records)

InsertManyResult([ObjectId('675c2f1500d474e9f96ed265'), ObjectId('675c2f1500d474e9f96ed266'), ObjectId('675c2f1500d474e9f96ed267'), ObjectId('675c2f1500d474e9f96ed268'), ObjectId('675c2f1500d474e9f96ed269'), ObjectId('675c2f1500d474e9f96ed26a'), ObjectId('675c2f1500d474e9f96ed26b'), ObjectId('675c2f1500d474e9f96ed26c'), ObjectId('675c2f1500d474e9f96ed26d'), ObjectId('675c2f1500d474e9f96ed26e'), ObjectId('675c2f1500d474e9f96ed26f'), ObjectId('675c2f1500d474e9f96ed270'), ObjectId('675c2f1500d474e9f96ed271'), ObjectId('675c2f1500d474e9f96ed272'), ObjectId('675c2f1500d474e9f96ed273'), ObjectId('675c2f1500d474e9f96ed274'), ObjectId('675c2f1500d474e9f96ed275'), ObjectId('675c2f1500d474e9f96ed276'), ObjectId('675c2f1500d474e9f96ed277'), ObjectId('675c2f1500d474e9f96ed278'), ObjectId('675c2f1500d474e9f96ed279'), ObjectId('675c2f1500d474e9f96ed27a'), ObjectId('675c2f1500d474e9f96ed27b'), ObjectId('675c2f1500d474e9f96ed27c'), ObjectId('675c2f1500d474e9f96ed27d'), ObjectId('675c2f1500d474e9f96ed2