# Extracting frames and landmarks from dataset

In [74]:
# %load_ext cudf.pandas

In [75]:
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import pandas as pd
import mediapipe as mp
import cv2
import matplotlib.pyplot as plt
from pymongo import MongoClient
import gc
import tensorflow as tf

In [76]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
holistic = mp_holistic.Holistic(static_image_mode = False,
                                model_complexity = 2)

In [77]:
connection = MongoClient('localhost', 27017)
db = connection['mydb']
collection = db['Sign_Language_Final_Data']

In [78]:
cursor = collection.find({})
df = pd.DataFrame(list(cursor))

In [79]:
df.head()

Unnamed: 0,_id,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,source,split,url,variation_id,video_id,is_available
0,671b7bc2c6201c92805b4f99,book,"[385, 37, 885, 720]",25,-1,1,0,118,aslbrick,train,http://aslbricks.org/New/ASL-Videos/book.mp4,0,v_id_69241,True
1,671b7bc2c6201c92805b4f9a,book,"[462, 44, 949, 720]",25,-1,1,10,31,signschool,train,https://signstock.blob.core.windows.net/signsc...,0,v_id_07069,True
2,671b7bc2c6201c92805b4f9b,book,"[234, 17, 524, 414]",25,-1,1,17,36,startasl,train,https://s3-us-west-1.amazonaws.com/files.start...,0,v_id_07068,True
3,671b7bc2c6201c92805b4f9c,book,"[131, 26, 526, 480]",25,-1,1,22,59,asldeafined,train,https://media.asldeafined.com/vocabulary/14666...,0,v_id_07070,True
4,671b7bc2c6201c92805b4f9d,book,"[162, 54, 528, 400]",25,-1,1,24,12,aslsearch,val,http://www.aslsearch.com/signs/videos/book.mp4,0,v_id_07099,True


In [80]:
df.drop(columns = ['_id', 'source', 'url'], inplace = True)

In [81]:
df.head()

Unnamed: 0,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,split,variation_id,video_id,is_available
0,book,"[385, 37, 885, 720]",25,-1,1,0,118,train,0,v_id_69241,True
1,book,"[462, 44, 949, 720]",25,-1,1,10,31,train,0,v_id_07069,True
2,book,"[234, 17, 524, 414]",25,-1,1,17,36,train,0,v_id_07068,True
3,book,"[131, 26, 526, 480]",25,-1,1,22,59,train,0,v_id_07070,True
4,book,"[162, 54, 528, 400]",25,-1,1,24,12,val,0,v_id_07099,True


In [82]:
df['video_id'] = df['video_id'].apply(lambda id: id.replace('v_id_', ''))

In [83]:
df.head()

Unnamed: 0,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,split,variation_id,video_id,is_available
0,book,"[385, 37, 885, 720]",25,-1,1,0,118,train,0,69241,True
1,book,"[462, 44, 949, 720]",25,-1,1,10,31,train,0,7069,True
2,book,"[234, 17, 524, 414]",25,-1,1,17,36,train,0,7068,True
3,book,"[131, 26, 526, 480]",25,-1,1,22,59,train,0,7070,True
4,book,"[162, 54, 528, 400]",25,-1,1,24,12,val,0,7099,True


In [84]:
len(df)

11980

In [85]:
df['frame_start'].unique().tolist()

[1, 2, 3]

In [86]:
df['frame_end'].unique().tolist()

[-1]

In [87]:
df['split'].unique().tolist()

['train', 'val', 'test']

In [88]:
(df['split'] == 'train').sum()

8313

In [89]:
(df['split'] == 'test').sum()

1414

In [90]:
(df['split'] == 'val').sum()

2253

In [91]:
df['gloss'].nunique()

1999

In [92]:
def count_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()


    return frame_count

### Normalizing the Landmark points
- It is a really important step which allows use to generated similar landmark pssotion values for same gestures performed in different orientation by introducing
    - Position Invariance
    - Scale Invariance

In [93]:
def normalize_landmarks(x , y, z , x_min , y_min , x_max , y_max , f_width , f_height):
  
  # Handle different types of landmark inputs
  # if hasattr(landmarks, 'landmark'):
  #       landmark_list = [lm for lm in landmarks.landmark]  # For face_landmarks
  # else:
  #       landmark_list = landmarks  # For hand landmarks that are already a list
 

  

  width = x_max - x_min
  height = y_max - y_min
    
  norm_x = ((x *f_width) - x_min)/(width)
  norm_y = ((y *f_height) - y_min)/(height)
  norm_z = z

  normalized = (norm_x , norm_y , norm_z)
  
  return normalized

In [94]:
def format_frames(frame , output_size , x_min , y_min , x_max , y_max , f_width , f_height):
  
  n_width , n_height = output_size

  # Along with resizing the frames we need to rescale the bounding box values

  width_scale_factor = (n_width / f_width )
  height_scale_factor = (n_height / f_height)

  n_x_min = x_min * width_scale_factor
  n_y_min =  y_min * height_scale_factor
  n_x_max = x_max * width_scale_factor
  n_y_max = y_max * height_scale_factor

  # frame = tf.image.convert_image_dtype(frame , tf.float32)

  frame = cv2.resize(frame , output_size)

  return frame ,  n_x_min , n_y_min , n_x_max , n_y_max



  

In [95]:
# def extract_landmarks(video_id, frame_start, frame_end , label , bb_data):

#     #Creating a video path 
    
#     video_path = rf'C:\Users\Sahil\Desktop\Talkwithhands dataset\versions\5\videos\{video_id}.mp4'

#     cap = cv2.VideoCapture(video_path)

#     if not cap.isOpened():
#       print(f"The video {video_id} failed to open")
#       return None
    
#     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
#     frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

#     x_min , y_min , x_max , y_max = bb_data
    
#     total_frames = count_frames(video_path)

#     if frame_end == -1:
#         frame_end = total_frames - 1

#     landmarks_sequence = []
    
#     cap.set(cv2.CAP_PROP_POS_FRAMES, frame_start)

#     while cap.isOpened() and frame_start <= frame_end:
#         current_frame = int(cap.get(cv2.CAP_PROP_POS_FRAMES))

#         if current_frame > frame_end:
#             break
        
#         ret, frame = cap.read()
#         if not ret:
#             break

#         #Crop the image using bounding box 
#         cropped_frame = frame[y_min:y_max , x_min:x_max]

#         image_rgb = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2RGB)

#         results = holistic.process(image_rgb)

#         frame_landmarks = {} #stores face and hand landmarks for a frame

#         if results.face_landmarks:
#             frame_landmarks["face"] = normalize_landmarks(results.face_landmarks,x_min , y_min , x_max , y_max ,frame_width , frame_height)
#         if results.left_hand_landmarks:
#             frame_landmarks["left_hand"] = normalize_landmarks(results.left_hand_landmarks.landmark,x_min , y_min , x_max , y_max,frame_width , frame_height)
#         if results.right_hand_landmarks:
#             frame_landmarks["right_hand"] = normalize_landmarks(results.right_hand_landmarks,x_min , y_min , x_max , y_max,frame_width , frame_height)
        
        
#         landmarks_sequence.append(frame_landmarks)

#     cap.release()

#     #Creating a dict element for each video 
#     video_landmark_dict_element = {
#         'landmarks' : landmarks_sequence , 
#         'label': label
#     }


#     return video_landmark_dict_element

In [96]:
def extract_landmarks(video_id, frame_start, frame_end , label , bb_data  ):

    #Creating a video path 
    
    video_path = rf'C:\Users\Sahil\Desktop\Talkwithhands dataset\versions\5\videos\{video_id}.mp4'

    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
      print(f"The video {video_id} failed to open")
      return None
    
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))


    x_min , y_min , x_max , y_max = bb_data
    
    total_frames = count_frames(video_path)

    if frame_end == -1:
        frame_end = total_frames - 1

    landmarks_sequence = []
    
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_start)
      
    sequence_length = 30 

    skip_frames_window = max(int(total_frames/sequence_length) , 1)

    #Iteratig through the video to extract fixed number of frames
    for frame_counter in range(sequence_length):
        current_frame = cap.set(cv2.CAP_PROP_POS_FRAMES , frame_counter * skip_frames_window)

        if current_frame > frame_end:
            break
        
        ret, frame = cap.read()
        if not ret:
            break

        #Resizing and scaling bouding box

        frame , n_x_min , n_y_min , n_x_max , n_y_max = format_frames(frame , output_size=(224 , 224) , x_min=x_min ,y_min= y_min ,x_max= x_max , y_max=y_max ,f_width= frame_width ,f_height= frame_height)

        #Crop the image using bounding box 
        cropped_frame = frame[int(n_y_min):int(n_y_max) ,int(n_x_min):int(n_x_max)]

        image_rgb = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2RGB)

        results = holistic.process(image_rgb)

        frame_landmarks = {} #stores face and hand landmarks for a frame

        if results.face_landmarks:
            frame_landmarks["face"] = [normalize_landmarks(lm.x, lm.y, lm.z , n_x_min , n_y_min , n_x_max , n_y_max , f_width= 224, f_height=224) for lm in results.face_landmarks.landmark]
        
        else: 
            frame_landmarks['face'] = [( 0 , 0 , 0) for _ in range(468)]

        if results.left_hand_landmarks:
            frame_landmarks["left_hand"] = [normalize_landmarks(lm.x, lm.y, lm.z , n_x_min , n_y_min , n_x_max , n_y_max , f_width=224 , f_height=224) for lm in results.left_hand_landmarks.landmark]

        else:
            frame_landmarks['left_hand'] = [( 0 , 0 , 0) for _ in range(21)]

        if results.right_hand_landmarks:
            frame_landmarks["right_hand"] = [normalize_landmarks(lm.x, lm.y, lm.z , n_x_min , n_y_min , n_x_max , y_max , f_width=224 , f_height=224) for lm in results.right_hand_landmarks.landmark]
        
        else:
            frame_landmarks['right_hand'] = [( 0 , 0 , 0) for _ in range(21)]
        
        landmarks_sequence.append(frame_landmarks)
    
    while len(landmarks_sequence) < sequence_length:

        zero_landmarks = { 
            'face' : [(0 , 0 , 0) for _ in range(468)],
            'left_hand' : [(0 , 0 , 0) for _ in range(21)],
            'right_hand' : [(0 , 0 , 0) for _ in range(21) ]
        
        }

        landmarks_sequence.append(zero_landmarks)

    cap.release()

    #Creating a dict element for each video 
    video_landmark_dict_element = {
        'landmarks' : landmarks_sequence , 
        'label': label
    }


    return video_landmark_dict_element

In [97]:
batch_df = df[:100]

In [98]:
batch_df

Unnamed: 0,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,split,variation_id,video_id,is_available
0,book,"[385, 37, 885, 720]",25,-1,1,0,118,train,0,69241,True
1,book,"[462, 44, 949, 720]",25,-1,1,10,31,train,0,07069,True
2,book,"[234, 17, 524, 414]",25,-1,1,17,36,train,0,07068,True
3,book,"[131, 26, 526, 480]",25,-1,1,22,59,train,0,07070,True
4,book,"[162, 54, 528, 400]",25,-1,1,24,12,val,0,07099,True
...,...,...,...,...,...,...,...,...,...,...,...
95,candy,"[774, 153, 1499, 1080]",25,-1,1,8,39,val,0,08918,True
96,candy,"[647, 60, 1538, 1065]",25,-1,1,9,4,train,1,08919,True
97,candy,"[675, 65, 1529, 1060]",25,-1,1,10,4,train,0,08920,True
98,candy,"[66, 24, 572, 480]",25,-1,1,11,17,train,1,08921,True


In [99]:
#Defining an empty list to store data of each video as a dict element
video_data = []

# Applying the pre-processing function to every record

batch_df.progress_apply(lambda record : video_data.append(
 extract_landmarks(
  video_id= record['video_id'],
  frame_start= record['frame_start'],
  frame_end= record['frame_end'],
  label= record['gloss'],
  bb_data = record['bbox']
 )
) , axis = 1)


100%|██████████| 100/100 [07:17<00:00,  4.37s/it]


0     None
1     None
2     None
3     None
4     None
      ... 
95    None
96    None
97    None
98    None
99    None
Length: 100, dtype: object

In [100]:
type(video_data)

list

In [101]:
len(video_data)

100

In [102]:
type(video_data[0])

dict

In [103]:
preprocessed_batch_df = pd.DataFrame(video_data)

In [104]:
preprocessed_batch_df

Unnamed: 0,landmarks,label
0,"[{'face': [(0.528167724609375, 0.2578373367231...",book
1,"[{'face': [(0.242395827902416, 0.2020864317403...",book
2,"[{'face': [(0.4664744738874764, 0.219961448490...",book
3,"[{'face': [(0.479362738886966, 0.2433891842543...",book
4,"[{'face': [(0.492517174267378, 0.0870498852922...",book
...,...,...
95,"[{'face': [(0.09501079164702321, 0.16571470140...",candy
96,"[{'face': [(0.20198555292371412, 0.22020006891...",candy
97,"[{'face': [(0.08536609051099148, 0.21951115430...",candy
98,"[{'face': [(0.5128556353301399, 0.295628729619...",candy


In [105]:
preprocessed_data_dict = preprocessed_batch_df.to_dict("records")

Writing the preprocessed batch data to mongoDB


In [106]:
# Establishing connection

connection = MongoClient('localhost' , 27017)
db = connection['mydb']
collection = db['Batch_Data']

In [107]:
# inserting the data in the form of dictionary

collection.insert_many(preprocessed_data_dict)

InsertManyResult([ObjectId('6729ae6b068e5f1de5e9010e'), ObjectId('6729ae6b068e5f1de5e9010f'), ObjectId('6729ae6b068e5f1de5e90110'), ObjectId('6729ae6b068e5f1de5e90111'), ObjectId('6729ae6b068e5f1de5e90112'), ObjectId('6729ae6b068e5f1de5e90113'), ObjectId('6729ae6b068e5f1de5e90114'), ObjectId('6729ae6b068e5f1de5e90115'), ObjectId('6729ae6b068e5f1de5e90116'), ObjectId('6729ae6b068e5f1de5e90117'), ObjectId('6729ae6b068e5f1de5e90118'), ObjectId('6729ae6b068e5f1de5e90119'), ObjectId('6729ae6b068e5f1de5e9011a'), ObjectId('6729ae6b068e5f1de5e9011b'), ObjectId('6729ae6b068e5f1de5e9011c'), ObjectId('6729ae6b068e5f1de5e9011d'), ObjectId('6729ae6b068e5f1de5e9011e'), ObjectId('6729ae6b068e5f1de5e9011f'), ObjectId('6729ae6b068e5f1de5e90120'), ObjectId('6729ae6b068e5f1de5e90121'), ObjectId('6729ae6b068e5f1de5e90122'), ObjectId('6729ae6b068e5f1de5e90123'), ObjectId('6729ae6b068e5f1de5e90124'), ObjectId('6729ae6b068e5f1de5e90125'), ObjectId('6729ae6b068e5f1de5e90126'), ObjectId('6729ae6b068e5f1de5e901

In [108]:
# #Defining an empty list to store data of each video as a dict element
# video_data = []

# # Applying the pre-processing function to every record

# df.progress_apply(lambda record : video_data.append(
#  extract_landmarks(
#   video_id= record['video_id'],
#   frame_start= record['frame_start'],
#   frame_end= record['frame_end'],
#   label= record['gloss'],
#   bb_data = record['bbox']
#  )
# ) , axis = 1)


In [109]:
print(df.columns)

Index(['gloss', 'bbox', 'fps', 'frame_end', 'frame_start', 'instance_id',
       'signer_id', 'split', 'variation_id', 'video_id', 'is_available'],
      dtype='object')
