# Extracting frames and landmarks from dataset

In [290]:
# %load_ext cudf.pandas

In [291]:
import gc
import os

import cv2
import matplotlib.pyplot as plt
import mediapipe as mp
import numpy as np
import pandas as pd
from pymongo import MongoClient
from tqdm import tqdm

tqdm.pandas()

In [292]:
# Short aliases for the MediaPipe solution modules used in this notebook.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# A single Holistic instance shared by every later cell: non-static
# (video-stream) mode, model complexity 2.
holistic = mp_holistic.Holistic(
    static_image_mode=False,
    model_complexity=2,
)

In [293]:
# Connect to the MongoDB server on localhost and select the collection that
# holds the dataset records.
connection = MongoClient(host='localhost', port=27017)
db = connection['mydb']
collection = db['Sign_Language_Final_Data']

In [294]:
# Fetch every document in the collection (empty filter) and materialise the
# cursor into a list so pandas can build the frame in one pass.
cursor = collection.find({})
records = list(cursor)
df = pd.DataFrame(records)

In [295]:
df.head()

Unnamed: 0,_id,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,source,split,url,variation_id,video_id,is_available
0,671b7bc2c6201c92805b4f99,book,"[385, 37, 885, 720]",25,-1,1,0,118,aslbrick,train,http://aslbricks.org/New/ASL-Videos/book.mp4,0,v_id_69241,True
1,671b7bc2c6201c92805b4f9a,book,"[462, 44, 949, 720]",25,-1,1,10,31,signschool,train,https://signstock.blob.core.windows.net/signsc...,0,v_id_07069,True
2,671b7bc2c6201c92805b4f9b,book,"[234, 17, 524, 414]",25,-1,1,17,36,startasl,train,https://s3-us-west-1.amazonaws.com/files.start...,0,v_id_07068,True
3,671b7bc2c6201c92805b4f9c,book,"[131, 26, 526, 480]",25,-1,1,22,59,asldeafined,train,https://media.asldeafined.com/vocabulary/14666...,0,v_id_07070,True
4,671b7bc2c6201c92805b4f9d,book,"[162, 54, 528, 400]",25,-1,1,24,12,aslsearch,val,http://www.aslsearch.com/signs/videos/book.mp4,0,v_id_07099,True


In [296]:
# Drop the columns that are not needed for landmark extraction.
# Rebinding `df` with errors='ignore' (instead of an in-place drop) keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises KeyError.
df = df.drop(columns=['_id', 'source', 'url'], errors='ignore')

In [297]:
df.head()

Unnamed: 0,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,split,variation_id,video_id,is_available
0,book,"[385, 37, 885, 720]",25,-1,1,0,118,train,0,v_id_69241,True
1,book,"[462, 44, 949, 720]",25,-1,1,10,31,train,0,v_id_07069,True
2,book,"[234, 17, 524, 414]",25,-1,1,17,36,train,0,v_id_07068,True
3,book,"[131, 26, 526, 480]",25,-1,1,22,59,train,0,v_id_07070,True
4,book,"[162, 54, 528, 400]",25,-1,1,24,12,val,0,v_id_07099,True


In [298]:
# Strip the literal 'v_id_' marker from every video id.
# The vectorised string accessor replaces the row-wise lambda (whose parameter
# shadowed the builtin `id`); regex=False keeps this a plain substring replace.
df['video_id'] = df['video_id'].str.replace('v_id_', '', regex=False)

In [299]:
df.head()

Unnamed: 0,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,split,variation_id,video_id,is_available
0,book,"[385, 37, 885, 720]",25,-1,1,0,118,train,0,69241,True
1,book,"[462, 44, 949, 720]",25,-1,1,10,31,train,0,7069,True
2,book,"[234, 17, 524, 414]",25,-1,1,17,36,train,0,7068,True
3,book,"[131, 26, 526, 480]",25,-1,1,22,59,train,0,7070,True
4,book,"[162, 54, 528, 400]",25,-1,1,24,12,val,0,7099,True


In [300]:
len(df)

11980

In [301]:
df['frame_start'].unique().tolist()

[1, 2, 3]

In [302]:
df['frame_end'].unique().tolist()

[-1]

In [303]:
df['split'].unique().tolist()

['train', 'val', 'test']

In [304]:
(df['split'] == 'train').sum()

8313

In [305]:
(df['split'] == 'test').sum()

1414

In [306]:
(df['split'] == 'val').sum()

2253

In [307]:
df['gloss'].nunique()

1999

In [308]:
def count_frames(video_path):
    """Return the frame count OpenCV reports for the video at ``video_path``.

    The value of ``cv2.CAP_PROP_FRAME_COUNT`` is read from a freshly opened
    capture, the capture is released, and the value is returned as an ``int``.
    """
    capture = cv2.VideoCapture(video_path)
    reported_total = capture.get(cv2.CAP_PROP_FRAME_COUNT)
    capture.release()
    return int(reported_total)

### Normalizing the Landmark points
- This is an important step: it lets us generate similar landmark position values for the same gesture performed at different positions and scales in the frame, by introducing
    - Position Invariance
    - Scale Invariance

In [309]:
# def normalize_landmarks(landmarks , x_min , y_min , x_max , y_max , f_width , f_height):
  
#   # Handle different types of landmark inputs
#   if hasattr(landmarks, 'landmark'):
#         landmark_list = [lm for lm in landmarks.landmark]  # For face_landmarks
#   else:
#         landmark_list = landmarks  # For hand landmarks that are already a list
 

#   normalized = []

#   width = x_max - x_min
#   height = y_max - y_min

#   for lm in landmarks:
    
#     norm_x = ((lm.x *f_width) - x_min)/(width)
#     norm_y = ((lm.y *f_height) - y_min)/(height)
#     norm_z = lm.z

#     normalized.append((norm_x , norm_y , norm_z))
  
#   return normalized

In [310]:
def normalize_landmarks(landmarks, x_min, y_min, x_max, y_max, f_width, f_height):
    """Re-express frame-relative landmarks relative to a bounding box.

    Each landmark's x/y is first scaled to pixels using the frame size and
    then mapped onto the bounding box:

        norm_x = (x * f_width  - x_min) / (x_max - x_min)
        norm_y = (y * f_height - y_min) / (y_max - y_min)

    The z component is passed through unchanged.

    Args:
        landmarks: Either an object exposing a ``.landmark`` iterable whose
            items have ``.x``, ``.y`` and ``.z`` attributes, or an iterable of
            indexable ``(x, y, z)`` triples.
        x_min, y_min, x_max, y_max: Bounding box corners, in pixels.
        f_width, f_height: Size of the frame the x/y values are relative to,
            in pixels.

    Returns:
        A list of ``(norm_x, norm_y, norm_z)`` tuples, one per input landmark,
        in input order.

    Raises:
        ValueError: If the bounding box has zero or negative width or height
            (the normalisation would divide by zero or flip the axes).
    """
    box_width = x_max - x_min
    box_height = y_max - y_min
    if box_width <= 0 or box_height <= 0:
        raise ValueError(
            f"Invalid bounding box ({x_min}, {y_min}, {x_max}, {y_max}): "
            "width and height must be positive"
        )

    # Reduce both supported input shapes to plain (x, y, z) triples so the
    # arithmetic below is written only once.
    if hasattr(landmarks, 'landmark'):
        points = ((lm.x, lm.y, lm.z) for lm in landmarks.landmark)
    else:
        points = ((lm[0], lm[1], lm[2]) for lm in landmarks)

    return [
        (
            (x * f_width - x_min) / box_width,
            (y * f_height - y_min) / box_height,
            z,
        )
        for x, y, z in points
    ]

In [311]:
# def extract_landmarks(video_id, frame_start, frame_end , label , bb_data):

#     #Creating a video path 
    
#     video_path = rf'C:\Users\Sahil\Desktop\Talkwithhands dataset\versions\5\videos\{video_id}.mp4'

#     cap = cv2.VideoCapture(video_path)

#     if not cap.isOpened():
#       print(f"The video {video_id} failed to open")
#       return None
    
#     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
#     frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

#     x_min , y_min , x_max , y_max = bb_data
    
#     total_frames = count_frames(video_path)

#     if frame_end == -1:
#         frame_end = total_frames - 1

#     landmarks_sequence = []
    
#     cap.set(cv2.CAP_PROP_POS_FRAMES, frame_start)

#     while cap.isOpened() and frame_start <= frame_end:
#         current_frame = int(cap.get(cv2.CAP_PROP_POS_FRAMES))

#         if current_frame > frame_end:
#             break
        
#         ret, frame = cap.read()
#         if not ret:
#             break

#         #Crop the image using bounding box 
#         cropped_frame = frame[y_min:y_max , x_min:x_max]

#         image_rgb = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2RGB)

#         results = holistic.process(image_rgb)

#         frame_landmarks = {} #stores face and hand landmarks for a frame

#         if results.face_landmarks:
#             frame_landmarks["face"] = normalize_landmarks(results.face_landmarks,x_min , y_min , x_max , y_max ,frame_width , frame_height)
#         if results.left_hand_landmarks:
#             frame_landmarks["left_hand"] = normalize_landmarks(results.left_hand_landmarks.landmark,x_min , y_min , x_max , y_max,frame_width , frame_height)
#         if results.right_hand_landmarks:
#             frame_landmarks["right_hand"] = normalize_landmarks(results.right_hand_landmarks,x_min , y_min , x_max , y_max,frame_width , frame_height)
        
        
#         landmarks_sequence.append(frame_landmarks)

#     cap.release()

#     #Creating a dict element for each video 
#     video_landmark_dict_element = {
#         'landmarks' : landmarks_sequence , 
#         'label': label
#     }


#     return video_landmark_dict_element

In [312]:
# Default directory holding the clips as ``<video_id>.mp4``. Pass ``video_dir``
# to extract_landmarks to read the videos from somewhere else.
VIDEO_DIR = r'C:\Users\Sahil\Desktop\Talkwithhands dataset\versions\5\videos'


def _landmark_triples(landmark_list):
    """Turn a MediaPipe landmark list into a list of ``(x, y, z)`` tuples."""
    return [(lm.x, lm.y, lm.z) for lm in landmark_list.landmark]


def extract_landmarks(video_id, frame_start, frame_end, label, bb_data, video_dir=VIDEO_DIR):
    """Run MediaPipe Holistic over one clip and collect per-frame landmarks.

    Frames from ``frame_start`` through ``frame_end`` are read, cropped to the
    bounding box, converted BGR -> RGB and passed to the module-level
    ``holistic`` instance. The stored coordinates are the raw ``x``/``y``/``z``
    values MediaPipe reports for the cropped image.

    Args:
        video_id: Identifier used to build the file name ``<video_id>.mp4``.
        frame_start: Index of the first frame to process. It is handed to
            OpenCV's ``CAP_PROP_POS_FRAMES`` as-is.
        frame_end: Index of the last frame to process (inclusive); ``-1``
            means "up to the last frame reported by OpenCV".
        label: Value stored under ``'label'`` in the result.
        bb_data: Bounding box as ``(x_min, y_min, x_max, y_max)`` used to crop
            every frame.
        video_dir: Directory containing the video files. Defaults to
            ``VIDEO_DIR``.

    Returns:
        ``None`` if the video cannot be opened; otherwise a dict with
        ``'landmarks'`` (one dict per processed frame, holding any of the keys
        ``'face'``, ``'left_hand'``, ``'right_hand'`` that were detected) and
        ``'label'``.
    """
    video_path = os.path.join(video_dir, f'{video_id}.mp4')

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"The video {video_id} failed to open")
            return None

        x_min, y_min, x_max, y_max = bb_data

        if frame_end == -1:
            # Read the count from the capture that is already open instead of
            # opening the file a second time.
            frame_end = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1

        landmarks_sequence = []

        # NOTE(review): frame_start is used as an OpenCV frame index without
        # adjustment -- confirm whether the dataset's indices are 0- or 1-based.
        if frame_start <= frame_end:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_start)

            # CAP_PROP_POS_FRAMES is the index of the frame the next read()
            # will return, so stop once it moves past frame_end.
            while int(cap.get(cv2.CAP_PROP_POS_FRAMES)) <= frame_end:
                ret, frame = cap.read()
                if not ret:
                    break

                # Crop to the bounding box, then convert to the RGB layout
                # passed to holistic.process.
                cropped_frame = frame[y_min:y_max, x_min:x_max]
                image_rgb = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2RGB)

                # NOTE(review): `holistic` is one shared instance created with
                # static_image_mode=False -- confirm that carrying its state
                # from one clip to the next is acceptable.
                results = holistic.process(image_rgb)

                # Only the parts that were detected in this frame get a key.
                frame_landmarks = {}
                detections = (
                    ("face", results.face_landmarks),
                    ("left_hand", results.left_hand_landmarks),
                    ("right_hand", results.right_hand_landmarks),
                )
                for part, detected in detections:
                    if detected:
                        frame_landmarks[part] = _landmark_triples(detected)

                landmarks_sequence.append(frame_landmarks)

        return {
            'landmarks': landmarks_sequence,
            'label': label,
        }
    finally:
        # Always free the capture, including when a frame fails mid-way.
        cap.release()

In [313]:
# One entry per dataframe row, in row order: the dict returned by
# extract_landmarks, or None when the video could not be opened.
video_data = []

# An explicit loop replaces DataFrame.progress_apply, which was used only for
# the side effect of list.append and left a throw-away Series of None as the
# cell output. tqdm still shows the progress bar.
for _, record in tqdm(df.iterrows(), total=len(df)):
    video_data.append(
        extract_landmarks(
            video_id=record['video_id'],
            frame_start=record['frame_start'],
            frame_end=record['frame_end'],
            label=record['gloss'],
            bb_data=record['bbox'],
        )
    )


  1%|          | 88/11980 [11:24<25:41:09,  7.78s/it]


KeyboardInterrupt: 

In [None]:
print(df.columns)

Index(['gloss', 'bbox', 'fps', 'frame_end', 'frame_start', 'instance_id',
       'signer_id', 'split', 'variation_id', 'video_id', 'is_available'],
      dtype='object')


In [None]:
# import warnings
# warnings.filterwarnings("ignore")

# landmarks_data = []

# for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing videos"):
#     video_id = row['video_id']
#     frame_start = row['frame_start']
#     frame_end = row['frame_end']
#     video_path = f'./kaggle-dataset/videos/{video_id}.mp4'

#     landmarks_sequence = extract_landmarks(video_path, frame_start, frame_end)

Processing videos:   0%|          | 0/11980 [00:00<?, ?it/s]W0000 00:00:1730027111.618326  402130 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.
Processing videos:   2%|▏         | 224/11980 [24:37<21:32:37,  6.60s/it]


KeyboardInterrupt: 

In [None]:
# cap = cv2.VideoCapture(0)

# with mp_holistic.Holistic(static_image_mode=False, 
#                            model_complexity=2, 
#                            enable_segmentation=True,
#                            min_detection_confidence=0.5,
#                            min_tracking_confidence=0.5) as holistic:
#     while cap.isOpened():
#         ret,frame = cap.read()
#         if not ret:
#             print("Ignoring empty camera frame")
#             continue

#         frame = cv2.flip(frame, 1)
        
#         image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#         image.flags.writeable = False

#         results = holistic.process(image)

#         image.flags.writeable = True
#         image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

#         if results.face_landmarks:
#              mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS)
#         # if results.pose_landmarks:
#         #      mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
#         if results.left_hand_landmarks:
#              mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
#         if results.right_hand_landmarks:
#              mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

#         cv2.imshow('TEST', image)

#         if cv2.waitKey(5) & 0xFF == 27:  # Press 'Esc' to exit
#                 break

# cap.release()
# cv2.destroyAllWindows()
