# Extracting frames and landmarks from dataset

# %load_ext cudf.pandas

In [1]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import mediapipe as mp
import cv2
import matplotlib.pyplot as plt
from pymongo import MongoClient
import gc
from joblib import Parallel, delayed
# from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
# from pandarallel import pandarallel
# import dask.dataframe as dd

In [2]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
# holistic = mp_holistic.Holistic(static_image_mode = False,
#                                 model_complexity = 2)

In [3]:
connection = MongoClient('localhost', 27017)
db = connection['mydb']
collection = db['Sign_Language_Final_Data']

In [4]:
cursor = collection.find({})
df = pd.DataFrame(list(cursor))

In [5]:
df.head()

Unnamed: 0,_id,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,source,split,url,variation_id,video_id,is_available
0,671c624d104a133c1d8d0b1c,book,"[385, 37, 885, 720]",25,-1,1,0,118,aslbrick,train,http://aslbricks.org/New/ASL-Videos/book.mp4,0,v_id_69241,True
1,671c624d104a133c1d8d0b1d,book,"[462, 44, 949, 720]",25,-1,1,10,31,signschool,train,https://signstock.blob.core.windows.net/signsc...,0,v_id_07069,True
2,671c624d104a133c1d8d0b1e,book,"[234, 17, 524, 414]",25,-1,1,17,36,startasl,train,https://s3-us-west-1.amazonaws.com/files.start...,0,v_id_07068,True
3,671c624d104a133c1d8d0b1f,book,"[131, 26, 526, 480]",25,-1,1,22,59,asldeafined,train,https://media.asldeafined.com/vocabulary/14666...,0,v_id_07070,True
4,671c624d104a133c1d8d0b20,book,"[162, 54, 528, 400]",25,-1,1,24,12,aslsearch,val,http://www.aslsearch.com/signs/videos/book.mp4,0,v_id_07099,True


In [6]:
df.drop(columns = ['_id', 'source', 'url'], inplace = True)

In [7]:
df.head()

Unnamed: 0,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,split,variation_id,video_id,is_available
0,book,"[385, 37, 885, 720]",25,-1,1,0,118,train,0,v_id_69241,True
1,book,"[462, 44, 949, 720]",25,-1,1,10,31,train,0,v_id_07069,True
2,book,"[234, 17, 524, 414]",25,-1,1,17,36,train,0,v_id_07068,True
3,book,"[131, 26, 526, 480]",25,-1,1,22,59,train,0,v_id_07070,True
4,book,"[162, 54, 528, 400]",25,-1,1,24,12,val,0,v_id_07099,True


In [8]:
df['video_id'] = df['video_id'].str.replace('v_id_', '')

In [9]:
df

Unnamed: 0,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,split,variation_id,video_id,is_available
0,book,"[385, 37, 885, 720]",25,-1,1,0,118,train,0,69241,True
1,book,"[462, 44, 949, 720]",25,-1,1,10,31,train,0,07069,True
2,book,"[234, 17, 524, 414]",25,-1,1,17,36,train,0,07068,True
3,book,"[131, 26, 526, 480]",25,-1,1,22,59,train,0,07070,True
4,book,"[162, 54, 528, 400]",25,-1,1,24,12,val,0,07099,True
...,...,...,...,...,...,...,...,...,...,...,...
11975,wheelchair,"[39, 13, 248, 192]",25,-1,1,5,11,train,0,63047,True
11976,wheelchair,"[163, 62, 625, 400]",25,-1,1,8,12,train,0,63050,True
11977,whistle,"[76, 17, 236, 240]",25,-1,1,2,2,train,0,63186,True
11978,whistle,"[68, 14, 212, 192]",25,-1,1,4,11,train,0,63188,True


In [10]:
len(df)

11980

In [11]:
df['fps'].unique().tolist()

[25]

In [12]:
df['frame_start'].unique().tolist()

[1, 2, 3]

In [13]:
df['frame_end'].unique().tolist()

[-1]

In [14]:
df['split'].unique().tolist()

['train', 'val', 'test']

In [15]:
(df['split'] == 'train').sum()

8313

In [16]:
(df['split'] == 'test').sum()

1414

In [17]:
(df['split'] == 'val').sum()

2253

In [18]:
df['gloss'].nunique()

1999

In [19]:
frame_count_cache = {}
video_capture_cache = {}

def count_frames(video_path):
    if video_path in frame_count_cache:
        return frame_count_cache[video_path]
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()
    frame_count_cache[video_path] = frame_count
    return frame_count

In [20]:
def extract_landmarks(row):

    with mp.solutions.holistic.Holistic(static_image_mode=False, model_complexity = 2) as holistic:
        video_path = f'./kaggle-dataset/videos/{row.video_id}.mp4'

        if video_path not in video_capture_cache:
            video_capture_cache[video_path] = cv2.VideoCapture(video_path)

        cap = video_capture_cache[video_path]
        
        total_frames = count_frames(video_path)

        frame_start = row.frame_start
        frame_end = total_frames - 1 if row.frame_end == -1 else row.frame_end

        x_min, y_min, x_max, y_max = row.bbox
        skip_interval = int(row.fps / 10)


        landmarks_sequence = []
        
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_start)

        cropped_width = x_max - x_min
        cropped_height = y_max - y_min

        while cap.isOpened() and frame_start <= frame_end:
            current_frame = int(cap.get(cv2.CAP_PROP_POS_FRAMES))

            if current_frame > frame_end:
                break
            
            ret, frame = cap.read()
            if not ret:
                break

            cropped_frame = frame[y_min:y_max, x_min:x_max]

            image_rgb = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2RGB)

            holistic.image_dimensions = (cropped_width, cropped_height)

            results = holistic.process(image_rgb)

            # if results.face_landmarks:
            #     mp_drawing.draw_landmarks(image_rgb, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS)
            # # if results.pose_landmarks:
            # #     mp_drawing.draw_landmarks(image_rgb, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
            # if results.left_hand_landmarks:
            #     mp_drawing.draw_landmarks(image_rgb, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
            # if results.right_hand_landmarks:
            #     mp_drawing.draw_landmarks(image_rgb, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

            # plt.imshow(image_rgb)
            # plt.axis('off')
            # plt.show()

            frame_landmarks = {
                "face": [(lm.x, lm.y, lm.z) for lm in results.face_landmarks.landmark] if results.face_landmarks else None,
                "left_hand": [(lm.x, lm.y, lm.z) for lm in results.left_hand_landmarks.landmark] if results.left_hand_landmarks else None,
                "right_hand": [(lm.x, lm.y, lm.z) for lm in results.right_hand_landmarks.landmark] if results.right_hand_landmarks else None
            }

            
            landmarks_sequence.append(frame_landmarks)

            frame_start += skip_interval
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_start)

        # cap.release()
        # print(f"Processed row for gloss: {row.gloss}, instance_id: {row.instance_id}, landmarks_sequence length: {len(landmarks_sequence)}")

        print (landmarks_sequence[:2])
    
    return (row.gloss, row.instance_id, landmarks_sequence)

In [21]:
print(df.dtypes)


gloss           object
bbox            object
fps              int64
frame_end        int64
frame_start      int64
instance_id      int64
signer_id        int64
split           object
variation_id     int64
video_id        object
is_available      bool
dtype: object


In [22]:
# import warnings
# warnings.filterwarnings("ignore")

# import logging

# # Configure logging
# logging.basicConfig(level=logging.INFO)



# tqdm.pandas(desc="Processing videos")






some_rows = df.iloc[[0, 1, 2, 3, 4,]]

n_jobs = -1

results = Parallel(n_jobs=n_jobs, backend="threading")(
    delayed(extract_landmarks)(row) for row in tqdm(some_rows.itertuples(), total=len(some_rows))
)








# dask_df = dd.from_pandas(some_rows, npartitions=14)

# def apply_extract_landmarks(df):
#     return df.apply(extract_landmarks, axis=1)


# from dask.diagnostics import ProgressBar
# with ProgressBar():
#     result = dask_df.map_partitions(apply_extract_landmarks).compute()



# landmarks_data = some_rows.parallel_apply(extract_landmarks, axis = 1)


# def parallel_extract_landmarks(df):
#     with ThreadPoolExecutor() as executor:
#         results = list(executor.map(extract_landmarks, [row for _, row in df.iterrows()]))
#     return pd.DataFrame(results)



# results_df = parallel_extract_landmarks(some_rows)







# some_rows.progress_apply(extract_landmarks, axis = 1)

# with ThreadPoolExecutor(max_workers=20) as executor:
#     futures = {executor.submit(extract_landmarks, row): index for index, row in some_rows.iterrows()}
#     landmarks_data = []

#     for future in tqdm(futures, desc="Collecting results"):
#         result = future.result()
#         landmarks_data.append(result)
#         # logging.info(f"Collected landmarks for gloss: {result[0]}, instance_id: {result[1]}")

# landmarks_df = pd.DataFrame(landmarks_data, columns=["gloss", "instance_id", "landmarks_sequence"])

# # logging.info(landmarks_df)

# merged_df = pd.merge(df, landmarks_df, on=['gloss', 'instance_id'], how='inner')

# # Release all video captures after processing
# for cap in video_capture_cache.values():
#     cap.release()


# holistic.close()
# gc.collect()

100%|██████████| 5/5 [00:00<00:00, 19222.29it/s]
I0000 00:00:1730615139.299524   31125 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1730615139.381722   31168 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 550.120), renderer: NVIDIA GeForce RTX 4050 Laptop GPU/PCIe/SSE2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
I0000 00:00:1730615139.406413   31126 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
W0000 00:00:1730615139.447420   31148 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1730615139.447672   31190 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 550.120), renderer: NVIDIA GeForce RTX 4050 Laptop GPU/PCIe/SSE2
I0000 00:00:1730615139.468508   31127 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
W0000 00:00:1730615139.510810   31175 inference_feedback_manager.c

[{'face': [(0.457228422164917, 0.2590799927711487, -0.04655074700713158), (0.4581688940525055, 0.22486723959445953, -0.06679660081863403), (0.45836469531059265, 0.23847927153110504, -0.040016062557697296), (0.45060452818870544, 0.1938411146402359, -0.04195578023791313), (0.4587770402431488, 0.2140408307313919, -0.06892367452383041), (0.46005257964134216, 0.2016228437423706, -0.060939498245716095), (0.46338701248168945, 0.1734570413827896, -0.018336301669478416), (0.37149977684020996, 0.1770465075969696, 0.03383644297719002), (0.46590176224708557, 0.14757557213306427, -0.003255711402744055), (0.4670655429363251, 0.13269296288490295, -0.001830432447604835), (0.47002822160720825, 0.08461011946201324, 0.024974409490823746), (0.4569200277328491, 0.26401612162590027, -0.0459967702627182), (0.4569314420223236, 0.2682766616344452, -0.04264410212635994), (0.4571228623390198, 0.2702941298484802, -0.03781983628869057), (0.4578271508216858, 0.2706766128540039, -0.04057595878839493), (0.45767253637

In [23]:
landmarks_df = pd.DataFrame(results, columns=["gloss","instance_id", "landmarks_sequence"])

In [24]:
landmarks_df

Unnamed: 0,gloss,instance_id,landmarks_sequence
0,book,0,"[{'face': [(0.5075485110282898, 0.305947214365..."
1,book,10,"[{'face': [(0.457228422164917, 0.2590799927711..."
2,book,17,"[{'face': [(0.5032461881637573, 0.255477875471..."
3,book,22,"[{'face': [(0.49740689992904663, 0.28395879268..."
4,book,24,"[{'face': [(0.4821474254131317, 0.232690572738..."


In [25]:
merged_df = pd.merge(df, landmarks_df, on=['gloss', 'instance_id'], how='inner')  # You can change 'inner' to 'outer', 'left', or 'right' depending on the merge type you want.

In [26]:
merged_df

Unnamed: 0,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,split,variation_id,video_id,is_available,landmarks_sequence
0,book,"[385, 37, 885, 720]",25,-1,1,0,118,train,0,69241,True,"[{'face': [(0.5075485110282898, 0.305947214365..."
1,book,"[462, 44, 949, 720]",25,-1,1,10,31,train,0,7069,True,"[{'face': [(0.457228422164917, 0.2590799927711..."
2,book,"[234, 17, 524, 414]",25,-1,1,17,36,train,0,7068,True,"[{'face': [(0.5032461881637573, 0.255477875471..."
3,book,"[131, 26, 526, 480]",25,-1,1,22,59,train,0,7070,True,"[{'face': [(0.49740689992904663, 0.28395879268..."
4,book,"[162, 54, 528, 400]",25,-1,1,24,12,val,0,7099,True,"[{'face': [(0.4821474254131317, 0.232690572738..."


In [None]:
# cap = cv2.VideoCapture(0)

# with mp_holistic.Holistic(static_image_mode=False, 
#                            model_complexity=2, 
#                            enable_segmentation=True,
#                            min_detection_confidence=0.5,
#                            min_tracking_confidence=0.5) as holistic:
#     while cap.isOpened():
#         ret,frame = cap.read()
#         if not ret:
#             print("Ignoring empty camera frame")
#             continue

#         frame = cv2.flip(frame, 1)
        
#         image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#         image.flags.writeable = False

#         results = holistic.process(image)

#         image.flags.writeable = True
#         image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

#         if results.face_landmarks:
#              mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS)
#         # if results.pose_landmarks:
#         #      mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
#         if results.left_hand_landmarks:
#              mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
#         if results.right_hand_landmarks:
#              mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

#         cv2.imshow('TEST', image)

#         if cv2.waitKey(5) & 0xFF == 27:  # Press 'Esc' to exit
#                 break

# cap.release()
# cv2.destroyAllWindows()
