# Data Pre-processing and Preparation

#### 1. Importing Final Dataset from MongoDB

In [1]:
from pymongo import MongoClient
import pandas as pd
import cv2
from tqdm import tqdm
import gc
tqdm.pandas() #initialize tqdm for pandas

In [2]:
# Establishing connection 

connection = MongoClient('localhost' , 27017)
db  = connection['mydb']
collection = db['Sign_Language_Final_Data']

In [3]:
# Extracting the data

cursor = collection.find({})
final_sign_df = pd.DataFrame(list(cursor))

In [4]:
final_sign_df.head()

Unnamed: 0,_id,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,source,split,url,variation_id,video_id,is_available
0,671b7bc2c6201c92805b4f99,book,"[385, 37, 885, 720]",25,-1,1,0,118,aslbrick,train,http://aslbricks.org/New/ASL-Videos/book.mp4,0,v_id_69241,True
1,671b7bc2c6201c92805b4f9a,book,"[462, 44, 949, 720]",25,-1,1,10,31,signschool,train,https://signstock.blob.core.windows.net/signsc...,0,v_id_07069,True
2,671b7bc2c6201c92805b4f9b,book,"[234, 17, 524, 414]",25,-1,1,17,36,startasl,train,https://s3-us-west-1.amazonaws.com/files.start...,0,v_id_07068,True
3,671b7bc2c6201c92805b4f9c,book,"[131, 26, 526, 480]",25,-1,1,22,59,asldeafined,train,https://media.asldeafined.com/vocabulary/14666...,0,v_id_07070,True
4,671b7bc2c6201c92805b4f9d,book,"[162, 54, 528, 400]",25,-1,1,24,12,aslsearch,val,http://www.aslsearch.com/signs/videos/book.mp4,0,v_id_07099,True


In [5]:

 #Removing v_id_ from v_id so that the path can be matched up
final_sign_df['video_id'] = final_sign_df['video_id'].apply(lambda id : id.replace('v_id_',''))

In [6]:
final_sign_df

Unnamed: 0,_id,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,source,split,url,variation_id,video_id,is_available
0,671b7bc2c6201c92805b4f99,book,"[385, 37, 885, 720]",25,-1,1,0,118,aslbrick,train,http://aslbricks.org/New/ASL-Videos/book.mp4,0,69241,True
1,671b7bc2c6201c92805b4f9a,book,"[462, 44, 949, 720]",25,-1,1,10,31,signschool,train,https://signstock.blob.core.windows.net/signsc...,0,07069,True
2,671b7bc2c6201c92805b4f9b,book,"[234, 17, 524, 414]",25,-1,1,17,36,startasl,train,https://s3-us-west-1.amazonaws.com/files.start...,0,07068,True
3,671b7bc2c6201c92805b4f9c,book,"[131, 26, 526, 480]",25,-1,1,22,59,asldeafined,train,https://media.asldeafined.com/vocabulary/14666...,0,07070,True
4,671b7bc2c6201c92805b4f9d,book,"[162, 54, 528, 400]",25,-1,1,24,12,aslsearch,val,http://www.aslsearch.com/signs/videos/book.mp4,0,07099,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11975,671b7bc2c6201c92805b7e60,wheelchair,"[39, 13, 248, 192]",25,-1,1,5,11,signingsavvy,train,https://www.signingsavvy.com/signs/mp4/5/5233.mp4,0,63047,True
11976,671b7bc2c6201c92805b7e61,wheelchair,"[163, 62, 625, 400]",25,-1,1,8,12,aslsearch,train,http://www.aslsearch.com/signs/videos/wheelcha...,0,63050,True
11977,671b7bc2c6201c92805b7e62,whistle,"[76, 17, 236, 240]",25,-1,1,2,2,spreadthesign,train,https://media.spreadthesign.com/video/mp4/13/9...,0,63186,True
11978,671b7bc2c6201c92805b7e63,whistle,"[68, 14, 212, 192]",25,-1,1,4,11,signingsavvy,train,https://www.signingsavvy.com/signs/mp4/9/9961.mp4,0,63188,True


In [7]:
len(final_sign_df)

11980

2. Extracting Frames from each video 
 - Convert the video into a sequence of frames that can be processed individually

In [8]:
def preprocess_video(video_id , label):
 
 video_data_dict ={}
 
 video_path = f'C:\\Users\\Sahil\\Desktop\\Talkwithhands dataset\\versions\\5\\videos{video_id}.mp4'
 cap = cv2.VideoCapture(video_path)

 # Verify if the video path is correct and the video can be opened
 if not cap.isOpened():
    print(f"Failed to open video: {video_path}")
    return None

 frames = []

 # Extract Frames 

 while True:

    ret , frame = cap.read()

    if not ret:
     
         break

    # Resizing the frames 

    frame = cv2.resize(frame , (112 , 112))

    #Normalizing the frames 
    frame = frame.astype('float32') / 255.0

    frames.append(frame) 
 
 cap.release()

 #store the frames and label in a dictionary

 current_video_data = {
    'frames': frames,
    'label' : label
 }
 
#  video_data_dict[video_id] = current_video_data
 

 return current_video_data


: 

In [9]:
# Processing In Batches to prvent kernel crashes
all_video_data = [] # list to collect all the batch video data

batch_size = 100

num_batches = len(final_sign_df) // batch_size + 1

for i in tqdm(range(num_batches)):
 batch_df = final_sign_df.iloc[i * batch_size : (i + 1) * batch_size]

  # Process the batch and collect results
 video_data_batch = batch_df.progress_apply(lambda record: preprocess_video(record['video_id'], record['gloss']), axis=1)

 # Append the results to the list
 all_video_data.append(video_data_batch.tolist()) # convert series to list

 # Clear variables and run garbage collector
 del batch_df, video_data_batch
 gc.collect()
 


100%|██████████| 100/100 [00:10<00:00,  9.86it/s]
100%|██████████| 100/100 [00:07<00:00, 12.81it/s]
100%|██████████| 100/100 [00:08<00:00, 11.71it/s]
100%|██████████| 100/100 [00:08<00:00, 12.02it/s]
100%|██████████| 100/100 [00:10<00:00,  9.53it/s]
100%|██████████| 100/100 [00:10<00:00,  9.83it/s]
100%|██████████| 100/100 [00:11<00:00,  9.00it/s]
100%|██████████| 100/100 [00:09<00:00, 10.15it/s]
100%|██████████| 100/100 [00:11<00:00,  8.64it/s]
100%|██████████| 100/100 [00:11<00:00,  8.65it/s]
100%|██████████| 100/100 [00:13<00:00,  7.52it/s]
100%|██████████| 100/100 [00:10<00:00,  9.15it/s]
100%|██████████| 100/100 [00:11<00:00,  8.34it/s]
100%|██████████| 100/100 [00:13<00:00,  7.34it/s]
100%|██████████| 100/100 [00:12<00:00,  7.98it/s]
100%|██████████| 100/100 [00:12<00:00,  8.22it/s]
100%|██████████| 100/100 [00:15<00:00,  6.39it/s]
100%|██████████| 100/100 [00:15<00:00,  6.64it/s]
100%|██████████| 100/100 [00:13<00:00,  7.42it/s]
100%|██████████| 100/100 [00:16<00:00,  5.95it/s]


In [None]:
# # Testing the above function 

# video_data_list = final_sign_df.progress_apply(lambda record : preprocess_video(record['video_id'] , record['gloss'])  , axis=1)


  0%|          | 0/11980 [00:00<?, ?it/s]

 17%|█▋        | 2086/11980 [04:31<17:06,  9.64it/s]

3. Converting Into Dataframe

In [26]:
extracted_frames_df = pd.DataFrame.from_dict(vid_data , orient='index').reset_index()

In [27]:
extracted_frames_df

Unnamed: 0,index,frames,label
0,69241,"[[[[0.37254903 0.38039216 0.3882353 ], [0.3686...",book


In [12]:
dict = {
  'student1' : {
   'name' : 'sahil',
   'age' : 18
  } ,

  'student2' : {
   'name' : 'salina',
   'age' : 17
  } ,
}

In [13]:
students = pd.DataFrame.from_dict(dict , orient='index').reset_index()

In [14]:
students

Unnamed: 0,index,name,age
0,student1,sahil,18
1,student2,salina,17
