# Data Pre-processing and Preparation

#### 1. Importing Final Dataset from MongoDB

In [1]:
from pymongo import MongoClient
import cv2
from tqdm import tqdm
import gc
tqdm.pandas() #initialize tqdm for pandas
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# import cudf.pandas
# cudf.pandas.install()
import pandas as pd


In [2]:
# Open a client against the local MongoDB server, then select the database
# and the collection that hold the final sign-language dataset.

connection = MongoClient(host = 'localhost' , port = 27017)
db = connection.get_database('mydb')
collection = db.get_collection('Sign_Language_Final_Data')

In [3]:
# Extracting the data
# The empty filter passed to find() places no restriction on the documents returned.
# The cursor is materialised into a list so that pandas builds one row per document.

cursor = collection.find({})
final_sign_df = pd.DataFrame(list(cursor))

In [4]:
# Preview the first rows of the dataset loaded from MongoDB
final_sign_df.head()

Unnamed: 0,_id,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,source,split,url,variation_id,video_id,is_available
0,671b7bc2c6201c92805b4f99,book,"[385, 37, 885, 720]",25,-1,1,0,118,aslbrick,train,http://aslbricks.org/New/ASL-Videos/book.mp4,0,v_id_69241,True
1,671b7bc2c6201c92805b4f9a,book,"[462, 44, 949, 720]",25,-1,1,10,31,signschool,train,https://signstock.blob.core.windows.net/signsc...,0,v_id_07069,True
2,671b7bc2c6201c92805b4f9b,book,"[234, 17, 524, 414]",25,-1,1,17,36,startasl,train,https://s3-us-west-1.amazonaws.com/files.start...,0,v_id_07068,True
3,671b7bc2c6201c92805b4f9c,book,"[131, 26, 526, 480]",25,-1,1,22,59,asldeafined,train,https://media.asldeafined.com/vocabulary/14666...,0,v_id_07070,True
4,671b7bc2c6201c92805b4f9d,book,"[162, 54, 528, 400]",25,-1,1,24,12,aslsearch,val,http://www.aslsearch.com/signs/videos/book.mp4,0,v_id_07099,True


In [5]:

 #Removing v_id_ from v_id so that the path can be matched up
# Strip the literal 'v_id_' marker with the vectorised .str accessor instead of a per-row lambda:
# this avoids shadowing the builtin `id` and leaves missing values as NaN rather than raising.
# regex = False keeps the pattern a plain substring, exactly like str.replace on each value.
final_sign_df['video_id'] = final_sign_df['video_id'].str.replace('v_id_' , '' , regex = False)

In [6]:
# Display the frame again to check that video_id no longer carries the 'v_id_' marker
final_sign_df

Unnamed: 0,_id,gloss,bbox,fps,frame_end,frame_start,instance_id,signer_id,source,split,url,variation_id,video_id,is_available
0,671b7bc2c6201c92805b4f99,book,"[385, 37, 885, 720]",25,-1,1,0,118,aslbrick,train,http://aslbricks.org/New/ASL-Videos/book.mp4,0,69241,True
1,671b7bc2c6201c92805b4f9a,book,"[462, 44, 949, 720]",25,-1,1,10,31,signschool,train,https://signstock.blob.core.windows.net/signsc...,0,07069,True
2,671b7bc2c6201c92805b4f9b,book,"[234, 17, 524, 414]",25,-1,1,17,36,startasl,train,https://s3-us-west-1.amazonaws.com/files.start...,0,07068,True
3,671b7bc2c6201c92805b4f9c,book,"[131, 26, 526, 480]",25,-1,1,22,59,asldeafined,train,https://media.asldeafined.com/vocabulary/14666...,0,07070,True
4,671b7bc2c6201c92805b4f9d,book,"[162, 54, 528, 400]",25,-1,1,24,12,aslsearch,val,http://www.aslsearch.com/signs/videos/book.mp4,0,07099,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11975,671b7bc2c6201c92805b7e60,wheelchair,"[39, 13, 248, 192]",25,-1,1,5,11,signingsavvy,train,https://www.signingsavvy.com/signs/mp4/5/5233.mp4,0,63047,True
11976,671b7bc2c6201c92805b7e61,wheelchair,"[163, 62, 625, 400]",25,-1,1,8,12,aslsearch,train,http://www.aslsearch.com/signs/videos/wheelcha...,0,63050,True
11977,671b7bc2c6201c92805b7e62,whistle,"[76, 17, 236, 240]",25,-1,1,2,2,spreadthesign,train,https://media.spreadthesign.com/video/mp4/13/9...,0,63186,True
11978,671b7bc2c6201c92805b7e63,whistle,"[68, 14, 212, 192]",25,-1,1,4,11,signingsavvy,train,https://www.signingsavvy.com/signs/mp4/9/9961.mp4,0,63188,True


In [7]:
# Number of records loaded into the dataframe
len(final_sign_df)

11980

2. Defining functions to extract a fixed number of frames from each video
 - Convert the video into a sequence of frames that can be processed individually

In [8]:
def format_frames(frame , output_size):
   """Convert a single frame to float32 and pad-resize it to `output_size`.

   Args:
     frame: image data for one video frame.
     output_size: sequence unpacked positionally into `tf.image.resize_with_pad`
       after the image (callers in this notebook pass a (height , width) pair).

   Returns:
     The converted and resized frame as returned by `tf.image.resize_with_pad`.
   """

   # No explicit division by 255.0 is required: converting integer pixels (0 to 255)
   # to tf.float32 with convert_image_dtype already rescales them into [0 , 1].
   as_float = tf.image.convert_image_dtype(frame , dtype = tf.float32)

   return tf.image.resize_with_pad(as_float , *output_size)

In [9]:
def check_frame_values(frame , frame_number):
  """Print a short diagnostic describing the value range of one frame.

  Three outcomes are reported:
    * every value is zero (e.g. a zero-padded placeholder frame),
    * every value lies in the normalised range [0 , 1]  -> "valid",
    * at least one value falls outside [0 , 1] (or is NaN) -> "invalid".

  Args:
    frame: array-like of pixel values for a single frame.
    frame_number: identifier of the frame, used only in the printed message.

  Returns:
    None. The result is printed, not returned.
  """

  if np.all(frame == 0):
    print(f"zeros at frame : {frame_number}")

  # Require ALL values to be inside [0 , 1]. The previous np.any(...) test reported a frame
  # as valid as soon as a single pixel was in range, so an unnormalised frame (0 to 255)
  # containing one in-range pixel slipped through. Zeros are accepted because padded
  # regions of a normalised frame are legitimately 0.
  elif np.all((frame >= 0) & (frame <= 1)):
    print(f'frame {frame_number} contains valid frames')

  else:
    print(f'frame {frame_number} contains invalid frames')
      


In [10]:
def frames_from_video(video_id , n_frames ,v_dict, label, output_size = (224,224) , frame_step = 4 ,
                      video_dir = r'C:\Users\Sahil\Desktop\Talkwithhands dataset\versions\5\videos'):
  """Sample `n_frames` frames from one video and store them, with their label, in `v_dict`.

  A random start position is chosen so that `n_frames` frames spaced `frame_step` apart fit
  inside the video; when the video is too short, sampling starts at frame 0 and the missing
  frames are filled with zeros shaped like the first frame.

  Args:
    video_id: id of the video; the file read is `<video_dir>\\<video_id>.mp4`.
    n_frames: number of frames to collect.
    v_dict: dictionary updated in place with
      `v_dict[video_id] = {'frames_data': <array> , 'label': label}`.
    label: label stored next to the frames.
    output_size: size forwarded to `format_frames` for every frame.
    frame_step: number of reads between two collected frames.
    video_dir: directory containing the `.mp4` files (defaults to the original hard-coded path).

  Returns:
    None. Results are written into `v_dict`. Nothing is stored when the video cannot be
    opened or its first frame cannot be read; a message is printed instead.
  """

  video_path = rf'{video_dir}\{video_id}.mp4'

  src = cv2.VideoCapture(str(video_path))

  # Verify that the video path is correct and the video can be opened
  if not src.isOpened():
    print(f"Failed to open video: {video_path}")
    src.release()
    return None

  # try / finally guarantees the capture handle is released even if decoding or
  # formatting a frame raises part-way through.
  try:

    # The frame count is reported as a float; cast it so it can be used as an integer bound
    video_length = int(src.get(cv2.CAP_PROP_FRAME_COUNT))

    need_length = 1 + (n_frames - 1) * frame_step

    # Pick a random start only when the video has enough frames for the whole sample
    if need_length > video_length:
      start = 0
    else:
      max_start = video_length - need_length
      start = np.random.randint(0 , max_start + 1)

    # Move the capture to the chosen starting frame
    src.set(cv2.CAP_PROP_POS_FRAMES , start)

    # read() returns (ret , frame): ret is True when a frame was decoded, frame is the image data.
    # The first read must succeed because its shape is the template for zero padding below.
    ret , frame = src.read()
    if not ret:
      print(f"Failed to read frames from video: {video_path}")
      return None

    result = [format_frames(frame , output_size)]

    # Collect the remaining frames (the first one is already stored)
    for _ in range(n_frames - 1):

      # Advance frame_step reads; only the last frame read is kept
      for _skip in range(frame_step):
        ret , frame = src.read()

      if ret:
        result.append(format_frames(frame , output_size))
      else:
        # No more frames: pad with zeros of the same shape as the first frame
        result.append(np.zeros_like(result[0]))

  finally:
    src.release()

  # The [2, 1, 0] indexing reverses the colour channels from BGR to RGB
  # (OpenCV loads images in BGR order, while many libraries expect RGB).
  result = np.array(result)[... , [2,1,0]]

  v_dict[video_id] = {
    'frames_data' : result,
    'label' : label
  }



  
  
    


  


In [11]:
# def preprocess_video(video_id):
 
#  video_data_dict ={}
 
#  video_path = rf'C:\Users\Sahil\Desktop\Talkwithhands dataset\versions\5\videos\{video_id}.mp4'
#  cap = cv2.VideoCapture(video_path)

#  # Verify if the video path is correct and the video can be opened
#  if not cap.isOpened():
#     print(f"Failed to open video: {video_path}")
#     return None

#  frames = []

#  # Extract Frames 

#  while True:

#     ret , frame = cap.read()

#     if not ret:
     
#          break

#     # Resizing the frames 

#     frame = cv2.resize(frame , (112 , 112))

#     #Normalizing the frames 
#     frame = frame.astype('float32') / 255.0

#     frames.append(frame) 
 
#  cap.release()
# #  video_data_dict[video_id] = current_video_data
 

#  return frames



3. Extracting Frames from each video present in the dataset

In [12]:
# # Processing in batches to prevent kernel crashes
# all_video_data = [] # list to collect all the batch video data

# batch_size = 20

# num_batches = len(final_sign_df) // batch_size + 1

# for i in tqdm(range(num_batches)):
#  batch_df = final_sign_df.iloc[i * batch_size : (i + 1) * batch_size]

#   # Process the batch and collect results
#  video_data_batch = batch_df.progress_apply(lambda record: preprocess_video(record['video_id']), axis=1)


#  video_batch_dict = [{'video_id' : record['video_id'] , 'label' : record['gloss'] , 'video_data' : [frame.tolist() for frame in data]} 
#                      for record , data in zip(batch_df.to_dict(orient='records') , video_data_batch)]
 

#  # Append the results to the list
#  # all_video_data.append(video_data_batch.tolist()) # convert series to list

#  # Clear variables and run garbage collector
#  del batch_df, video_data_batch , video_batch_dict
#  gc.collect()

In [13]:
# List with one dictionary per batch; each dictionary maps
# video_id -> {'frames_data' , 'label'} as filled in by frames_from_video.
video_frames_dict = []

# Extracting frames from the videos batch by batch
batch_size = 1000

# Ceiling division: avoids a trailing empty batch when the number of rows
# is an exact multiple of batch_size.
num_batches = (len(final_sign_df) + batch_size - 1) // batch_size

for i in range(num_batches):

  batch_df = final_sign_df.iloc[i * batch_size : (i + 1) * batch_size]

  # A fresh dictionary for the current batch
  batch_dict = {}

  # Process every row of the batch; frames_from_video writes its result into batch_dict
  batch_df.progress_apply(lambda record : frames_from_video(video_id = record['video_id'] , n_frames = 15 , v_dict = batch_dict , label = record['gloss']) , axis = 1)

  # Keep the batch. The dictionary must NOT be cleared afterwards: the list stores a
  # reference to this very object, so batch_dict.clear() would also empty the stored
  # entry and discard every frame just extracted. A new dictionary is created at the
  # top of each iteration, so no clearing is needed.
  video_frames_dict.append(batch_dict)

  # Drop the batch slice and run the garbage collector to release temporaries
  batch_df = None
  gc.collect()

# NOTE(review): every stored video holds 15 frames of 224 x 224 float32 pixels
# (about 8.6 MiB with 3 colour channels), so keeping all batches in memory needs
# RAM proportional to the dataset size. If that does not fit, persist each batch
# to disk inside the loop instead of accumulating it here.






 69%|██████▊   | 686/1000 [02:15<01:01,  5.07it/s]


error: OpenCV(4.10.0) D:\a\opencv-python\opencv-python\opencv\modules\core\src\alloc.cpp:73: error: (-4:Insufficient memory) Failed to allocate 6220800 bytes in function 'cv::OutOfMemoryError'


: 

In [12]:
# # Intializing empty dict to store each frame as a entry to the dictionary
# video_frames_dict = {}

# # Applying to each video in the dataframe

# final_sign_df.progress_apply(lambda record : frames_from_video(video_id= record['video_id'] , n_frames = int(30) , v_dict= video_frames_dict ,label = record['gloss']) , axis = 1 )

 10%|█         | 1217/11980 [05:15<46:27,  3.86it/s]  


MemoryError: Unable to allocate 17.2 MiB for an array with shape (3, 30, 224, 224) and data type float32

3. Pre-processing each video in batches and storing each batch in MongoDB for memory management

100%|██████████| 20/20 [00:01<00:00, 11.87it/s]


Writing batch to MongoDB...
An error occurred while inserting to MongoDB: BSONObj size: 39606890 (0x25C5A6A) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('671bad4b587db12f2f498a36'), full error: {'ok': 0.0, 'errmsg': "BSONObj size: 39606890 (0x25C5A6A) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('671bad4b587db12f2f498a36')", 'code': 10334, 'codeName': 'BSONObjectTooLarge'}


100%|██████████| 20/20 [00:03<00:00,  5.58it/s]t]


Writing batch to MongoDB...
An error occurred while inserting to MongoDB: BSONObj size: 46472077 (0x2C51B8D) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('671bad5c587db12f2f498a4b'), full error: {'ok': 0.0, 'errmsg': "BSONObj size: 46472077 (0x2C51B8D) is invalid. Size must be between 0 and 16793600(16MB) First element: _id: ObjectId('671bad5c587db12f2f498a4b')", 'code': 10334, 'codeName': 'BSONObjectTooLarge'}


100%|██████████| 20/20 [00:01<00:00, 13.61it/s]t]
  0%|          | 2/600 [00:36<2:59:31, 18.01s/it]


KeyboardInterrupt: 

In [None]:
# # Testing the above function 

# video_data_list = final_sign_df.progress_apply(lambda record : preprocess_video(record['video_id'] , record['gloss'])  , axis=1)


  0%|          | 0/11980 [00:00<?, ?it/s]

 17%|█▋        | 2086/11980 [04:31<17:06,  9.64it/s]

3. Converting into a DataFrame

In [26]:
# Build one row per top-level key of vid_data; reset_index() moves those keys into an 'index' column.
# NOTE(review): `vid_data` is not defined in any cell visible in this notebook, so this cell
# fails on a fresh kernel — confirm which cell is meant to produce it.
extracted_frames_df = pd.DataFrame.from_dict(vid_data , orient='index').reset_index()

In [27]:
# Display the dataframe built from the extracted frames
extracted_frames_df

Unnamed: 0,index,frames,label
0,69241,"[[[[0.37254903 0.38039216 0.3882353 ], [0.3686...",book


In [12]:
# Small nested-dictionary example used to try out DataFrame.from_dict(orient='index').
# NOTE(review): the name `dict` shadows the Python builtin for the rest of the kernel session;
# consider renaming it together with the cell that consumes it.
dict = {
  'student1' : {
   'name' : 'sahil',
   'age' : 18
  } ,

  'student2' : {
   'name' : 'salina',
   'age' : 17
  } ,
}

In [13]:
# Each outer key becomes one row and the inner keys become columns;
# reset_index() then exposes the outer keys as an 'index' column.
students = pd.DataFrame.from_dict(dict , orient='index').reset_index()

In [14]:
# Display the resulting example dataframe
students

Unnamed: 0,index,name,age
0,student1,sahil,18
1,student2,salina,17
