<a href="https://colab.research.google.com/github/ValentinaEmili/Sign_language/blob/main/hands_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install mediapipe==0.10.5

In [None]:
# Mount Google Drive at /content/drive so that the dataset paths used below
# (under /content/drive/MyDrive/...) are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import cv2
from google.colab.patches import cv2_imshow
from tqdm import tqdm
import mediapipe as mp
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
import os

In [None]:
# Dataset layout on the mounted Drive and the WLASL annotation table.
folder = "/content/drive/MyDrive/NLP/dataset/"
js_file = pd.read_json("/content/drive/MyDrive/NLP/WLASL_v0.3.json")

# One sub-folder per split.
training_folder = f"{folder}train/"
validation_folder = f"{folder}val/"
test_folder = f"{folder}test/"

# Each split keeps its raw clips in "video/" ...
training_video = f"{training_folder}video/"
validation_video = f"{validation_folder}video/"
test_video = f"{test_folder}video/"

# ... and its extracted outputs in "images/".
training_images = f"{training_folder}images/"
validation_images = f"{validation_folder}images/"
test_images = f"{test_folder}images/"

# Values of the annotation "source" field that are treated as YouTube videos.
youtube_videos = [
    'asl5200',
    'asllex',
    'aslu',
    'lillybauer',
    'nabboud',
    'northtexas',
    'scott',
    'valencia-asl',
]

The MediaPipe Holistic model detects pose, hand and face landmarks:
- pose: 33 keypoints; for each one we save (x, y, z, visibility)
- left hand: 21 keypoints; for each one we save (x, y, z)
- right hand: 21 keypoints; for each one we save (x, y, z)
- face (not used): 468 keypoints


In [None]:
# holistic model
#
# For every non-YouTube instance in the annotation table whose video file is
# present, run MediaPipe Holistic on the frames in [frame_start, frame_end]
# and save one feature row per frame to "<gloss>_<video_id>.npy":
#   pose       33 points x (x, y, z, visibility) = 132 values
#   left hand  21 points x (x, y, z)             =  63 values
#   right hand 21 points x (x, y, z)             =  63 values
# Frames in which nothing at all was detected are not saved.
# Instances whose .npy file already exists are skipped, so the cell can be
# re-run to resume an interrupted extraction.

mp_holistic = mp.solutions.holistic

_POSE_POINTS = 33
_HAND_POINTS = 21


def _flatten_landmarks(landmark_list, n_points, with_visibility=False):
  """Return the landmarks as a flat list of floats.

  Each landmark contributes (x, y, z), plus visibility when
  `with_visibility` is true. When `landmark_list` is empty/None (nothing
  detected) a zero vector of the same length is returned instead, so every
  frame row has a fixed size.
  """
  values_per_point = 4 if with_visibility else 3
  if not landmark_list:
    return [0.0] * (n_points * values_per_point)

  values = []
  for landmark in landmark_list.landmark:
    values.extend([landmark.x, landmark.y, landmark.z])
    if with_visibility:
      values.append(landmark.visibility)
  return values


for word, instances in tqdm(zip(js_file['gloss'], js_file['instances']),
                            total=len(js_file), desc='glosses'):
  for instance in instances:
    video_id = instance['video_id']
    source = instance['source']
    split = instance['split']
    frame_end = instance['frame_end']
    frame_start = instance['frame_start']
    filename = f"{word}_{video_id}.mp4"
    # Defined before the "already processed" check below; the original read
    # it before assignment.
    dest_file = f"{word}_{video_id}.npy"

    # any split other than 'train' / 'val' is treated as test
    source_path = training_video if split == 'train' else validation_video if split == 'val' else test_video
    dest_path = training_images if split == 'train' else validation_images if split == 'val' else test_images
    os.makedirs(dest_path, exist_ok=True)

    if source in youtube_videos:
      continue

    # skip videos with broken url (file was never downloaded);
    # a direct path check avoids listing the whole folder for every instance
    video_path = os.path.join(source_path, filename)
    if not os.path.isfile(video_path):
      continue

    # skip videos that were already processed in a previous run
    output_path = os.path.join(dest_path, dest_file)
    if os.path.exists(output_path):
      continue

    all_frame_features = []
    cap = cv2.VideoCapture(video_path)
    try:
      total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
      # frame_end == -1 in the annotations means "until the end of the video"
      frame_end = frame_end if frame_end != -1 else total_frames
      curr_frame = 0

      # a fresh Holistic instance per video, as in the original code
      with mp_holistic.Holistic(
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5) as holistic:

        while cap.isOpened():
          ret, frame = cap.read()
          if not ret:
            break

          # frames before the annotated start are read but not processed
          if curr_frame < frame_start:
            curr_frame += 1
            continue

          if curr_frame > frame_end:
            break

          image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
          # to improve performance, optionally mark the image as not writeable to pass by reference
          image.flags.writeable = False
          results = holistic.process(image)

          # build the fixed-length feature row for this frame
          frame_features = (
            _flatten_landmarks(results.pose_landmarks, _POSE_POINTS, with_visibility=True)
            + _flatten_landmarks(results.left_hand_landmarks, _HAND_POINTS)
            + _flatten_landmarks(results.right_hand_landmarks, _HAND_POINTS)
          )

          # Keep the frame only if something was detected. `any` is used
          # instead of `sum(...) > 0` because z coordinates can be negative,
          # so a frame with real detections could sum to <= 0.
          if any(frame_features):
            all_frame_features.append(frame_features) # for the whole video

          curr_frame += 1
    finally:
      # release the capture even if decoding or inference raised
      cap.release()

    np.save(output_path, np.array(all_frame_features))

Without youtube videos:
- training set: 6537 videos
- validation set: 1787 videos
- test set: 1254 videos

Training gloss: 1980 words (words that appear in the test and validation sets but not in the training set still have to be removed)

With youtube videos:
- training set: 8760 videos
- validation set: 2493 videos
- test set: 2060 videos

Training gloss: 1999 words. The glosses 'boots' and 'wash face' are missing from all three sets.