In [26]:
import concurrent.futures
import os
import shutil
import time

import cv2
import mediapipe as mp
import numpy as np
from mediapipe.framework.formats import landmark_pb2
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from moviepy.editor import VideoFileClip, ImageSequenceClip

#### Setting up the MediaPipe vision tasks

Documentation for the hand landmarker:
- https://ai.google.dev/edge/mediapipe/solutions/vision/hand_landmarker/python#configuration_options
- https://github.com/google-ai-edge/mediapipe-samples/blob/main/examples/hand_landmarker/python/hand_landmarker.ipynb

Documentation for the pose landmarker:
- https://ai.google.dev/edge/mediapipe/solutions/vision/pose_landmarker#configurations_options
- https://github.com/google-ai-edge/mediapipe-samples/blob/main/examples/pose_landmarker/python/%5BMediaPipe_Python_Tasks%5D_Pose_Landmarker.ipynb

In [27]:
# Shared MediaPipe handles used by the cells below.
VisionRunningMode = mp.tasks.vision.RunningMode
drawer = mp.solutions.drawing_utils  # helpers for rendering landmarks on frames

# ---- hand landmarker ----
# Up to two hands per image, each frame treated as an independent image.
# NOTE(review): min_tracking_confidence presumably has no effect in IMAGE mode
# (tracking is a video/live-stream concept) - confirm against the MediaPipe docs.
hand_base_options = python.BaseOptions(
    model_asset_path="./tasks/hand_landmarker.task"
)
hand_options = vision.HandLandmarkerOptions(
    running_mode=VisionRunningMode.IMAGE,
    base_options=hand_base_options,
    num_hands=2,
    min_hand_detection_confidence=0.6,
    min_hand_presence_confidence=0.6,
    min_tracking_confidence=0.1,
)
hand_detector = vision.HandLandmarker.create_from_options(hand_options)

# ---- pose landmarker ----
# Segmentation masks are requested here although the cells below only read
# the landmark fields of the pose result.
pose_base_options = python.BaseOptions(
    model_asset_path="./tasks/pose_landmarker.task"
)
pose_options = vision.PoseLandmarkerOptions(
    running_mode=VisionRunningMode.IMAGE,
    base_options=pose_base_options,
    output_segmentation_masks=True,
    min_pose_detection_confidence=0.6,
    min_pose_presence_confidence=0.6,
    min_tracking_confidence=0.1,
)
pose_detector = vision.PoseLandmarker.create_from_options(pose_options)

#### Setting up for dataset preprocessing

Reference video showing how to perform the signs: [YouTube](https://www.youtube.com/watch?v=0FcwzMq4iWg)

In [28]:
# DATASET_PATH_RAW: where the recorded videos are read from.
# DATASET_PATH: where the extracted landmark arrays are written.
DATASET_PATH_RAW = os.path.join("../storage/datasets/archive/raw")
DATASET_PATH = os.path.join("../storage/datasets/cleaned")

# action labels (each one is also used as a sub-folder name)
ACTIONS = [
    "_",
    "hello",
    "what's up",
    "how",
    "thanks",
    "you",
    "morning",
    "afternoon",
    "night",
    "me",
    "name",
    "fine",
    "happy",
    "yes",
    "no",
    "repeat",
    "please",
    "want",
    "good bye",
    "learn",
]

# To preprocess only a subset of the labels, uncomment and adjust the slice below.
# NOTE: keep it in sync with the labels actually present in the raw dataset.
# ACTIONS = ACTIONS[15:16]

# expected frames in each video, and number of raw videos recorded per label
frames_per_video = 60
videos_per_label = 120  # change it to how many videos you have

In [29]:
ACTIONS

['repeat']

In [30]:
# Start from a clean output directory so arrays from a previous run cannot mix
# with the new ones. A missing directory is expected on the first run; any
# other removal failure is reported instead of being silently swallowed.
try:
    shutil.rmtree(DATASET_PATH)
except FileNotFoundError:
    pass
except OSError as error:
    print(f"Could not fully remove {DATASET_PATH}: {error}")
    print("========================================")

# exist_ok=True keeps this cell re-runnable even when the removal above was incomplete
os.makedirs(DATASET_PATH, exist_ok=True)

for action in ACTIONS:
    os.makedirs(os.path.join(DATASET_PATH, action), exist_ok=True)

    print(f"[CREATED] {action}")

[CREATED] repeat


#### Setting up for extracting the MediaPipe landmarks

##### Landmarker (drawing)

In [31]:
LandmarkList = landmark_pb2.NormalizedLandmarkList  # aliases for landmark types
NormalizedLandmark = landmark_pb2.NormalizedLandmark  # aliases for landmark types

# Number of landmarks (not coordinates) in one pose / one hand. Each landmark
# message already carries x, y and z, so the counts must not be multiplied by 3.
POSE_LANDMARK_COUNT = 33
HAND_LANDMARK_COUNT = 21


def to_landmark_list(landmarks):
    """
    Build a NormalizedLandmarkList from an iterable of landmarks.

    Every item only needs `x`, `y` and `z` attributes; the values are copied
    into fresh NormalizedLandmark messages. An empty iterable produces an
    empty list - no placeholder values are inserted.
    """
    return LandmarkList(
        landmark=[NormalizedLandmark(x=lm.x, y=lm.y, z=lm.z) for lm in landmarks]
    )


# all-zero placeholders used for drawing when nothing was detected
empty_pose_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(POSE_LANDMARK_COUNT)]
)

empty_hand_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(HAND_LANDMARK_COUNT)]
)


def to_drawing_landmark(hand_results, pose_results):
    """
    Convert pose and hand detection results to LandmarkList objects for drawing.

    Returns a `(hand_landmarks, pose_landmarks)` tuple in every case:
    - `hand_landmarks` is a two-element list indexed by the handedness index,
      or None when `hand_results` is falsy;
    - `pose_landmarks` is built from the first detected pose, or is the
      all-zero placeholder when no pose was detected.
    """

    pose_landmarks = (
        to_landmark_list(pose_results.pose_landmarks[0])
        if pose_results.pose_landmarks
        else empty_pose_landmarks
    )

    # keep the same (hand, pose) order as the normal return below, because the
    # caller unpacks the result as `hand, pose`
    if not hand_results:
        return None, pose_landmarks

    hand_landmarks = [empty_hand_landmarks, empty_hand_landmarks]

    # iterate over the detected hand landmarks
    for index, hand_landmark in enumerate(hand_results.hand_landmarks):
        # slot for this hand, taken from the handedness category index
        # (original author's note: 0 for right hand, 1 for left hand)
        handedness = hand_results.handedness[index][0].index

        # a second hand reported with the same index overwrites the first one
        hand_landmarks[handedness] = to_landmark_list(hand_landmark)

    return hand_landmarks, pose_landmarks


def draw_landmark(image, hand_landmarks, pose_landmarks):
    """
    Render the pose skeleton and, when given, every hand skeleton onto `image`.

    The image is modified in place and nothing is returned. A falsy
    `hand_landmarks` (None or an empty list) means only the pose is drawn.
    """
    pose_point_style = drawer.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=3)
    pose_line_style = drawer.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2)

    drawer.draw_landmarks(
        image,
        pose_landmarks,
        mp.solutions.pose.POSE_CONNECTIONS,
        pose_point_style,
        pose_line_style,
    )

    if not hand_landmarks:
        return

    hand_point_style = drawer.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=2)
    hand_line_style = drawer.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2)

    for single_hand in hand_landmarks:
        drawer.draw_landmarks(
            image,
            single_hand,
            mp.solutions.hands.HAND_CONNECTIONS,
            hand_point_style,
            hand_line_style,
        )

##### Landmarker (dataset)

In [32]:
empty_hand_landmark = np.zeros((2, 21, 3))  # right hand and left hand
empty_pose_landmark = np.zeros(33 * 3)


def to_landmark_data(
    hand_results: "vision.HandLandmarkerResult",
    pose_results: "vision.PoseLandmarkerResult",
):
    """
    Extract keypoints from pose and hand results for dataset creation.

    Returns a 1-D array: the flattened (x, y, z) world coordinates of the first
    detected pose, followed by the flattened coordinates of the two hand slots.
    Anything that was not detected is left as zeros (99 zeros for the pose,
    63 zeros per hand slot).
    """
    pose_landmark = empty_pose_landmark

    # Work on a private copy: the loop below assigns into the array, and writing
    # into the shared module-level template would leak one frame's hands into
    # every later frame (and race between the worker threads).
    hand_landmark = empty_hand_landmark.copy()

    if pose_results.pose_world_landmarks:
        pose_landmark = np.array(
            [[lm.x, lm.y, lm.z] for lm in pose_results.pose_world_landmarks[0]]
        ).flatten()

    # if no hand results are available, return the pose keypoints followed by
    # the empty hand keypoints
    if not hand_results:
        return np.concatenate([pose_landmark, hand_landmark.flatten()])

    # iterate over the detected hand landmarks
    for index, hlm in enumerate(hand_results.hand_world_landmarks):
        # slot for this hand, taken from the handedness category index
        # (original author's note: 0 for right hand, 1 for left hand)
        handedness = hand_results.handedness[index][0].index

        # extract the keypoints for the current hand and assign them to the appropriate index
        hand_landmark[handedness] = np.array([[lm.x, lm.y, lm.z] for lm in hlm])

    return np.concatenate([pose_landmark, hand_landmark.flatten()])

##### Saving the landmarker data

In [33]:
def save_cleaned_landmark(action: str, sequence: int, keypoints: np.ndarray):
    """
    Store the keypoints of one video as DATASET_PATH/<action>/<sequence>.

    The path is passed to `np.save` without an extension, so numpy appends
    `.npy` itself.
    """
    action_dir = os.path.join(DATASET_PATH, action)
    np.save(os.path.join(action_dir, str(sequence)), keypoints)

#### Read the raw data and process it using mediapipe

In [34]:
# when True, frames are annotated with the detected landmarks and written out
# as a debug video for every processed clip
image_landmark_debug = False


def save_video_with_landmark(image_list, fps, label, video_num):
    """
    Write the annotated frames of one video to the debug folder as an mp4.

    Args:
        image_list: iterable of `(frame_index, image, keypoints)` tuples; only
            the image of each tuple is used, in the order given.
        fps: frame rate of the written video.
        label: action label, used in the output file name.
        video_num: video index, used in the output file name.

    Returns None. Nothing is written when `image_list` contains no frames.
    """
    images = [image for _, image, _ in image_list]

    # an empty sequence cannot be encoded; skip instead of failing inside moviepy
    if not images:
        print(f"No frames to save for {label} video {video_num}")
        return

    output_dir = "../storage/datasets/debug"
    output_path = f"{output_dir}/{label}_{video_num}.mp4"

    # the encoder does not create missing folders
    os.makedirs(output_dir, exist_ok=True)

    new_clip = ImageSequenceClip(images, fps=fps)

    new_clip.write_videofile(output_path, codec="libx264")

In [35]:
def process_frame(label, video_num, frame, image, flip_frame, is_debug):
    """
    Run hand and pose detection on a single frame.

    Returns `(frame, image, keypoints, elapsed_seconds)`:
    - `image` is the annotated frame when `is_debug` is true, otherwise None;
    - `keypoints` is the array produced by `to_landmark_data`;
    - if anything inside the detection step raises, the error is printed and
      both `image` and `keypoints` are None.
    `label`, `video_num` and `frame` are only used for the error message and
    to tag the result.
    """
    started_at = time.time()

    # mirror the frame horizontally when requested
    if flip_frame:
        image = np.fliplr(image)

    try:
        image = image.astype(np.uint8)

        # wrap the frame for MediaPipe and run both detectors on it
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)
        hand_results = hand_detector.detect(image=mp_image)
        pose_results = pose_detector.detect(image=mp_image)

        # flatten the detections into the dataset representation
        keypoints = to_landmark_data(hand_results, pose_results)

        if is_debug:
            hand, pose = to_drawing_landmark(hand_results, pose_results)
            draw_landmark(image, hand, pose)

    except Exception as e:
        print(f"Error processing {label} video {video_num} frame {frame}: {e}")
        return frame, None, None, time.time() - started_at

    # the frame itself is only handed back when it carries debug drawings
    annotated = image if is_debug else None

    return frame, annotated, keypoints, time.time() - started_at


def process_video(label: str, video_num: int, flip_frame: bool, video_start: int = 0):
    """
    Extract the landmarks of every frame of one raw video and save them.

    Args:
        label: action label; also the sub-folder of the raw and cleaned datasets.
        video_num: index of the raw video (`<video_num>.avi`).
        flip_frame: mirror every frame horizontally before detection.
        video_start: index under which the resulting array is saved.

    Returns:
        - a message string when the video file cannot be opened;
        - None when the number of successfully processed frames differs from
          `frames_per_video` (nothing is saved in that case);
        - otherwise `(timings, label, video_num, frame)`, where `timings` holds
          the per-frame execution times and the time spent sorting/saving, and
          `frame` is the index of whichever frame finished last.
    """
    video_path = os.path.join(DATASET_PATH_RAW, label, f"{video_num}.avi")

    try:
        clip = VideoFileClip(video_path)
    except OSError:
        print(f"Error: Could not open video file {video_path}")
        return f"[{label}] ({video_num}) error opening video file"

    frame_exec_times = []
    results = []

    # the clip holds an open reader on the video file; always release it, since
    # this function runs once per video and per orientation
    try:
        # use ThreadPoolExecutor to process frames concurrently
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_frame = {
                executor.submit(
                    process_frame,
                    label,
                    video_num,
                    frame_index,
                    image,
                    flip_frame,
                    image_landmark_debug,
                ): frame_index
                for frame_index, image in enumerate(clip.iter_frames(fps=clip.fps))
            }

            for future in concurrent.futures.as_completed(future_to_frame):
                frame, image, keypoints, exec_time = future.result()

                # frames whose detection failed come back without keypoints
                if keypoints is not None:
                    results.append((frame, image, keypoints))

                frame_exec_times.append(exec_time)

        start_time = time.time()

        # futures complete in arbitrary order; restore the frame order
        results.sort(key=lambda item: item[0])

        if image_landmark_debug:
            save_video_with_landmark(results, clip.fps, label, video_num)
    finally:
        clip.close()

    # only complete sequences are kept so that every saved array has the same length
    if len(results) != frames_per_video:
        return None

    # combine all landmark sequences into a single numpy array
    keypoints = np.array([landmark for _, _, landmark in results])

    save_cleaned_landmark(label, video_start, keypoints)

    timings = {
        "avg_video_exec": frame_exec_times,
        "save_exec": time.time() - start_time,
    }

    return timings, label, video_num, frame

In [36]:
# one job per (label, video index) pair, grouped by label
actions = [
    dict(action=label, frame=video_index)
    for label in ACTIONS
    for video_index in range(videos_per_label)
]

# filled by the worker threads with one entry per successfully processed video
log = []

In [37]:
# small samples for quick experiments: the first job / the first five jobs of every label
all_first_1 = actions[::videos_per_label]
all_first_5 = [
    actions[start:start + 5] for start in range(0, len(actions), videos_per_label)
]

In [38]:
image_landmark_debug

False

In [39]:
def process_action_frame(label, frame, log):
    """
    Process one raw video twice - as recorded and mirrored - and log the outcome.

    `frame` is the index of the raw video. The unmirrored result is saved under
    the same index; the mirrored one is saved under `videos_per_label + frame`,
    so both variants of a label live side by side without clashing.

    Returns `log` (with the two results appended) when both runs returned a
    truthy result, otherwise None.
    """
    print(f"[{label}]\t{frame}")

    # as recorded
    plain_result = process_video(label, frame, False, frame)

    # mirrored copy, stored after all the unmirrored videos of the label
    mirrored_result = process_video(label, frame, True, videos_per_label + frame)

    if plain_result and mirrored_result:
        log.append([plain_result, mirrored_result])
        return log

    return None


# Process every (label, video) job on a thread pool.
with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_action = {
        executor.submit(process_action_frame, action["action"], action["frame"], log): action
        for action in actions
    }

    for future in concurrent.futures.as_completed(future_to_action):
        # process_action_frame returns the shared `log` on success and None on
        # failure; keep the outcome in its own name so that a failure cannot
        # replace the collected `log` with None
        outcome = future.result()

        if not outcome:
            failed = future_to_action[future]
            print(f"[FAILED] {failed['action']} video {failed['frame']}")

            # stop checking further results; jobs that were already submitted
            # still run to completion when the `with` block exits
            break

[repeat]	0
[repeat]	1
[repeat]	2
[repeat]	3
[repeat]	4
[repeat]	5
[repeat]	6
[repeat]	7
[repeat]	8
[repeat]	9
[repeat]	10
[repeat]	11
[repeat]	12
[repeat]	13
[repeat]	14
[repeat]	15
[repeat]	16
[repeat]	17
[repeat]	18
[repeat]	19
[repeat]	20
[repeat]	21
[repeat]	22
[repeat]	23
[repeat]	24
[repeat]	25
[repeat]	26
[repeat]	27
[repeat]	28
[repeat]	29
[repeat]	30
[repeat]	31
[repeat]	32
[repeat]	33
[repeat]	34
[repeat]	35
[repeat]	36
[repeat]	37
[repeat]	38
[repeat]	39
[repeat]	40
[repeat]	41
[repeat]	42
[repeat]	43
[repeat]	44
[repeat]	45
[repeat]	46
[repeat]	47
[repeat]	48
[repeat]	49
[repeat]	50
[repeat]	51
[repeat]	52
[repeat]	53
[repeat]	54
[repeat]	55
[repeat]	56
[repeat]	57
[repeat]	58
[repeat]	59
[repeat]	60
[repeat]	61
[repeat]	62
[repeat]	63
[repeat]	64
[repeat]	65
[repeat]	66
[repeat]	67
[repeat]	68
[repeat]	69
[repeat]	70
[repeat]	71
[repeat]	72
[repeat]	73
[repeat]	74
[repeat]	75
[repeat]	76
[repeat]	77
[repeat]	78
[repeat]	79
[repeat]	80
[repeat]	81
[repeat]	82
[repeat]	83
[r