In [42]:
import concurrent.futures
import os
import shutil
import time

import mediapipe as mp
import numpy as np
from mediapipe.framework.formats import landmark_pb2
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from moviepy.editor import VideoFileClip

#### Setting up the MediaPipe vision tasks

Documentation for the hand landmarker:
- https://ai.google.dev/edge/mediapipe/solutions/vision/hand_landmarker/python#configuration_options
- https://github.com/google-ai-edge/mediapipe-samples/blob/main/examples/hand_landmarker/python/hand_landmarker.ipynb

Documentation for the pose landmarker:
- https://ai.google.dev/edge/mediapipe/solutions/vision/pose_landmarker#configurations_options
- https://github.com/google-ai-edge/mediapipe-samples/blob/main/examples/pose_landmarker/python/%5BMediaPipe_Python_Tasks%5D_Pose_Landmarker.ipynb

In [43]:
# shared MediaPipe helpers: drawing utilities and the running-mode enum
drawer = mp.solutions.drawing_utils
VisionRunningMode = mp.tasks.vision.RunningMode

# each landmarker loads its model bundle from the local ./tasks directory
hand_base_options = python.BaseOptions(model_asset_path="./tasks/hand_landmarker.task")
pose_base_options = python.BaseOptions(model_asset_path="./tasks/pose_landmarker.task")

# hand landmarker configuration: up to two hands, single-image running mode
hand_options = vision.HandLandmarkerOptions(
    running_mode=VisionRunningMode.IMAGE,
    base_options=hand_base_options,
    num_hands=2,
    min_hand_detection_confidence=0.8,
    min_hand_presence_confidence=0.9,
    min_tracking_confidence=0.8,
)

# pose landmarker configuration: single-image running mode, segmentation masks requested
pose_options = vision.PoseLandmarkerOptions(
    running_mode=VisionRunningMode.IMAGE,
    base_options=pose_base_options,
    output_segmentation_masks=True,
    min_pose_detection_confidence=0.95,
    min_pose_presence_confidence=0.95,
    min_tracking_confidence=0.95,
)

# build the two detectors used by the rest of the notebook
hand_detector = vision.HandLandmarker.create_from_options(hand_options)
pose_detector = vision.PoseLandmarker.create_from_options(pose_options)

#### Setting up for dataset preprocessing

Reference for how to perform the signs: [YouTube video](https://www.youtube.com/watch?v=0FcwzMq4iWg)

In [44]:
# DATASET_PATH receives the preprocessed landmark arrays;
# DATASET_PATH_RAW holds the raw recordings (videos) that are read as input
DATASET_PATH = "../storage/datasets/cleaned"
DATASET_PATH_RAW = "../storage/datasets/raw"

# action labels, limited to the first 8 for preprocessing
# NOTE: change the slice bound to the number of dataset labels (if changed)
ACTIONS = [
    "_",
    "hello",
    "thanks",
    "i-love-you",
    "I",
    "Yes",
    "No",
    "Help",
    "Please",
    "Want",
    "Eat",
    "More",
    "Bathroom",
    "Learn",
    "Sign",
][:8]

# number of raw videos per label (change it to how many videos you have)
videos_per_label = 120
# number of frames expected in each video
frames_per_video = 60

In [45]:
# Recreate the cleaned-dataset directory tree from scratch.
# NOTE: this deletes everything previously stored under DATASET_PATH.
try:
    shutil.rmtree(DATASET_PATH)
except FileNotFoundError:
    # first run: there is nothing to remove
    pass
except OSError as error:
    # keep going on a best-effort basis, but do not hide the problem
    print(f"[WARNING] could not fully remove {DATASET_PATH}: {error}")

# exist_ok keeps the cell re-runnable even when the removal above was incomplete
os.makedirs(DATASET_PATH, exist_ok=True)

for action in ACTIONS:
    os.makedirs(os.path.join(DATASET_PATH, action), exist_ok=True)

    print(f"[CREATED] {action}")

[CREATED] _
[CREATED] hello
[CREATED] thanks
[CREATED] i-love-you
[CREATED] I
[CREATED] Yes
[CREATED] No
[CREATED] Help


#### Setting up extraction of the MediaPipe landmarks

##### Landmarker (drawing)

In [46]:
# short aliases for the protobuf landmark message types
NormalizedLandmark = landmark_pb2.NormalizedLandmark
LandmarkList = landmark_pb2.NormalizedLandmarkList


def to_landmark_list(landmarks):
    """
    Build a NormalizedLandmarkList holding a copy of the x, y and z of every given landmark.

    `landmarks` is any iterable of objects exposing `x`, `y` and `z` attributes.
    """
    copied = []
    for point in landmarks:
        copied.append(NormalizedLandmark(x=point.x, y=point.y, z=point.z))

    return LandmarkList(landmark=copied)


# Placeholder landmark lists used for drawing when nothing was detected.
# A landmark list holds one entry per landmark (33 for the pose model, 21 per hand),
# not one entry per flattened x/y/z coordinate.
empty_pose_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(33)]
)

empty_hand_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(21)]
)


def to_drawing_landmark(hand_results, pose_results):
    """
    Convert pose and hand detection results to landmark lists for drawing.

    Returns a `(hand_landmarks, pose_landmarks)` tuple in that order on every path:
    - `hand_landmarks` is a two-item list with one landmark list per handedness
      category index (undetected hands keep the empty placeholder), or `None`
      when `hand_results` is falsy.
    - `pose_landmarks` is the first detected pose, or the empty placeholder when
      no pose was detected.
    """

    pose_landmarks = (
        to_landmark_list(pose_results.pose_landmarks[0])
        if pose_results.pose_landmarks
        else empty_pose_landmarks
    )

    if not hand_results:
        # keep the same (hands, pose) ordering as the normal return below
        return None, pose_landmarks

    hand_landmarks = [empty_hand_landmarks, empty_hand_landmarks]

    # iterate over the detected hand landmarks
    for index, hand_landmark in enumerate(hand_results.hand_landmarks):
        # the handedness category index selects which of the two slots this hand fills
        handedness = hand_results.handedness[index][0].index

        # convert the current hand and store it in its slot
        hand_landmarks[handedness] = to_landmark_list(hand_landmark)

    return hand_landmarks, pose_landmarks


def draw_landmark(image, hand_landmarks, pose_landmarks):
    """
    Render the pose skeleton and, when any are given, each hand skeleton onto the image.

    `hand_landmarks` may be falsy (for example `None`), in which case only the pose is drawn.
    """
    pose_point_style = drawer.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=3)
    pose_line_style = drawer.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2)
    drawer.draw_landmarks(
        image,
        pose_landmarks,
        mp.solutions.pose.POSE_CONNECTIONS,
        pose_point_style,
        pose_line_style,
    )

    # a falsy value means there are no hands to draw
    for single_hand in hand_landmarks or ():
        hand_point_style = drawer.DrawingSpec(
            color=(121, 22, 76), thickness=2, circle_radius=2
        )
        hand_line_style = drawer.DrawingSpec(
            color=(121, 44, 250), thickness=2, circle_radius=2
        )
        drawer.draw_landmarks(
            image,
            single_hand,
            mp.solutions.hands.HAND_CONNECTIONS,
            hand_point_style,
            hand_line_style,
        )

##### Landmarker (dataset)

In [47]:
# all-zero templates used when nothing is detected; they must never be written to
empty_hand_landmark = np.zeros((2, 21, 3))  # one (21, 3) slot per handedness category index
empty_pose_landmark = np.zeros(33 * 3)


def to_landmark_data(
    hand_results: "vision.HandLandmarkerResult",
    pose_results: "vision.PoseLandmarkerResult",
):
    """
    Extract keypoints from pose and hand results for dataset creation.

    Returns a flat array: the x/y/z of the first pose's world landmarks followed by
    the x/y/z of the two hand slots. Anything that was not detected stays zero.
    """
    pose_landmark = empty_pose_landmark
    # work on a private copy: assigning into the shared template would leak this
    # frame's hands into every later frame (and race between worker threads)
    hand_landmark = empty_hand_landmark.copy()

    if pose_results.pose_world_landmarks:
        pose_landmark = np.array(
            [[lm.x, lm.y, lm.z] for lm in pose_results.pose_world_landmarks[0]]
        ).flatten()

    # if no hand results are available, return the empty hand keypoints
    # concatenated after the pose keypoints
    if not hand_results:
        return np.concatenate([pose_landmark, hand_landmark.flatten()])

    # iterate over the detected hand landmarks
    for index, hlm in enumerate(hand_results.hand_world_landmarks):
        # the handedness category index selects which of the two slots this hand fills
        handedness = hand_results.handedness[index][0].index

        # extract the keypoints for the current hand and store them in its slot
        hand_landmark[handedness] = np.array([[lm.x, lm.y, lm.z] for lm in hlm])

    return np.concatenate([pose_landmark, hand_landmark.flatten()])

##### Saving the landmarker data

In [48]:
def save_cleaned_landmark(action: str, sequence: int, keypoints: np.ndarray):
    """
    Save `keypoints` with `np.save` under DATASET_PATH/<action>/<sequence>.
    """
    action_dir = os.path.join(DATASET_PATH, action)
    target = os.path.join(action_dir, str(sequence))
    np.save(target, keypoints)

#### Read the raw data and process it using mediapipe

In [49]:
def process_frame(label, video_num, frame, image, flip_frame, timestamp_ms):
    """
    Run hand and pose detection on one frame and turn the results into keypoints.

    Returns `(frame, keypoints, elapsed_seconds)`. `keypoints` is `None` when any
    step of the detection raised; the error is printed rather than propagated.
    `timestamp_ms` is accepted but not used by this function.
    """
    # wall-clock start, used for performance tracking
    started_at = time.time()

    # mirror the image horizontally when requested
    if flip_frame:
        image = np.fliplr(image)

    keypoints = None
    try:
        pixels = image.astype(np.uint8)

        # wrap the pixels in the MediaPipe image format
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=pixels)

        # detect hands first, then pose, on the same image
        hand_results = hand_detector.detect(image=mp_image)
        pose_results = pose_detector.detect(image=mp_image)

        # flatten both results into one keypoint array
        keypoints = to_landmark_data(hand_results, pose_results)
    except Exception as e:
        print(f"Error processing {label} video {video_num} frame {frame}: {e}")

    return frame, keypoints, time.time() - started_at


def process_video(label: str, video_num: int, flip_frame: bool, video_start: int = 0):
    """
    Extract landmarks from every frame of one raw video and save them as one array.

    Args:
        label: action label; also the sub-directory name of the raw and cleaned data.
        video_num: number of the raw video, read from `<DATASET_PATH_RAW>/<label>/<video_num>.avi`.
        flip_frame: whether every frame is mirrored horizontally before detection.
        video_start: sequence number under which the cleaned array is saved.

    Returns:
        - `(timings, label, video_num, frame)` when exactly `frames_per_video` frames
          produced landmarks and the array was saved. `timings` is a dict with the
          per-frame execution times and the time spent sorting and saving; `frame`
          is the index of whichever frame finished last.
        - `None` when a different number of frames produced landmarks (nothing is saved).
        - an error string when the video file could not be opened. Callers that only
          test truthiness will treat this as a success.
    """
    video_path = os.path.join(DATASET_PATH_RAW, label, f"{video_num}.avi")

    try:
        clip = VideoFileClip(video_path)
    except OSError:
        print(f"Error: Could not open video file {video_path}")
        return f"[{label}] ({video_num}) error opening video file"

    frame_exec_times = []
    results = []

    try:
        # use ThreadPoolExecutor to process frames concurrently
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_frame = {
                executor.submit(
                    process_frame,
                    label,
                    video_num,
                    frame,
                    image,
                    flip_frame,
                    1000 * frame / clip.fps,  # frame timestamp in milliseconds
                ): frame
                for frame, image in enumerate(clip.iter_frames(fps=clip.fps))
            }

            for future in concurrent.futures.as_completed(future_to_frame):
                frame, keypoints, exec_time = future.result()

                if keypoints is not None:
                    results.append((frame, keypoints))

                frame_exec_times.append(exec_time)
    finally:
        # always release the reader held by the clip
        clip.close()

    start_time = time.time()
    # frames complete in arbitrary order; restore the original frame order
    results.sort(key=lambda x: x[0])

    if len(results) != frames_per_video:
        print(
            f"[{label}] ({video_num}) skipped: {len(results)} of "
            f"{frames_per_video} expected frames produced landmarks"
        )
        return None

    # combine all landmark sequences into a single numpy array
    keypoints = np.array([landmark for _, landmark in results])

    save_cleaned_landmark(label, video_start, keypoints)

    timings = {
        "avg_video_exec": frame_exec_times,
        "save_exec": time.time() - start_time,
    }

    return timings, label, video_num, frame

In [50]:
# one work item per (label, video) pair, grouped by label;
# the "frame" key holds the video index within the label
actions = [
    {"action": word, "frame": i} for word in ACTIONS for i in range(videos_per_label)
]

# shared list that collects the results of every fully processed video
log = []

# small subsets for quick trial runs: the first 5 (or first 1) videos of each of the
# first four labels. Offsets follow videos_per_label so each slice starts on a label.
all_first_5 = [
    item
    for label_index in range(4)
    for item in actions[
        label_index * videos_per_label : label_index * videos_per_label + 5
    ]
]
all_first_1 = [
    item
    for label_index in range(4)
    for item in actions[
        label_index * videos_per_label : label_index * videos_per_label + 1
    ]
]


def process_action_frame(label, frame, log):
    """
    Process one raw video twice (as recorded and mirrored) and record both results.

    Args:
        label: action label of the video.
        frame: index of the video within the label (despite the name, not a frame index).
        log: list that receives `[result1, result2]` when both runs returned a truthy result.

    Returns:
        `log` after appending, or `None` when either run returned a falsy result.
    """
    print(f"[{label}]\t{frame}")
    result1 = process_video(label, frame, False, frame)  # without flip

    # mirrored copies are numbered after all the originals, so each action ends up
    # with 2 * videos_per_label sequences and the two sets never overwrite each other
    result2 = process_video(label, frame, True, videos_per_label + frame)  # with flipped image

    if not result1 or not result2:
        return None

    log.append([result1, result2])

    return log


# work items whose original or mirrored run did not produce a result
failed_actions = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_action = {
        executor.submit(process_action_frame, action["action"], action["frame"], log): action
        for action in actions
    }

    for future in concurrent.futures.as_completed(future_to_action):
        # Workers append to `log` themselves. Do not rebind it to the return value:
        # a single failed video returns None and would discard everything collected.
        # Keep consuming every future so exceptions raised in workers are surfaced.
        if not future.result():
            failed_actions.append(future_to_action[future])

if failed_actions:
    print(f"[WARNING] {len(failed_actions)} video(s) could not be fully processed")

[_]	0
[_]	1
[_]	2
[_]	3
[_]	4
[_]	5
[_]	6
[_]	7
[_]	8
[_]	9
[_]	10
[_]	11
[_]	12
[_]	13
[_]	14
[_]	15
[_]	16
[_]	17
[_]	18
[_]	19
[_]	20
[_]	21
[_]	22
[_]	23
[_]	24
[_]	25
[_]	26
[_]	27
[_]	28
[_]	29
[_]	30
[_]	31
[_]	32
[_]	33
[_]	34
[_]	35
[_]	36
[_]	37
[_]	38
[_]	39
[_]	40
[_]	41
[_]	42
[_]	43
[_]	44
[_]	45
[_]	46
[_]	47
[_]	48
[_]	49
[_]	50
[_]	51
[_]	52
[_]	53
[_]	54
[_]	55
[_]	56
[_]	57
[_]	58
[_]	59
[_]	60
[_]	61
[_]	62
[_]	63
[_]	64
[_]	65
[_]	66
[_]	67
[_]	68
[_]	69
[_]	70
[_]	71
[_]	72
[_]	73
[_]	74
[_]	75
[_]	76
[_]	77
[_]	78
[_]	79
[_]	80
[_]	81
[_]	82
[_]	83
[_]	84
[_]	85
[_]	86
[_]	87
[_]	88
[_]	89
[_]	90
[_]	91
[_]	92
[_]	93
[_]	94
[_]	95
[_]	96
[_]	97
[_]	98
[_]	99
[_]	100
[_]	101
[_]	102
[_]	103
[_]	104
[_]	105
[_]	106
[_]	107
[_]	108
[_]	109
[_]	110
[_]	111
[_]	112
[_]	113
[_]	114
[_]	115
[_]	116
[_]	117
[_]	118
[_]	119
[hello]	0
[hello]	1
[hello]	2
[hello]	3
[hello]	4
[hello]	5
[hello]	6
[hello]	7
[hello]	8
[hello]	9
[hello]	10
[hello]	11
[hello]	12
[hello]	13
[hello

In [20]:
# NEW: about 14 minutes
# OLD: about 24 minutes