In [None]:
import cv2  # type: ignore
import os
import time
import numpy as np  # type: ignore

import mediapipe as mp  # type: ignore

from matplotlib import pyplot as plt # type: ignore
from mediapipe.tasks import python  # type: ignore
from mediapipe.tasks.python import vision  # type: ignore
from mediapipe.framework.formats import landmark_pb2 # type: ignore

In [None]:
# Short aliases for MediaPipe names that the later cells use repeatedly.
drawer = mp.solutions.drawing_utils # drawing utilities (draw_landmarks, DrawingSpec)
VisionRunningMode = mp.tasks.vision.RunningMode  # running-mode enum passed to the landmarker options

Set up the MediaPipe Tasks vision detectors (face, hand and pose landmarkers)

See the [Google documentation](https://ai.google.dev/edge/api/mediapipe/python/mp/tasks/vision) for the MediaPipe Tasks vision API.

In [None]:
# Base options that point each landmarker at its model bundle.
# The paths are relative to the notebook's working directory, so the `.task`
# files are expected to be present under ./tasks/ before the detectors are created.
face_base_options = python.BaseOptions(model_asset_path="./tasks/face_landmarker.task")
hand_base_options = python.BaseOptions(model_asset_path="./tasks/hand_landmarker.task")
pose_base_options = python.BaseOptions(model_asset_path="./tasks/pose_landmarker.task")

In [None]:
# Options for the three landmarkers. All of them run in VIDEO mode, i.e. the
# capture loop feeds them frames through detect_for_video() with a timestamp.
face_options = vision.FaceLandmarkerOptions(
    base_options=face_base_options,
    # NOTE(review): blendshapes and transformation matrices are not read by the
    # keypoint helpers visible in this notebook — confirm whether they are needed.
    output_face_blendshapes=True,
    output_facial_transformation_matrixes=True,
    num_faces=1,  # the keypoint helpers only read the first detected face
    running_mode=VisionRunningMode.VIDEO,
)

hand_options = vision.HandLandmarkerOptions(
    base_options=hand_base_options,
    num_hands=2,  # the dataset layout reserves two hand slots
    running_mode=VisionRunningMode.VIDEO,
)

pose_options = vision.PoseLandmarkerOptions(
    base_options=pose_base_options,
    # NOTE(review): segmentation masks are not read by the keypoint helpers
    # visible in this notebook — confirm whether they are needed.
    output_segmentation_masks=True,
    running_mode=VisionRunningMode.VIDEO,
)


# Build one detector per modality from the options above.
face_detector = vision.FaceLandmarker.create_from_options(face_options)
hand_detector = vision.HandLandmarker.create_from_options(hand_options)
pose_detector = vision.PoseLandmarker.create_from_options(pose_options)

Set up helpers that convert the Tasks vision results into landmark lists that can be drawn on OpenCV (cv2) images

In [None]:
# Protobuf message types from landmark_pb2, aliased for brevity. The drawing
# helpers below build these messages from the detector results.
LandmarkList = landmark_pb2.NormalizedLandmarkList # alias
NormalizedLandmark = landmark_pb2.NormalizedLandmark # alias


def create_landmark_list(landmarks, num_keypoints):
    """Build a LandmarkList from detected landmarks, or a zero-filled placeholder.

    Args:
        landmarks: Sequence of landmark objects exposing ``x``, ``y`` and ``z``
            attributes. Any falsy value (``None`` or an empty sequence) is
            treated as "nothing detected".
        num_keypoints: Number of zeroed landmarks to emit when ``landmarks``
            is falsy. This is a landmark count, not a flattened coordinate count.

    Returns:
        A LandmarkList holding either a copy of the given landmarks (only the
        x, y and z fields are copied) or ``num_keypoints`` landmarks whose
        coordinates are all 0.0.

    """

    if landmarks:
        # copy the detected coordinates into protobuf landmarks
        converted = [NormalizedLandmark(x=lm.x, y=lm.y, z=lm.z) for lm in landmarks]
    else:
        # The placeholder is only built when it is actually needed; this function
        # is called several times per frame, so building it eagerly on every call
        # was wasted work.
        converted = [
            NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(num_keypoints)
        ]

    return LandmarkList(landmark=converted)


def extract_keypoints_for_drawing(face_results, pose_results, hand_results):
    """Convert face, pose and hand detection results into LandmarkLists for drawing.

    When a modality has no detection, a zero-filled placeholder with that model's
    landmark count is produced instead (presumably so that the connection indices
    used while drawing remain valid — confirm against the drawing utilities).

    Args:
        face_results: Object containing face landmark detection results
            (``face_landmarks`` attribute; only the first face is used).
        pose_results: Object containing pose landmark detection results
            (``pose_landmarks`` attribute; only the first pose is used).
        hand_results: Object containing hand landmark detection results
            (``hand_landmarks`` attribute; every detected hand is used).

    Returns:
        A tuple ``(face_landmarks, pose_landmarks, hand_landmarks)``. The first
        two items are LandmarkList messages. The third is a list of LandmarkList
        messages: one per detected hand, or two placeholders when no hand was
        detected.

    """

    # Landmark counts per model. create_landmark_list() expects the number of
    # landmarks, not the flattened coordinate count (478 * 3, 33 * 4, 21 * 3)
    # that is used for the dataset arrays.
    num_face_landmarks = 478
    num_pose_landmarks = 33
    num_hand_landmarks = 21

    # convert face landmarks to LandmarkList, using empty values if no landmarks are present
    face_landmarks_proto = create_landmark_list(
        face_results.face_landmarks[0] if face_results.face_landmarks else None,
        num_face_landmarks,
    )

    # convert pose landmarks to LandmarkList, using empty values if no landmarks are present
    pose_landmarks_proto = create_landmark_list(
        pose_results.pose_landmarks[0] if pose_results.pose_landmarks else None,
        num_pose_landmarks,
    )

    # convert hand landmarks to LandmarkList; without detections fall back to
    # two placeholder lists (one per hand)
    detected_hands = hand_results.hand_landmarks or [None, None]
    hand_landmarks_proto = [
        create_landmark_list(hand_landmarks, num_hand_landmarks)
        for hand_landmarks in detected_hands
    ]

    return face_landmarks_proto, pose_landmarks_proto, hand_landmarks_proto

In [None]:
def draw_detection_landmark(
    image,
    face_landmarks_proto=None,
    pose_landmarks_proto=None,
    hand_landmarks_proto=None,
):
    """Draw face, pose and hand landmarks onto ``image`` in place.

    Args:
        image: Image array that the landmarks are drawn on (modified in place).
        face_landmarks_proto: LandmarkList for the face, or ``None`` to skip it.
        pose_landmarks_proto: LandmarkList for the pose, or ``None`` to skip it.
        hand_landmarks_proto: Iterable of LandmarkLists, one per hand, or
            ``None`` to skip hands.

    Returns:
        None.
    """
    # draw landmark face
    if face_landmarks_proto is not None:
        drawer.draw_landmarks(
            image,
            face_landmarks_proto,
            mp.solutions.face_mesh.FACEMESH_CONTOURS,
            drawer.DrawingSpec(color=(80, 60, 20), thickness=1, circle_radius=1),
            drawer.DrawingSpec(color=(80, 146, 241), thickness=1, circle_radius=1),
        )

    # draw landmark pose
    if pose_landmarks_proto is not None:
        drawer.draw_landmarks(
            image,
            pose_landmarks_proto,
            mp.solutions.pose.POSE_CONNECTIONS,
            drawer.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=3),
            drawer.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2),
        )

    # Draw every hand that was passed in. `or []` keeps the documented default
    # (None) from crashing, which `len(None)` did before.
    for hand_proto in hand_landmarks_proto or []:
        drawer.draw_landmarks(
            image,
            hand_proto,
            mp.solutions.hands.HAND_CONNECTIONS,
            drawer.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=2),
            drawer.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2),
        )

Set up the keypoint extractor that turns the Tasks vision results into a NumPy array, which is saved as `.npy` files for the dataset

In [None]:
def extract_keypoints_for_dataset(face_results, pose_results, hand_results):
    """Flatten face, pose and hand detections into one fixed-length feature vector.

    Missing modalities (a ``None`` result object or an empty landmark list) are
    represented by zeros, so the returned vector always has the same length.

    Args:
      face_results: Object with a `face_landmarks` attribute (list of faces, each
                    a list of landmarks with x, y, z), or None.
      pose_results: Object with a `pose_landmarks` attribute (list of poses, each
                    a list of landmarks with x, y, z, visibility), or None.
      hand_results: Object with `hand_landmarks` and `handedness` attributes,
                    or None.

    Returns:
      A single 1-D NumPy array made of, in this order:
        * 478 * 3 face values (x, y, z per landmark, first face only),
        * 33 * 4 pose values (x, y, z, visibility per landmark, first pose only),
        * 2 * 21 * 3 hand values (two hand slots, x, y, z per landmark).
    """
    num_face_landmarks = 478
    num_pose_landmarks = 33
    num_hand_landmarks = 21

    # face: first detected face, or zeros when nothing was detected
    face_landmarks = getattr(face_results, "face_landmarks", None)
    if face_landmarks:
        face_keypoints = np.array(
            [[lm.x, lm.y, lm.z] for lm in face_landmarks[0]]
        ).flatten()
    else:
        face_keypoints = np.zeros(num_face_landmarks * 3)

    # pose: first detected pose, or zeros when nothing was detected
    pose_landmarks = getattr(pose_results, "pose_landmarks", None)
    if pose_landmarks:
        pose_keypoints = np.array(
            [[lm.x, lm.y, lm.z, lm.visibility] for lm in pose_landmarks[0]]
        ).flatten()
    else:
        pose_keypoints = np.zeros(num_pose_landmarks * 4)

    # hands: two slots that stay zero unless a detected hand fills them
    hand_keypoints = np.zeros((2, num_hand_landmarks, 3))
    hand_landmarks = getattr(hand_results, "hand_landmarks", None) or []
    handedness = getattr(hand_results, "handedness", None) or []

    for landmarks, categories in zip(hand_landmarks, handedness):
        # The slot is the index of the top handedness category. If two detections
        # report the same index, the later one overwrites the earlier one.
        slot = categories[0].index
        hand_keypoints[slot] = np.array([[lm.x, lm.y, lm.z] for lm in landmarks])

    return np.concatenate([face_keypoints, pose_keypoints, hand_keypoints.flatten()])

Functions to calculate the FPS and draw it on the frame

In [None]:
def calculate_fps(start_time, frames):
    """Return the average frames per second since ``start_time``.

    Args:
        start_time: Reference point as returned by ``time.time()``.
        frames: Number of frames processed since ``start_time``.

    Returns:
        ``frames`` divided by the elapsed seconds, or 0 when no time has
        elapsed (or the elapsed time is negative).
    """
    seconds_passed = time.time() - start_time
    if seconds_passed > 0:
        return frames / seconds_passed
    return 0


def draw_fps(image, fps):
    """Write the FPS value, rounded to two decimals, near the top-left corner of ``image``.

    The text is drawn in place with ``cv2.putText``; nothing is returned.
    """
    label = f"FPS: {round(fps, 2)}"
    origin = (10, 40)
    font_scale = 1.5
    color = (0, 255, 0)
    thickness = 2

    cv2.putText(
        image, label, origin, cv2.FONT_HERSHEY_SIMPLEX, font_scale, color, thickness
    )

Setting up for data collection

In [None]:
# root folder that receives the saved keypoint arrays (.npy)
DATA_PATH = "../datasets"

# Full sign vocabulary; only the slice [3:6] is collected in this run.
# Adjust the slice to record a different subset of signs.
ACTIONS = np.array(
    [
        "hello", "thanks", "i-love-you", "see-you-later",
        "I", "Father", "Mother", "Yes",
        "No", "Help", "Please", "Want",
        "What", "Again", "Eat", "Milk",
        "More", "Go To", "Bathroom", "Fine",
        "Like", "Learn", "Sign", "Done",
    ]
)[3:6]

# number of videos (sequence folders) recorded for every label
videos_per_label = 60

# number of frames recorded for every video; the capture loop reads one
# camera frame and saves one keypoint array per step
action_per_video = 30

In [None]:
# create dataset folder

try:
    os.makedirs(DATA_PATH)
except FileExistsError:
    # Only an already existing folder is expected here. Any other OS error
    # (permissions, invalid path, ...) now surfaces instead of being reported
    # as "exists".
    print("Dataset Folder Exists, skip creating new one")

In [None]:
ACTIONS # show which actions will be collected for the dataset

In [None]:
# NOTE: uncomment this code if you want to keep adding more videos per action;
#       the more videos there are per action, the better the model will train

# # create the folders to store the data for action per video (continuous)
# for action in ACTIONS:
#     dirmax = np.max(np.array(os.listdir(os.path.join(DATA_PATH, action))).astype(int), initial=0) + 1

#     for sequence in range(videos_per_label):
#         try:
#             os.makedirs(os.path.join(DATA_PATH, action, str(sequence + dirmax)))
#             print("Folder created : ", action, " - ", str(sequence + dirmax))
#         except:
#             print("skip", action, " - ", str(sequence + dirmax))

In [None]:
# create the folders to store the data for action per video
for action in ACTIONS:
    for sequence in range(videos_per_label):
        try:
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
            print("Folder created : ", action, " - ", str(sequence))
        except FileExistsError:
            # Only skip folders that already exist; other OS errors (permissions,
            # invalid names, ...) are raised instead of being silently skipped.
            print("skip", action, " - ", str(sequence))

In [None]:
# Next free sequence number for the first action: every entry in its folder is
# parsed as an integer, so the folder must contain only numerically named entries.
first_action_dir = os.path.join(DATA_PATH, ACTIONS[0])
existing_sequences = np.array(os.listdir(first_action_dir)).astype(int)
folder_count = np.max(existing_sequences) + 1

# First folder of the most recent batch of `videos_per_label` folders ...
start_folder = folder_count - videos_per_label
# ... which is overridden right away, so collection always starts at folder 0.
start_folder = 0

print(start_folder)

In [None]:
def save_keypoint_as_np(action, sequence, action_length, keypoints):
    """Save one frame's keypoints below DATA_PATH/<action>/<sequence>/.

    Args:
        action: Label name, used as the first folder level.
        sequence: Video number, used as the second folder level.
        action_length: Frame index inside the video, used as the file name.
        keypoints: Array that is handed to ``np.save``.
    """
    sequence_dir = os.path.join(DATA_PATH, action, str(sequence))
    np.save(os.path.join(sequence_dir, str(action_length)), keypoints)

### CAPTURING THE DATA (OpenCV)

In [None]:
def head(img):
    """Draw the "STARTING COLLECTION" banner on ``img``.

    Args:
        img: Image array that is drawn on in place.

    Returns:
        Whatever ``cv2.putText`` returns for the annotated image.
    """
    return cv2.putText(
        img,
        "STARTING COLLECTION",
        (120, 200),
        cv2.FONT_HERSHEY_SIMPLEX,
        1,
        (0, 255, 0),
        4,
        cv2.LINE_AA,
    )

def subh(img, act, seq):
    """Draw the "Collecting frames for <act> Video Number <seq>" status line on ``img``.

    Args:
        img: Image array that is drawn on in place.
        act: Name of the action currently being recorded.
        seq: Number of the video (sequence) currently being recorded.

    Returns:
        Whatever ``cv2.putText`` returns for the annotated image.
    """
    return cv2.putText(
        img,
        "Collecting frames for {} Video Number {}".format(act, seq),
        (15, 12),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.5,
        (0, 0, 255),
        1,
        cv2.LINE_AA,
    )

In [None]:
cap = cv2.VideoCapture(0)

# set capture properties
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 480)  # set width to 480 pixels
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)  # set height to 480 pixels
cap.set(cv2.CAP_PROP_FPS, 60)  # set frame rate to 60 FPS

start_time = time.time()
frames = 0
isQuit = False  # set when the user presses "q" or the capture cannot continue

summed = 0  # not used by this cell
total = 3 * 60 * 30 # 3 actions * 60 videos * 30 frames = 5,400 frames; not used by this cell

# give up after this many consecutive failed reads instead of retrying forever
max_failed_reads = 30

# last timestamp handed to the detectors; each new one must be strictly larger
last_timestamp_ms = -1

# NOTE (AHMAD): the average FPS is around 12; maybe it is just a device issue.
# NOTE(review): the FPS average also includes the 1250 ms pause at the start of
#               every video, and three detectors run back to back on every
#               frame — presumably both contribute; confirm by profiling.
# TODO: restructure this cell into smaller functions
try:
    if not cap.isOpened():
        print("Could not open the camera, nothing was collected.")
        isQuit = True

    for action in ACTIONS:
        if isQuit:
            break

        for sequence in range(start_folder, (start_folder + videos_per_label)):
            if isQuit:
                break

            for action_length in range(action_per_video):
                # Retry failed reads instead of skipping the frame index, so every
                # index of the sequence gets a saved array (skipping left holes
                # in the dataset).
                success, image = cap.read()
                failed_reads = 0
                while not success and failed_reads < max_failed_reads:
                    print("Ignoring empty camera frame.")
                    failed_reads += 1
                    success, image = cap.read()

                if not success:
                    print("Camera stopped delivering frames, aborting collection.")
                    isQuit = True
                    break

                # NOTE: flipping the image would change the keypoint data
                #       that is used for training the model later
                # image = cv2.flip(image, 1) # flip the image horizontally for a selfie-view display.

                image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

                # Timestamp in milliseconds from the monotonic clock, forced to be
                # strictly increasing. CAP_PROP_POS_MSEC is not used because its
                # value for live cameras depends on the capture backend.
                timestamp_ms = max(
                    int(time.monotonic() * 1000), last_timestamp_ms + 1
                )
                last_timestamp_ms = timestamp_ms

                # convert cv image to mediapipe image format before being
                # passed to face, pose and hand detector
                annotated_image = mp.Image(
                    image_format=mp.ImageFormat.SRGB, data=image_rgb
                )

                # every detector is a separate object, so all of them can be
                # given the same timestamp for this frame
                face_results = face_detector.detect_for_video(
                    image=annotated_image, timestamp_ms=timestamp_ms
                )

                hand_results = hand_detector.detect_for_video(
                    image=annotated_image, timestamp_ms=timestamp_ms
                )

                pose_results = pose_detector.detect_for_video(
                    image=annotated_image, timestamp_ms=timestamp_ms
                )

                frames += 1
                fps = calculate_fps(start_time, frames)

                draw_fps(image_rgb, fps)

                face_proto, pose_proto, hand_proto = extract_keypoints_for_drawing(
                    face_results, pose_results, hand_results
                )

                draw_detection_landmark(
                    image_rgb,
                    face_landmarks_proto=face_proto,
                    pose_landmarks_proto=pose_proto,
                    hand_landmarks_proto=hand_proto,
                )

                # the first frame of every video also gets the banner and a
                # pause, so the signer can get ready
                is_first_frame = action_length == 0
                if is_first_frame:
                    head(image_rgb)
                subh(image_rgb, action, sequence)

                # the frame is RGB at this point; OpenCV displays BGR
                cv2.imshow(
                    "MediaPipe Detection",
                    cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR),
                )

                # remember a key pressed during the pause so "q" is not lost
                pause_key = cv2.waitKey(1250) if is_first_frame else -1

                keypoints = extract_keypoints_for_dataset(
                    face_results, pose_results, hand_results
                )

                save_keypoint_as_np(action, sequence, action_length, keypoints)

                quit_key = ord("q")
                if pause_key & 0xFF == quit_key or cv2.waitKey(10) & 0xFF == quit_key:
                    isQuit = True
                    break
finally:
    # always free the camera and close the window, even when a detector raises
    cap.release()
    cv2.destroyAllWindows()