In [34]:
import os
import shutil
import time

import cv2
import numpy as np

#### Setting up for data collection

Reference video showing how to perform each sign: [YouTube](https://www.youtube.com/watch?v=0FcwzMq4iWg)

In [35]:
# root folder that receives one sub-folder of raw clips per action
DATASET_PATH = os.path.join("../storage/datasets/raw")

# labels to record; each entry is also used as the folder name of its clips
ACTIONS = [
    "_",
    "hello",
    "what's up",
    "how",
    "thanks",
    "you",
    "morning",
    "afternoon",
    "night",
    "me",
    "name",
    "fine",
    "happy",
    "yes",
    "no",
    "repeat",
    "please",
    "want",
    "good bye",
    "learn",
]

# recording budget: clips recorded per label, and frames stored in each clip
videos_per_label = 60
frames_per_video = 60

In [36]:
# Reset the dataset folder, then recreate the folder tree from scratch.
# WARNING: this permanently deletes every previously recorded clip under
# DATASET_PATH each time the cell is run.
try:
    shutil.rmtree(DATASET_PATH)
except FileNotFoundError:
    # first run: there is nothing to delete yet
    pass
except OSError as error:
    # stay best-effort (the cell keeps going), but say why the cleanup failed
    # instead of silently swallowing the error
    print(f"Could not remove {DATASET_PATH}: {error}")

try:
    os.makedirs(DATASET_PATH)
except FileExistsError:
    # only reachable when the removal above did not succeed
    print("Dataset folder exists, skipping creation.")

# create one directory per action
for action in ACTIONS:
    try:
        os.makedirs(os.path.join(DATASET_PATH, action))
        print(f"[CREATED] {action}")
    except FileExistsError:
        print(f"[SKIPPED] {action}")

[CREATED] hello


##### Saving the landmarker data

In [37]:
def video_path(action, sequence):
    """Return the path of the ``.avi`` file for clip ``sequence`` of ``action``.

    The file lives in the action's own folder under ``DATASET_PATH`` and is
    named after the sequence number.
    """
    file_name = f"{sequence}.avi"
    action_dir = os.path.join(DATASET_PATH, action)
    return os.path.join(action_dir, file_name)

#### Recording the dataset

In [38]:
def display_starting_text(
    img,
):
    """Write the "STARTING COLLECTION" banner onto ``img``.

    The text is drawn by ``cv2.putText`` directly on the array that is passed
    in; nothing is returned.
    """
    cv2.putText(
        img=img,
        text="STARTING COLLECTION",
        org=(120, 200),
        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
        fontScale=1,
        color=(0, 255, 0),
        thickness=4,
        lineType=cv2.LINE_AA,
    )


def display_collecting_text(img, act, seq, pos=(15, 12)):
    """Write a one-line status message onto ``img``.

    Args:
        img: image array that ``cv2.putText`` draws on.
        act: action label shown in the message.
        seq: video number shown in the message.
        pos: origin of the text passed to ``cv2.putText``; defaults to ``(15, 12)``.
    """
    message = f"Collecting frames for {act} Video Number {seq}"
    cv2.putText(
        img=img,
        text=message,
        org=pos,
        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
        fontScale=0.5,
        color=(0, 0, 255),
        thickness=1,
        lineType=cv2.LINE_AA,
    )

In [39]:
# capture video from webcam (device index 0)
cap = cv2.VideoCapture(0)
# request 640x480 at 60 fps; the recording cell creates its VideoWriter with
# the same size and frame rate
# NOTE(review): these calls only request the values - confirm what the camera
#               actually delivers before relying on them
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
cap.set(cv2.CAP_PROP_FPS, 60)

# NOTE(review): start_time and frames are not referenced by the cells visible
#               here - confirm whether they are still needed
start_time = time.time()
frames = 0
# set to True by the recording loop once the 'q' key is pressed
is_quit = False

# define the codec for VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*"XVID")

##### For Debugging

In [40]:
import mediapipe as mp
from mediapipe.framework.formats import landmark_pb2
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

# Debug-only setup: builds the MediaPipe hand and pose landmarkers that the
# recording cell uses to overlay landmarks on the preview when
# `debug_landmark` is enabled.
drawer = mp.solutions.drawing_utils  # drawing utilities used by draw_landmark
VisionRunningMode = mp.tasks.vision.RunningMode

# base options for hand and pose detection models
# (the .task model files are referenced by a path relative to the working directory)
hand_base_options = python.BaseOptions(
    model_asset_path="./tasks/hand_landmarker.task"
)
pose_base_options = python.BaseOptions(
    model_asset_path="./tasks/pose_landmarker.task"
)

# options for hand detection: up to two hands, each frame handled as an
# independent image (IMAGE running mode)
hand_options = vision.HandLandmarkerOptions(
    base_options=hand_base_options,
    num_hands=2,
    min_hand_detection_confidence=0.6,
    min_hand_presence_confidence=0.6,
    min_tracking_confidence=0.1,
    running_mode=VisionRunningMode.IMAGE,
)

# options for pose detection, also in IMAGE running mode
# NOTE(review): segmentation masks are requested but not read by the cells
#               visible here - confirm whether they are needed
pose_options = vision.PoseLandmarkerOptions(
    base_options=pose_base_options,
    output_segmentation_masks=True,
    min_pose_detection_confidence=0.6,
    min_pose_presence_confidence=0.6,
    min_tracking_confidence=0.1,
    running_mode=VisionRunningMode.IMAGE,
)

# create detectors
hand_detector = vision.HandLandmarker.create_from_options(hand_options)
pose_detector = vision.PoseLandmarker.create_from_options(pose_options)

LandmarkList = landmark_pb2.NormalizedLandmarkList  # alias: container of normalized landmarks
NormalizedLandmark = landmark_pb2.NormalizedLandmark  # alias: single landmark with x, y, z


def to_landmark_list(landmarks):
    """
    Build a LandmarkList by copying the x, y and z of every item in `landmarks`.

    Args:
        landmarks: iterable of objects exposing `x`, `y` and `z` attributes.

    Returns:
        A LandmarkList holding one NormalizedLandmark per input item.

    No placeholder values are added here: callers that need a stand-in for a
    missing detection have to supply one themselves.
    """
    return LandmarkList(
        landmark=([NormalizedLandmark(x=lm.x, y=lm.y, z=lm.z) for lm in landmarks])
    )


# Placeholder landmark lists used when a pose or a hand is not detected.
# Every NormalizedLandmark already carries x, y and z, so a placeholder needs
# one entry per landmark (33 for the pose model, 21 per hand), not one entry
# per coordinate.
empty_pose_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(33)]
)

empty_hand_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(21)]
)


def to_drawing_landmark(hand_results, pose_results):
    """
    Convert pose and hand detection results to LandmarkLists for drawing.

    Args:
        hand_results: hand detection result exposing `hand_landmarks` and
            `handedness`, or a falsy value when hand detection was skipped.
        pose_results: pose detection result exposing `pose_landmarks`.

    Returns:
        A `(hand_landmarks, pose_landmarks)` tuple, always in that order.
        `hand_landmarks` is a two-item list of LandmarkLists (slots without a
        detection hold the empty placeholder), or None when `hand_results` is
        falsy. `pose_landmarks` is the first detected pose, or the empty
        placeholder when no pose was found.
    """
    pose_landmarks = (
        to_landmark_list(pose_results.pose_landmarks[0])
        if pose_results.pose_landmarks
        else empty_pose_landmarks
    )

    # keep the same (hands, pose) order as the normal return below, so callers
    # that unpack `hand, pose = ...` never receive the values swapped
    if not hand_results:
        return None, pose_landmarks

    hand_landmarks = [empty_hand_landmarks, empty_hand_landmarks]

    # iterate over the detected hand landmarks
    for index, hand_landmark in enumerate(hand_results.hand_landmarks):
        # slot for this hand, taken from the handedness category index
        # NOTE(review): original comment states 0 = right hand, 1 = left hand - confirm
        handedness = hand_results.handedness[index][0].index

        # convert the current hand and store it in its slot
        hand_landmarks[handedness] = to_landmark_list(hand_landmark)

    return hand_landmarks, pose_landmarks


def draw_landmark(image, hand_landmarks, pose_landmarks):
    """
    Overlay the pose landmarks, then every hand's landmarks, on `image`.

    Args:
        image: image array handed to `drawer.draw_landmarks`.
        hand_landmarks: iterable of per-hand landmark lists; a falsy value
            skips the hand overlay.
        pose_landmarks: landmark list drawn with the pose connections.
    """
    pose_point_spec = drawer.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=3)
    pose_line_spec = drawer.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2)
    drawer.draw_landmarks(
        image,
        pose_landmarks,
        mp.solutions.pose.POSE_CONNECTIONS,
        pose_point_spec,
        pose_line_spec,
    )

    # a falsy hand_landmarks leaves nothing to iterate over
    for single_hand in hand_landmarks or ():
        hand_point_spec = drawer.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=2)
        hand_line_spec = drawer.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2)
        drawer.draw_landmarks(
            image,
            single_hand,
            mp.solutions.hands.HAND_CONNECTIONS,
            hand_point_spec,
            hand_line_spec,
        )

##### Recording

In [None]:
# when True, the recording loop runs the hand and pose detectors on every
# frame and draws the landmarks on the preview image; the frames written to
# the video file are the raw camera frames either way
debug_landmark = False

In [41]:
# Record `videos_per_label` clips for every action in ACTIONS.
#
# For each clip the loop runs `frames_per_video + 1` iterations:
#   * iteration 0 shows a black "starting" screen for 1.5 s so the user can
#     get ready (the frame read in this iteration is not written),
#   * iterations 1..frames_per_video write one camera frame each.
# Pressing 'q' stops the whole recording session.

# reset the flag so that re-running this cell after an earlier quit records again
is_quit = False

try:
    if not cap.isOpened():
        print("Camera is not opened, nothing was recorded.")
    else:
        # loop through each action in the predefined ACTIONS list
        for action in ACTIONS:
            # loop through the number of video sequences per label
            for sequence in range(videos_per_label):
                # initialize VideoWriter for each video sequence
                out = cv2.VideoWriter(
                    video_path(action, sequence),  # path for saving the video
                    fourcc,                        # codec used for compression
                    60.0,                          # frames per second
                    (640, 480)                     # frame/image size (width, height)
                )

                try:
                    # loop through each frame in the sequence
                    for action_length in range(frames_per_video + 1):
                        success, frame = cap.read()

                        # if frame capture fails, ignore and continue
                        # (the iteration is spent, so the clip gets one frame less)
                        if not success:
                            print("Ignoring empty camera frame")
                            continue

                        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                        if debug_landmark:
                            image_rgb = image_rgb.astype(np.uint8)
                            # convert image to mediapipe image format
                            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image_rgb)
                            # detect hands and pose
                            hand_results = hand_detector.detect(image=mp_image)
                            pose_results = pose_detector.detect(image=mp_image)
                            hand, pose = to_drawing_landmark(hand_results, pose_results)
                            draw_landmark(image_rgb, hand, pose)

                        # key pressed during the pause screen, if any (-1 means none)
                        pause_key = -1

                        # for the first iteration (per each sequence), display starting
                        # and collecting text; this gives time for the user to adjust
                        # to the next sign language action
                        if action_length == 0:
                            # black image used as the background of the pause screen
                            pause_image = np.zeros((480, 640, 3), dtype=np.uint8)
                            display_starting_text(pause_image)
                            display_collecting_text(pause_image, action, sequence)
                            cv2.imshow("Detecting Sign Language", pause_image)
                            # wait for 1.5 seconds to give time for adjustment
                            pause_key = cv2.waitKey(1500)

                        # last iteration: store the final frame and finish the clip
                        # NOTE: the extra iteration (frames_per_video + 1) is required
                        #       because iteration 0 is the pause screen and writes nothing
                        elif action_length == frames_per_video:
                            out.write(frame)
                            break

                        # for other frames, display collecting text and show the frame
                        else:
                            display_collecting_text(image_rgb, action, sequence)
                            # the preview image is RGB, imshow expects BGR
                            cv2.imshow("Detecting Sign Language", cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR))
                            # write the raw camera frame to the video file
                            out.write(frame)

                        # stop the session if 'q' was pressed, either during the
                        # pause screen or while the frame was shown
                        if pause_key & 0xFF == ord("q") or cv2.waitKey(10) & 0xFF == ord("q"):
                            is_quit = True
                            break
                finally:
                    # finalize this clip's file before the next writer is created
                    out.release()

                # break out of the action sequence loop
                if is_quit:
                    break

            # break out of the action loop
            if is_quit:
                break
finally:
    # always release the camera and close the preview window, even when the
    # cell is interrupted or an error is raised
    cap.release()
    cv2.destroyAllWindows()


In [225]:
# manual cleanup: releases the camera and the most recent VideoWriter and
# closes the preview window, e.g. after the recording cell was interrupted
# NOTE(review): `out` only exists once the recording cell has created a writer
cap.release()
out.release()
cv2.destroyAllWindows()