In [19]:
import time
from datetime import datetime

import os
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
from mediapipe.framework.formats import landmark_pb2
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

import matplotlib.pyplot as plt
import concurrent.futures
from moviepy.editor import VideoFileClip

#### Setting up for Mediapipe

In [20]:
drawer = mp.solutions.drawing_utils
VisionRunningMode = mp.tasks.vision.RunningMode

In [21]:
# base options for hand and pose detection models
hand_base_options = python.BaseOptions(model_asset_path="../tasks/hand_landmarker.task")
pose_base_options = python.BaseOptions(model_asset_path="../tasks/pose_landmarker.task")

In [22]:
# options for hand detection
hand_options = vision.HandLandmarkerOptions(
    base_options=hand_base_options,
    num_hands=2,
    min_hand_detection_confidence=0.8,
    min_hand_presence_confidence=0.9,
    min_tracking_confidence=0.8,
    running_mode=VisionRunningMode.IMAGE,
)

# options for pose detection
pose_options = vision.PoseLandmarkerOptions(
    base_options=pose_base_options,
    output_segmentation_masks=True,
    min_pose_detection_confidence=0.95,
    min_pose_presence_confidence=0.95,
    min_tracking_confidence=0.95,
    running_mode=VisionRunningMode.IMAGE,
)

# create detectors
hand_detector = vision.HandLandmarker.create_from_options(hand_options)
pose_detector = vision.PoseLandmarker.create_from_options(pose_options)

#### Extracting Mediapipe Landmark

In [23]:
empty_hand_landmark = np.zeros((2, 21, 3))  # right hand and left hand
empty_pose_landmark = np.zeros(33 * 3)

def to_landmark_data(
    hand_results: vision.HandLandmarkerResult, pose_results: vision.PoseLandmarkerResult
):
    """
    Extract keypoints from pose and hand results for dataset creation.
    """
    pose_landmark = empty_pose_landmark
    hand_landmark = empty_hand_landmark

    if pose_results.pose_world_landmarks:
        pose_landmark = np.array(
            [[lm.x, lm.y, lm.z] for lm in pose_results.pose_world_landmarks[0]]
        ).flatten()

    # if no hand results are available, return the empty hand keypoints
    # and concatenate it with face and pose keypoints
    if not hand_results:
        return np.concatenate([pose_landmark, hand_landmark.flatten()])

    # iterate over the detected hand landmarks
    for index, hlm in enumerate(hand_results.hand_world_landmarks):
        # determine the hand index (0 for right hand, 1 for left hand) using handedness information
        handedness = hand_results.handedness[index][0].index

        # extract the keypoints for the current hand and assign them to the appropriate index
        hand_landmark[handedness] = np.array([[lm.x, lm.y, lm.z] for lm in hlm])

    return np.concatenate([pose_landmark, hand_landmark.flatten()])

LandmarkList = landmark_pb2.NormalizedLandmarkList  # aliases for landmark types
NormalizedLandmark = landmark_pb2.NormalizedLandmark  # aliases for landmark types


def to_landmark_list(landmarks):
    """
    Create a LandmarkList from a list of landmarks or fill with empty values if no landmarks are provided.
    """
    return LandmarkList(
        landmark=([NormalizedLandmark(x=lm.x, y=lm.y, z=lm.z) for lm in landmarks])
    )


empty_pose_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(33 * 4)]
)

empty_hand_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(21 * 3)]
)


def to_drawing_landmark(hand_results, pose_results):
    """
    Convert pose and hand landmarks to LandmarkList for drawing.
    """
    pose_landmarks = (
        to_landmark_list(pose_results.pose_landmarks[0])
        if pose_results.pose_landmarks
        else empty_pose_landmarks
    )

    hand_landmarks = [empty_hand_landmarks, empty_hand_landmarks]

    if not hand_results:
        return pose_landmarks, None

    # iterate over the detected hand landmarks
    for index, hand_landmark in enumerate(hand_results.hand_landmarks):
        # determine the hand index (0 for right hand, 1 for left hand) using handedness information
        handedness = hand_results.handedness[index][0].index

        # extract the keypoints for the current hand and assign them to the appropriate index
        hand_landmarks[handedness] = to_landmark_list(hand_landmark)

    return pose_landmarks, hand_landmarks


def draw_landmark(image, hand_landmarks, pose_landmarks):
    """
    Draw detected landmarks on the image.
    """
    drawer.draw_landmarks(
        image,
        pose_landmarks,
        mp.solutions.pose.POSE_CONNECTIONS,
        drawer.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=3),
        drawer.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2),
    )

    if not hand_landmarks:
        return

    for hand_landmarks in hand_landmarks:
        drawer.draw_landmarks(
            image,
            hand_landmarks,
            mp.solutions.hands.HAND_CONNECTIONS,
            drawer.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=2),
            drawer.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2),
        )

#### Load Model

In [24]:
# action lables
ACTIONS = [
    "_", "hello", "thanks", "i-love-you", "see-you-later", "I", "Father", "Mother", "Yes",
    "No", "Help", "Please", "Want", "What", "Again", "Eat", "Milk", "More", "Go To",
    "Bathroom", "Fine", "Like", "Learn", "Sign", "Done"
]

# limit to x actions for preprocessing
# NOTE: change this number into the amount of the dataset labels (if changed)
ACTIONS = np.array(ACTIONS[:4])

In [25]:
def load_model(use_latest=True, version=""):
    model_dir = "../../storage/models/keras"
    prefix = "singa_slr_v_"

    if not use_latest and not version:
        _version = os.path.join(model_dir, f"{prefix}{version}")

        tf.keras.models.load_model(_version)

    model_files = os.listdir(model_dir)

    # filter model files by filename prefix
    versions = [file for file in model_files if file.startswith(prefix)]

    # extract version numbers from filenames
    versions = [file.split("_")[-1] for file in versions]

    # convert version numbers to tuples of integers for comparison
    versions_int = [tuple(map(int, v.split(".")[0])) for v in versions]

    # find the index of the latest version
    latest_index = versions_int.index(max(versions_int))

    # load the latest model
    latest_model_path = model_files[latest_index]

    return tf.keras.models.load_model(os.path.join(model_dir, latest_model_path))


model = load_model()

#### Model Prediction

In [26]:
colors = [
    (245, 117, 16),
    (117, 245, 16),
    (16, 117, 245),
    (117, 117, 16),
    (16, 245, 117),
    (245, 117, 245),
]


def confidence_bar(res, actions, input_frame, colors):
    output_frame = input_frame.copy()

    for num, prob in enumerate(res):
        cv2.rectangle(
            output_frame,
            (0, 60 + num * 40),
            (int(prob * 100), 90 + num * 40),
            colors[num],
            -1,
        )

        cv2.putText(
            output_frame,
            actions[num],
            (0, 85 + num * 40),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (255, 255, 255),
            2,
            cv2.LINE_AA,
        )

    return output_frame

In [27]:
def process_frame(frame, image, threshold, skip_word):
    start_time = time.time()

    # Convert into mediapipe numpy type support uint8, uint16, or float32
    image = image.astype(np.uint8)

    # Convert cv image to mediapipe image format before being passed to detectors
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)

    try:
        hand_results = hand_detector.detect(image=mp_image)
        pose_results = pose_detector.detect(image=mp_image)

        landmarks = to_landmark_data(hand_results, pose_results)
    except:
        print(f"frame {frame} skipped")
        return frame, None, time.time() - start_time

    return frame, landmarks, time.time() - start_time


def predict_from_video(vid):
    clip = VideoFileClip(vid)

    avg_exec_time = []

    predictions = []
    sequences = []

    sentence = []
    threshold = 0.2
    skip_word = "_"

    results = []
    batch_size = 60

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_frame = {
            executor.submit(
                process_frame,
                frame,
                image,
                threshold,
                skip_word,
            ): frame
            for frame, image in enumerate(clip.iter_frames(fps=clip.fps))
        }

        for future in concurrent.futures.as_completed(future_to_frame):
            frame, landmarks, exec_time = future.result()
            avg_exec_time.append(exec_time)

            if landmarks is not None:
                results.append((frame, landmarks))

    # sort the results by frame number to ensure the order is correct
    results.sort(key=lambda x: x[0])

    for _, landmarks in results:
        sequences.append(landmarks)

        if len(sequences) < batch_size:
            continue

        # collect a batch of sequences
        batch_motion = np.stack(sequences[-batch_size:])
        sequences = sequences[
            -(batch_size - 40) :
        ]  # keep the last 20 sequences for overlap

        # ensure correct input shape by adding an extra dimension for batch size
        batch_motion = np.expand_dims(batch_motion, axis=0)

        # predict the entire batch
        batch_result = model.predict(batch_motion, verbose=0)

        print(batch_result)

        for result in batch_result:
            # len of results is 480 (which is the total frame)?
            predicted = np.argmax(result)

            if (not result[predicted] > threshold) or not (
                ACTIONS[predicted] != skip_word
            ):
                continue

            if not predictions or predicted != predictions[-1]:
                predictions.append(predicted)

    print("===")
    print(predictions)
    print("===")

    for motion in predictions:
        sentence.append(ACTIONS[motion])

    return (
        sentence,
        len(results),
        {
            "avg_exec_time": avg_exec_time,
            "total_exec_time": sum(avg_exec_time),
        },
    )

In [28]:
sentence, frame, exec_time = predict_from_video("./videos/test_7.mp4")

print("=" * 50)
print("Total frame calculated:", frame)
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

# [7] hello, ily, ty
# [8] hello, ily, hello, ily, ty

[[1.1759320e-05 9.9997890e-01 1.4928897e-06 7.8213852e-06]]
[[2.5426933e-05 9.9995887e-01 1.0596132e-06 1.4669377e-05]]
[[0.00085969 0.82801086 0.16679908 0.00433033]]
[[9.9994123e-01 1.3760280e-07 3.5500059e-05 2.3141809e-05]]
[[0.02984007 0.00289671 0.06019823 0.90706503]]
[[1.1856037e-04 8.1997598e-05 5.0346971e-06 9.9979442e-01]]
[[3.6143523e-04 1.3123474e-03 6.8629533e-03 9.9146324e-01]]
[[9.9944884e-01 1.7254035e-07 2.1880467e-06 5.4880994e-04]]
[[9.9941373e-01 4.1730927e-06 9.7303775e-05 4.8483367e-04]]
[[2.9218388e-06 7.7728146e-06 9.9992096e-01 6.8270856e-05]]
[[0.84600013 0.00176073 0.00352284 0.14871635]]
===
[1, 3, 2]
===
Total frame calculated: 480
Average execution time per frame: 0.29622118721405666
Predicted sentence: ['hello', 'i-love-you', 'thanks']


In [30]:
sentence, frame, exec_time = predict_from_video("./videos/test_6.mp4")

print("=" * 50)
print("Total frame calculated:", frame)
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

# [1] ily
# [2] ily
# [3]
# [4]
# [5] ily
# [6] ily
# [7] hello, ily, ty
# [8] hello, ily, hello, ily, ty

[[0.00178309 0.06660198 0.01177465 0.9198402 ]]
===
[3]
===
Total frame calculated: 72
Average execution time per frame: 0.260866211520301
Predicted sentence: ['i-love-you']
