In [156]:
import time

import os
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
from mediapipe.framework.formats import landmark_pb2
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

import matplotlib.pyplot as plt
import concurrent.futures
from moviepy.editor import VideoFileClip

#### Setting up for Mediapipe

In [157]:
drawer = mp.solutions.drawing_utils
VisionRunningMode = mp.tasks.vision.RunningMode

In [158]:
# base options for hand and pose detection models
hand_base_options = python.BaseOptions(model_asset_path="../tasks/hand_landmarker.task")
pose_base_options = python.BaseOptions(model_asset_path="../tasks/pose_landmarker.task")

In [159]:
# options for hand detection
hand_options = vision.HandLandmarkerOptions(
    base_options=hand_base_options,
    num_hands=2,
    min_hand_detection_confidence=0.8,
    min_hand_presence_confidence=0.9,
    min_tracking_confidence=0.8,
    running_mode=VisionRunningMode.IMAGE,
)

# options for pose detection
pose_options = vision.PoseLandmarkerOptions(
    base_options=pose_base_options,
    output_segmentation_masks=True,
    min_pose_detection_confidence=0.95,
    min_pose_presence_confidence=0.95,
    min_tracking_confidence=0.95,
    running_mode=VisionRunningMode.IMAGE,
)

# create detectors
hand_detector = vision.HandLandmarker.create_from_options(hand_options)
pose_detector = vision.PoseLandmarker.create_from_options(pose_options)

#### Extracting Mediapipe Landmark

In [160]:
empty_hand_landmark = np.zeros((2, 21, 3))  # right hand and left hand
empty_pose_landmark = np.zeros(33 * 3)

def to_landmark_data(
    hand_results: vision.HandLandmarkerResult, pose_results: vision.PoseLandmarkerResult
):
    """
    Extract keypoints from pose and hand results for dataset creation.
    """
    pose_landmark = empty_pose_landmark
    hand_landmark = empty_hand_landmark

    if pose_results.pose_world_landmarks:
        pose_landmark = np.array(
            [[lm.x, lm.y, lm.z] for lm in pose_results.pose_world_landmarks[0]]
        ).flatten()

    # if no hand results are available, return the empty hand keypoints
    # and concatenate it with face and pose keypoints
    if not hand_results:
        return np.concatenate([pose_landmark, hand_landmark.flatten()])

    # iterate over the detected hand landmarks
    for index, hlm in enumerate(hand_results.hand_world_landmarks):
        # determine the hand index (0 for right hand, 1 for left hand) using handedness information
        handedness = hand_results.handedness[index][0].index

        # extract the keypoints for the current hand and assign them to the appropriate index
        hand_landmark[handedness] = np.array([[lm.x, lm.y, lm.z] for lm in hlm])

    return np.concatenate([pose_landmark, hand_landmark.flatten()])

LandmarkList = landmark_pb2.NormalizedLandmarkList  # aliases for landmark types
NormalizedLandmark = landmark_pb2.NormalizedLandmark  # aliases for landmark types


def to_landmark_list(landmarks):
    """
    Create a LandmarkList from a list of landmarks or fill with empty values if no landmarks are provided.
    """
    return LandmarkList(
        landmark=([NormalizedLandmark(x=lm.x, y=lm.y, z=lm.z) for lm in landmarks])
    )


empty_pose_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(33 * 4)]
)

empty_hand_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(21 * 3)]
)


def to_drawing_landmark(hand_results, pose_results):
    """
    Convert pose and hand landmarks to LandmarkList for drawing.
    """
    pose_landmarks = (
        to_landmark_list(pose_results.pose_landmarks[0])
        if pose_results.pose_landmarks
        else empty_pose_landmarks
    )

    hand_landmarks = [empty_hand_landmarks, empty_hand_landmarks]

    if not hand_results:
        return pose_landmarks, None

    # iterate over the detected hand landmarks
    for index, hand_landmark in enumerate(hand_results.hand_landmarks):
        # determine the hand index (0 for right hand, 1 for left hand) using handedness information
        handedness = hand_results.handedness[index][0].index

        # extract the keypoints for the current hand and assign them to the appropriate index
        hand_landmarks[handedness] = to_landmark_list(hand_landmark)

    return pose_landmarks, hand_landmarks


def draw_landmark(image, hand_landmarks, pose_landmarks):
    """
    Draw detected landmarks on the image.
    """
    drawer.draw_landmarks(
        image,
        pose_landmarks,
        mp.solutions.pose.POSE_CONNECTIONS,
        drawer.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=3),
        drawer.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2),
    )

    if not hand_landmarks:
        return

    for hand_landmarks in hand_landmarks:
        drawer.draw_landmarks(
            image,
            hand_landmarks,
            mp.solutions.hands.HAND_CONNECTIONS,
            drawer.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=2),
            drawer.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2),
        )

#### Load Model

In [161]:
# action lables
ACTIONS = [
    "_", "hello", "thanks", "i-love-you", "I", "Yes", "No", "Help", "Please",
    "Want", "Eat", "More", "Bathroom", "Learn", "Sign",
]

# limit to x actions for preprocessing
# NOTE: change this number into the amount of the dataset labels (if changed)
ACTIONS = np.array(ACTIONS[:8])

In [162]:
def load_model(model_version=None):
    model_dir = "../../storage/models/keras"
    prefix = "singa_slr_v_"

    if model_version:
        version = f"{prefix}{model_version}.keras"
        ks_file = os.path.join(model_dir, version)

        model = tf.keras.models.load_model(ks_file)

        return version, model

    model_files = os.listdir(model_dir)

    # filter model files by filename prefix
    versions = [file for file in model_files if file.startswith(prefix)]

    # extract version numbers from filenames
    versions = [file.split("_")[-1] for file in versions]

    # convert version numbers to tuples of integers for comparison
    versions_int = [tuple(map(int, v.split(".")[0])) for v in versions]

    # find the index of the latest version
    latest_index = versions_int.index(max(versions_int))

    # load the latest model
    latest_model_path = model_files[latest_index]

    model = tf.keras.models.load_model(os.path.join(model_dir, latest_model_path))

    return latest_model_path, model


v, model = load_model()

In [163]:
f"using model {v}"

'using model singa_slr_v_002.keras'

#### Model Prediction

In [164]:
colors = [
    (245, 117, 16),
    (117, 245, 16),
    (16, 117, 245),
    (117, 117, 16),
    (16, 245, 117),
    (245, 117, 245),
]


def confidence_bar(res, actions, input_frame, colors):
    output_frame = input_frame.copy()

    for num, prob in enumerate(res):
        cv2.rectangle(
            output_frame,
            (0, 60 + num * 40),
            (int(prob * 100), 90 + num * 40),
            colors[num],
            -1,
        )

        cv2.putText(
            output_frame,
            actions[num],
            (0, 85 + num * 40),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (255, 255, 255),
            2,
            cv2.LINE_AA,
        )

    return output_frame

In [165]:
def process_frame(frame, image, threshold, skip_word):
    start_time = time.time()

    # Convert into mediapipe numpy type support uint8, uint16, or float32
    image = image.astype(np.uint8)

    # Convert cv image to mediapipe image format before being passed to detectors
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)

    try:
        hand_results = hand_detector.detect(image=mp_image)
        pose_results = pose_detector.detect(image=mp_image)

        landmarks = to_landmark_data(hand_results, pose_results)
    except:
        print(f"frame {frame} skipped")
        return frame, None, time.time() - start_time

    return frame, landmarks, time.time() - start_time


def predict_from_video(vid):
    clip = VideoFileClip(vid)

    avg_exec_time = []

    predictions = []
    sequences = []

    sentence = []
    threshold = 0.9
    skip_word = "_"

    results = []
    batch_size = 60

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_frame = {
            executor.submit(
                process_frame,
                frame,
                image,
                threshold,
                skip_word,
            ): frame
            for frame, image in enumerate(clip.iter_frames(fps=clip.fps))
        }

        for future in concurrent.futures.as_completed(future_to_frame):
            frame, landmarks, exec_time = future.result()
            avg_exec_time.append(exec_time)

            if landmarks is not None:
                results.append((frame, landmarks))

    # sort the results by frame number to ensure the order is correct
    results.sort(key=lambda x: x[0])

    for _, landmarks in results:
        sequences.append(landmarks)

        if len(sequences) < batch_size:
            continue

        # collect a batch of sequences
        batch_motion = np.stack(sequences[-batch_size:])
        # sequences = sequences[
        #     -(batch_size - 50) :
        # ]  # keep the last 10 sequences for overlap
        sequences = []

        # ensure correct input shape by adding an extra dimension for batch size
        batch_motion = np.expand_dims(batch_motion, axis=0)

        # predict the entire batch
        batch_result = model.predict(batch_motion, verbose=0)

        print(batch_result)
        print("="*50)

        for result in batch_result:
            # len of results is 480 (which is the total frame)?
            predicted = np.argmax(result)

            if (not result[predicted] > threshold) or not (
                ACTIONS[predicted] != skip_word
            ):
                continue

            if not predictions or predicted != predictions[-1]:
                predictions.append(predicted)

    print("===")
    print(predictions)
    print("===")

    for motion in predictions:
        sentence.append(ACTIONS[motion])

    return (
        sentence,
        len(results),
        {
            "avg_exec_time": avg_exec_time,
            "total_exec_time": sum(avg_exec_time),
        },
    )

In [166]:
sentence, frame, exec_time = predict_from_video("./videos/test_7.mp4")

print("=" * 50)
print("Total frame calculated:", frame)
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

# [7] hello, ily, ty
# [8] hello, ily, hello, ily, ty

[[2.9038884e-07 9.9983108e-01 6.9201727e-05 2.1489195e-05 1.0367809e-05
  2.1885424e-05 4.3587061e-06 4.1192088e-05]]
[[2.9635783e-07 9.9982822e-01 7.2698116e-05 2.1974707e-05 1.0653935e-05
  2.1973745e-05 4.3730915e-06 3.9761049e-05]]
[[9.9717218e-01 2.0220684e-06 4.2375018e-06 2.9208300e-07 7.8711273e-06
  1.9463607e-05 7.2564317e-06 2.7867183e-03]]
[[3.4008247e-07 4.1912940e-06 2.7568228e-07 9.9977738e-01 5.0843382e-06
  1.9079327e-04 2.1303342e-05 5.3800824e-07]]
[[2.5288571e-06 5.6290960e-06 2.9149618e-05 9.9919325e-01 3.0110471e-04
  2.9101246e-04 1.7730684e-04 1.1518802e-07]]
[[9.96232450e-01 2.72789894e-05 1.13081434e-04 4.07205807e-05
  2.08293786e-05 1.01911246e-04 4.32418077e-04 3.03132879e-03]]
[[1.3429212e-06 8.9637487e-04 9.8865885e-01 1.0421810e-02 1.1512624e-05
  2.8182117e-07 8.9977157e-06 8.7589046e-07]]
[[2.6108744e-04 6.4633363e-01 2.7772731e-01 6.1119296e-02 6.3538747e-03
  1.4665833e-04 4.4334140e-03 3.6247284e-03]]
===
[1, 3, 2]
===
Total frame calculated: 480
Av

In [154]:
sentence, frame, exec_time = predict_from_video("./videos/test_2.mp4")

print("=" * 50)
print("Total frame calculated:", frame)
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

# [1] ily
# [2] ily
# [3]
# [4]
# [5] ily
# [6] ily
# [7] hello, ily, ty
# [8] hello, ily, hello, ily, ty

[[7.1363489e-04 7.9979444e-01 8.9973181e-02 9.1978617e-02 4.6807304e-03
  4.0854994e-04 1.0393575e-02 2.0572590e-03]]
[[5.4419711e-06 2.2972850e-04 2.4977826e-06 9.9724305e-01 1.5904606e-04
  1.6380600e-03 7.2130421e-04 9.7168765e-07]]
[[1.5605018e-06 1.9727365e-04 2.4612361e-06 9.9911767e-01 8.0260463e-05
  3.2108623e-04 2.7942861e-04 2.8263034e-07]]
[[2.53623284e-06 1.04762170e-04 1.71410193e-06 9.98634875e-01
  1.10564935e-04 7.43687095e-04 4.01124387e-04 6.58157091e-07]]
[[4.22600315e-05 1.47881598e-04 1.90514572e-06 9.66239154e-01
  6.34514785e-04 2.97440998e-02 3.17958998e-03 1.07539945e-05]]
[[8.1441449e-06 3.6908383e-04 1.6584617e-06 9.9458718e-01 5.9095943e-05
  2.2759386e-03 2.6971055e-03 1.8162965e-06]]
[[1.7057497e-06 1.9037945e-04 2.5090528e-06 9.9905318e-01 9.4927142e-05
  3.7264358e-04 2.8433540e-04 3.0863777e-07]]
[[1.4019095e-06 1.9512053e-04 2.4070468e-06 9.9917728e-01 7.5313459e-05
  2.8599522e-04 2.6221081e-04 2.5986239e-07]]
===
[3]
===
Total frame calculated: 523


In [155]:
sentence, frame, exec_time = predict_from_video("./videos/test_9.mp4")

print("=" * 50)
print("Total frame calculated:", frame)
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

# [1] ily
# [2] ily
# [3]
# [4]
# [5] ily
# [6] ily
# [7] hello, ily, ty
# [8] hello, ily, hello, ily, ty
# [9] hello, ily, ty, i, no, yes, help, i, ily

[[9.9590009e-01 4.8043617e-05 2.5281514e-04 1.1807740e-04 2.4343699e-05
  1.5194310e-04 8.9926820e-04 2.6054278e-03]]
[[7.2242062e-07 9.9925929e-01 9.0806236e-05 4.9496768e-04 4.6027153e-06
  7.8607460e-05 4.6941521e-05 2.4083156e-05]]
[[4.3365907e-07 7.5851800e-05 3.2848751e-04 9.9911684e-01 2.7582049e-04
  6.1945524e-05 1.4061680e-04 2.1855124e-08]]
[[1.4778279e-06 8.5616054e-04 9.9909544e-01 1.4573611e-05 3.1955027e-05
  4.7149403e-08 2.5412004e-07 7.1912766e-08]]
[[9.9493849e-01 5.3102231e-06 1.1562866e-05 5.2806239e-08 1.7457731e-06
  8.5183874e-06 3.0405242e-06 5.0312979e-03]]
[[9.9971336e-01 1.7990060e-07 6.4763321e-07 3.2386303e-09 4.9042317e-08
  7.8637987e-07 1.4240439e-06 2.8365120e-04]]
[[6.45591062e-04 8.28661806e-10 2.00400918e-09 7.57516361e-09
  1.10255644e-01 8.11898190e-06 1.10784754e-06 8.89089465e-01]]
[[1.6217276e-02 4.7787180e-05 4.6408135e-07 1.8161900e-03 1.2249707e-03
  9.2658293e-01 5.3586498e-02 5.2385469e-04]]
[[8.0922997e-01 6.4996261e-06 4.5744684e-07 3.72