In [1]:
import time

import os
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
from mediapipe.framework.formats import landmark_pb2
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

import matplotlib.pyplot as plt
import concurrent.futures
from moviepy.editor import VideoFileClip

#### Setting up for Mediapipe

In [2]:
drawer = mp.solutions.drawing_utils
VisionRunningMode = mp.tasks.vision.RunningMode

In [3]:
# base options for hand and pose detection models
hand_base_options = python.BaseOptions(model_asset_path="../tasks/hand_landmarker.task")
pose_base_options = python.BaseOptions(model_asset_path="../tasks/pose_landmarker.task")

In [4]:
# options for hand detection
hand_options = vision.HandLandmarkerOptions(
    base_options=hand_base_options,
    num_hands=2,
    min_hand_detection_confidence=0.6,
    min_hand_presence_confidence=0.6,
    min_tracking_confidence=0.1,
    running_mode=VisionRunningMode.IMAGE,
)

# options for pose detection
pose_options = vision.PoseLandmarkerOptions(
    base_options=pose_base_options,
    output_segmentation_masks=True,
    min_pose_detection_confidence=0.6,
    min_pose_presence_confidence=0.6,
    min_tracking_confidence=0.1,
    running_mode=VisionRunningMode.IMAGE,
)

# create detectors
hand_detector = vision.HandLandmarker.create_from_options(hand_options)
pose_detector = vision.PoseLandmarker.create_from_options(pose_options)

#### Extracting Mediapipe Landmark

In [5]:
empty_hand_landmark = np.zeros((2, 21, 3))  # right hand and left hand
empty_pose_landmark = np.zeros(33 * 3)

def to_landmark_data(
    hand_results: vision.HandLandmarkerResult, pose_results: vision.PoseLandmarkerResult
):
    """
    Extract keypoints from pose and hand results for dataset creation.
    """
    pose_landmark = empty_pose_landmark
    hand_landmark = empty_hand_landmark

    if pose_results.pose_world_landmarks:
        pose_landmark = np.array(
            [[lm.x, lm.y, lm.z] for lm in pose_results.pose_world_landmarks[0]]
        ).flatten()

    # if no hand results are available, return the empty hand keypoints
    # and concatenate it with face and pose keypoints
    if not hand_results:
        return np.concatenate([pose_landmark, hand_landmark.flatten()])

    # iterate over the detected hand landmarks
    for index, hlm in enumerate(hand_results.hand_world_landmarks):
        # determine the hand index (0 for right hand, 1 for left hand) using handedness information
        handedness = hand_results.handedness[index][0].index

        # extract the keypoints for the current hand and assign them to the appropriate index
        hand_landmark[handedness] = np.array([[lm.x, lm.y, lm.z] for lm in hlm])

    return np.concatenate([pose_landmark, hand_landmark.flatten()])

LandmarkList = landmark_pb2.NormalizedLandmarkList  # aliases for landmark types
NormalizedLandmark = landmark_pb2.NormalizedLandmark  # aliases for landmark types


def to_landmark_list(landmarks):
    """
    Create a LandmarkList from a list of landmarks or fill with empty values if no landmarks are provided.
    """
    return LandmarkList(
        landmark=([NormalizedLandmark(x=lm.x, y=lm.y, z=lm.z) for lm in landmarks])
    )


empty_pose_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(33 * 4)]
)

empty_hand_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(21 * 3)]
)


def to_drawing_landmark(hand_results, pose_results):
    """
    Convert pose and hand landmarks to LandmarkList for drawing.
    """
    pose_landmarks = (
        to_landmark_list(pose_results.pose_landmarks[0])
        if pose_results.pose_landmarks
        else empty_pose_landmarks
    )

    hand_landmarks = [empty_hand_landmarks, empty_hand_landmarks]

    if not hand_results:
        return pose_landmarks, None

    # iterate over the detected hand landmarks
    for index, hand_landmark in enumerate(hand_results.hand_landmarks):
        # determine the hand index (0 for right hand, 1 for left hand) using handedness information
        handedness = hand_results.handedness[index][0].index

        # extract the keypoints for the current hand and assign them to the appropriate index
        hand_landmarks[handedness] = to_landmark_list(hand_landmark)

    return pose_landmarks, hand_landmarks


def draw_landmark(image, hand_landmarks, pose_landmarks):
    """
    Draw detected landmarks on the image.
    """
    drawer.draw_landmarks(
        image,
        pose_landmarks,
        mp.solutions.pose.POSE_CONNECTIONS,
        drawer.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=3),
        drawer.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2),
    )

    if not hand_landmarks:
        return

    for hand_landmarks in hand_landmarks:
        drawer.draw_landmarks(
            image,
            hand_landmarks,
            mp.solutions.hands.HAND_CONNECTIONS,
            drawer.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=2),
            drawer.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2),
        )

#### Load Model

In [6]:
# action lables
ACTIONS = np.array([
    "_", "hello", "what's up", "how",
    "thanks", "you", "morning", "afternoon",
    "night", "me", "name", "fine",
    "happy", "yes", "no", "repeat",
    "please", "want", "good bye", "learn",
])

In [7]:
def load_model(model_version=None):
    model_dir = "../../storage/models/keras"
    prefix = "singa_slr_v_"

    if model_version:
        version = f"{prefix}{model_version}.keras"
        ks_file = os.path.join(model_dir, version)

        model = tf.keras.models.load_model(ks_file)

        return version, model

    model_files = os.listdir(model_dir)

    # filter model files by filename prefix
    versions = [file for file in model_files if file.startswith(prefix)]

    # extract version numbers from filenames
    versions = [file.split("_")[-1] for file in versions]

    # convert version numbers to tuples of integers for comparison
    versions_int = [tuple(map(int, v.split(".")[0])) for v in versions]

    # find the index of the latest version
    latest_index = versions_int.index(max(versions_int))

    # load the latest model
    latest_model_path = model_files[latest_index]

    model = tf.keras.models.load_model(os.path.join(model_dir, latest_model_path))

    return latest_model_path, model


v, model = load_model("006")

In [8]:
f"using model {v}"

'using model singa_slr_v_006.keras'

In [9]:
model.summary()

#### Model Prediction

In [10]:
def process_frame(frame, image):
    start_time = time.time()

    # Convert into mediapipe numpy type support uint8, uint16, or float32
    image = image.astype(np.uint8)

    # Convert cv image to mediapipe image format before being passed to detectors
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)

    try:
        hand_results = hand_detector.detect(image=mp_image)
        pose_results = pose_detector.detect(image=mp_image)

        landmarks = to_landmark_data(hand_results, pose_results)
    except:
        print(f"frame {frame} skipped")
        return frame, None, time.time() - start_time

    return frame, landmarks, time.time() - start_time

lp = None

def predict_from_video(vid):
    global lp
    clip = VideoFileClip(vid)

    avg_exec_time = []

    predictions = []
    sequences = []

    sentence = []
    threshold = 0.999
    skip_word = "_"

    results = []
    batch_size = 60

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_frame = {
            executor.submit(
                process_frame,
                frame,
                image,
            ): frame
            for frame, image in enumerate(clip.iter_frames(fps=clip.fps))
        }

        for future in concurrent.futures.as_completed(future_to_frame):
            frame, landmarks, exec_time = future.result()
            avg_exec_time.append(exec_time)

            if landmarks is not None:
                results.append((frame, landmarks))

    start_time = time.time()
    # sort the results by frame number to ensure the order is correct
    results.sort(key=lambda x: x[0])

    print("res:",len(results))
    i = 0
    for _, landmarks in results:
        sequences.append(landmarks)

        if len(sequences) < batch_size:
            continue

        # collect a batch of sequences
        batch_motion = np.stack(sequences[-batch_size:])
        sequences = sequences[-30:]  # keep the last 4 sequences for overlap
        # sequences = []

        lp = batch_motion

        # ensure correct input shape by adding an extra dimension for batch size
        batch_motion = np.expand_dims(batch_motion, axis=0)

        # predict the entire batch
        batch_result = model.predict(batch_motion, verbose=0)

        # print(i, end="-")
        # i += 1

        print("btres:",batch_result)

        for result in batch_result:
            predicted = np.argmax(result)

            # print(f"{predicted}\t{ACTIONS[predicted]}\t{result}")
            if (not result[predicted] > threshold) or not (
                ACTIONS[predicted] != skip_word
            ):
                continue

            if not predictions or predicted != predictions[-1][0]:
                predictions.append((predicted, result[np.argmax(result)]))

    end_time = time.time() - start_time

    print("\n\n")
    for motion, acc in predictions:
        print(
            f"{acc}\t{ACTIONS[motion]}",
        )
        sentence.append(ACTIONS[motion])

    return (
        sentence,
        len(results),
        {
            "pred_exec_time": end_time,
            "avg_exec_time": avg_exec_time,
            "total_exec_time": sum(avg_exec_time),
        },
    )

In [11]:
sentence, frame, exec_time = predict_from_video("./demo/demo.mp4")

print("=" * 50)
print("Total frame calculated:", frame)
print("Total prediction execution:", np.mean(exec_time["pred_exec_time"]))
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

res: 601
btres: [[1.0000000e+00 2.2463479e-18 5.5886703e-30 2.0303292e-24 7.2052258e-25
  3.1545842e-16 7.6105130e-36 5.8245467e-15 1.0037852e-20 4.4412433e-19
  5.9175464e-26 4.5141435e-34 9.8307070e-23 1.9134562e-10 2.3353894e-23
  1.4274473e-09 6.3096824e-18 1.0654808e-17 2.1864319e-18 3.3560972e-35]]
btres: [[9.5536792e-01 2.6333768e-07 3.9063623e-09 4.5493732e-07 3.7492316e-09
  3.7650115e-04 5.4383733e-13 1.4743696e-04 2.1794519e-07 1.7057588e-06
  1.3628726e-07 4.0259665e-10 9.0059897e-09 1.3484567e-04 2.1214772e-08
  3.6341488e-02 3.5096982e-05 7.5512384e-03 4.2676118e-05 4.2143476e-08]]
btres: [[9.79801865e-11 9.99554932e-01 5.89565774e-09 2.46283497e-11
  6.03663164e-09 3.47771700e-10 2.41856497e-07 1.43691004e-05
  1.20740982e-08 4.65189085e-13 7.83542564e-09 1.88689731e-09
  1.27139349e-11 9.20486576e-10 1.05293975e-10 4.00679593e-04
  1.70158942e-10 5.49923385e-10 2.97119168e-05 1.53891466e-11]]
btres: [[2.8391748e-08 7.2604209e-01 1.2593672e-05 7.2570938e-06 5.5127616e-06

In [12]:
sentence, frame, exec_time = predict_from_video("./demo/demo2.mp4")

print("=" * 50)
print("Total frame calculated:", frame)
print("Total prediction execution:", np.mean(exec_time["pred_exec_time"]))
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

res: 721
btres: [[1.0000000e+00 2.2447889e-18 5.5852604e-30 2.0285639e-24 7.2006370e-25
  3.1546202e-16 7.6046506e-36 5.8217701e-15 1.0029699e-20 4.4358083e-19
  5.9118604e-26 4.5105289e-34 9.8291696e-23 1.9137737e-10 2.3337776e-23
  1.4269274e-09 6.3047979e-18 1.0639617e-17 2.1852561e-18 3.3513637e-35]]
btres: [[9.5536792e-01 2.6333768e-07 3.9063623e-09 4.5493732e-07 3.7492316e-09
  3.7650115e-04 5.4383733e-13 1.4743696e-04 2.1794519e-07 1.7057588e-06
  1.3628726e-07 4.0259665e-10 9.0059897e-09 1.3484567e-04 2.1214772e-08
  3.6341488e-02 3.5096982e-05 7.5512384e-03 4.2676118e-05 4.2143476e-08]]
btres: [[9.1313554e-11 9.9956161e-01 5.1206395e-09 2.1632108e-11 5.2667457e-09
  3.0532837e-10 2.2401437e-07 1.4337303e-05 1.1238671e-08 4.3393278e-13
  7.3100330e-09 1.6582605e-09 1.1239340e-11 8.5917262e-10 9.9317492e-11
  3.9639542e-04 1.4863355e-10 5.1798016e-10 2.7419372e-05 1.3296709e-11]]
btres: [[2.1842603e-02 4.4980991e-01 7.5845506e-05 4.3948913e-07 3.4884925e-04
  1.5035749e-03 1.724

In [13]:
sentence, frame, exec_time = predict_from_video("./demo/demo3.mp4")

print("=" * 50)
print("Total frame calculated:", frame)
print("Total prediction execution:", np.mean(exec_time["pred_exec_time"]))
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

res: 2321
btres: [[1.0000000e+00 9.1583173e-19 1.0026282e-30 3.8589754e-25 1.8260105e-25
  1.1916038e-16 7.8609337e-37 2.0592298e-15 2.3615421e-21 1.5184879e-19
  1.2986783e-26 5.6472689e-35 1.9072460e-23 9.4298250e-11 5.7283383e-24
  1.0301872e-09 2.4708412e-18 4.6377966e-18 7.5801544e-19 4.5233792e-36]]
btres: [[1.0000000e+00 7.2510740e-19 6.2890956e-31 2.6345055e-25 1.2296161e-25
  9.1932595e-17 4.6866779e-37 1.7196324e-15 1.6919791e-21 1.2096608e-19
  8.9259504e-27 3.4687128e-35 1.3550478e-23 8.2549995e-11 4.3834165e-24
  9.3464758e-10 1.8345139e-18 3.7860098e-18 5.5603980e-19 2.6954746e-36]]
btres: [[1.00000000e+00 9.62107522e-19 9.90433172e-31 3.49278939e-25
  1.79144370e-25 1.15841457e-16 7.49391953e-37 2.01723713e-15
  2.15006920e-21 1.47637445e-19 1.18915936e-26 5.35991904e-35
  1.80259908e-23 9.80821604e-11 5.84992702e-24 1.04125375e-09
  2.32557218e-18 4.40760832e-18 7.19571601e-19 4.14490508e-36]]
btres: [[1.0000000e+00 8.7789899e-19 8.7674566e-31 3.2104751e-25 1.6168180e-2