In [3]:
import time

import pprint
import os
import mediapipe as mp
import numpy as np
import tensorflow as tf
from mediapipe.framework.formats import landmark_pb2
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

import matplotlib.pyplot as plt
import concurrent.futures
from moviepy.editor import VideoFileClip

In [4]:
drawer = mp.solutions.drawing_utils
VisionRunningMode = mp.tasks.vision.RunningMode

In [5]:
# base options for hand and pose detection models
hand_base_options = python.BaseOptions(model_asset_path="../tasks/hand_landmarker.task")
pose_base_options = python.BaseOptions(model_asset_path="../tasks/pose_landmarker.task")

In [6]:
# options for hand detection
hand_options = vision.HandLandmarkerOptions(
    base_options=hand_base_options,
    num_hands=2,
    min_hand_detection_confidence=0.6,
    min_hand_presence_confidence=0.6,
    min_tracking_confidence=0.6,
    running_mode=VisionRunningMode.IMAGE,
)

# options for pose detection
pose_options = vision.PoseLandmarkerOptions(
    base_options=pose_base_options,
    output_segmentation_masks=True,
    min_pose_detection_confidence=0.6,
    min_pose_presence_confidence=0.6,
    min_tracking_confidence=0.6,
    running_mode=VisionRunningMode.IMAGE,
)

# create detectors
hand_detector = vision.HandLandmarker.create_from_options(hand_options)
pose_detector = vision.PoseLandmarker.create_from_options(pose_options)

In [7]:
empty_hand_landmark = np.zeros((2, 21, 3))  # right hand and left hand
empty_pose_landmark = np.zeros(33 * 3)

def to_landmark_data(
    hand_results: vision.HandLandmarkerResult, pose_results: vision.PoseLandmarkerResult
):
    """
    Extract keypoints from pose and hand results for dataset creation.
    """
    pose_landmark = empty_pose_landmark
    hand_landmark = empty_hand_landmark

    if pose_results.pose_world_landmarks:
        pose_landmark = np.array(
            [[lm.x, lm.y, lm.z] for lm in pose_results.pose_world_landmarks[0]]
        ).flatten()

    # if no hand results are available, return the empty hand keypoints
    # and concatenate it with face and pose keypoints
    if not hand_results:
        return np.concatenate([pose_landmark, hand_landmark.flatten()])

    # iterate over the detected hand landmarks
    for index, hlm in enumerate(hand_results.hand_world_landmarks):
        # determine the hand index (0 for right hand, 1 for left hand) using handedness information
        handedness = hand_results.handedness[index][0].index

        # extract the keypoints for the current hand and assign them to the appropriate index
        hand_landmark[handedness] = np.array([[lm.x, lm.y, lm.z] for lm in hlm])

    return np.concatenate([pose_landmark, hand_landmark.flatten()])

LandmarkList = landmark_pb2.NormalizedLandmarkList  # aliases for landmark types
NormalizedLandmark = landmark_pb2.NormalizedLandmark  # aliases for landmark types


def to_landmark_list(landmarks):
    """
    Create a LandmarkList from a list of landmarks or fill with empty values if no landmarks are provided.
    """
    return LandmarkList(
        landmark=([NormalizedLandmark(x=lm.x, y=lm.y, z=lm.z) for lm in landmarks])
    )


empty_pose_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(33 * 4)]
)

empty_hand_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(21 * 3)]
)


def to_drawing_landmark(hand_results, pose_results):
    """
    Convert pose and hand landmarks to LandmarkList for drawing.
    """
    pose_landmarks = (
        to_landmark_list(pose_results.pose_landmarks[0])
        if pose_results.pose_landmarks
        else empty_pose_landmarks
    )

    if not hand_results:
        return pose_landmarks, None

    hand_landmarks = [empty_hand_landmarks, empty_hand_landmarks]

    # iterate over the detected hand landmarks
    for index, hand_landmark in enumerate(hand_results.hand_landmarks):
        # determine the hand index (0 for right hand, 1 for left hand) using handedness information
        handedness = hand_results.handedness[index][0].index

        # extract the keypoints for the current hand and assign them to the appropriate index
        hand_landmarks[handedness] = to_landmark_list(hand_landmark)

    return pose_landmarks, hand_landmarks


def draw_landmark(image, hand_landmarks, pose_landmarks):
    """
    Draw detected landmarks on the image.
    """
    drawer.draw_landmarks(
        image,
        pose_landmarks,
        mp.solutions.pose.POSE_CONNECTIONS,
        drawer.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=3),
        drawer.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2),
    )

    if not hand_landmarks:
        return

    for hand_landmarks in hand_landmarks:
        drawer.draw_landmarks(
            image,
            hand_landmarks,
            mp.solutions.hands.HAND_CONNECTIONS,
            drawer.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=2),
            drawer.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2),
        )

#### Load Model

In [8]:
# action lables
ACTIONS = [
    "_", "Hello", "Thanks", "i-love-you", "I", "Yes", "No", "Help", "Please",
    "Want", "Eat", "More", "Bathroom", "Learn", "Sign",
]

# limit to x actions for preprocessing
# NOTE: change this number into the amount of the dataset labels (if changed)
ACTIONS = np.array(ACTIONS[:4])

In [9]:
class TFLiteModel:
    def __init__(self, prefix="singa_slr_v_"):
        self.model_dir = "../../storage/models/tflite"
        self.prefix = prefix
        self.interpreter = None
        self.input_details = None
        self.output_details = None

        self.input_shape = None
        self.output_shape = None

    def load_model(self, use_latest=True, version=""):
        if not use_latest and not version:
            model_path = os.path.join(self.model_dir, f"{self.prefix}{version}")
        else:
            model_files = os.listdir(self.model_dir)

            # filter model files by filename prefix
            versions = [file for file in model_files if file.startswith(self.prefix)]

            # extract version numbers from filenames
            versions = [file.split("_")[-1] for file in versions]

            # convert version numbers to tuples of integers for comparison
            versions_int = [tuple(map(int, v.split(".")[0])) for v in versions]

            # find the index of the latest version
            latest_index = versions_int.index(max(versions_int))

            # load the latest model
            latest_model_path = model_files[latest_index]
            model_path = os.path.join(self.model_dir, latest_model_path)

        self.interpreter = tf.lite.Interpreter(model_path=model_path)
        self.interpreter.allocate_tensors()
        self.input_details = self.interpreter.get_input_details()
        self.output_details = self.interpreter.get_output_details()

        self.input_shape = self.input_details[0]["index"]
        self.output_shape = self.output_details[0]["index"]

        self.print_model_details()

    def print_model_details(self):
        print("Input details:")
        pprint.pprint(self.input_details)
        print()
        print("Output details:")
        pprint.pprint(self.output_details)
        print("=" * 50)

        input_shape = self.input_details[0]["shape"]
        print("Expected input shape:", input_shape)

    def predict(self, input_data):
        self.interpreter.set_tensor(self.input_shape, input_data)
        self.interpreter.invoke()

        result = self.interpreter.get_tensor(self.output_shape)

        return result


model = TFLiteModel()
model.load_model()

Input details:
[{'dtype': <class 'numpy.float32'>,
  'index': 0,
  'name': 'serving_default_keras_tensor:0',
  'quantization': (0.0, 0),
  'quantization_parameters': {'quantized_dimension': 0,
                              'scales': array([], dtype=float32),
                              'zero_points': array([], dtype=int32)},
  'shape': array([  1,  60, 225]),
  'shape_signature': array([ -1,  60, 225]),
  'sparsity_parameters': {}}]

Output details:
[{'dtype': <class 'numpy.float32'>,
  'index': 68,
  'name': 'StatefulPartitionedCall_1:0',
  'quantization': (0.0, 0),
  'quantization_parameters': {'quantized_dimension': 0,
                              'scales': array([], dtype=float32),
                              'zero_points': array([], dtype=int32)},
  'shape': array([1, 4]),
  'shape_signature': array([-1,  4]),
  'sparsity_parameters': {}}]
Expected input shape: [  1  60 225]


### Model Prediction

In [10]:
def process_frame(frame, image, threshold, skip_word):
    start_time = time.time()

    # Convert into mediapipe numpy type support uint8, uint16, or float32
    image = image.astype(np.uint8)

    # Convert cv image to mediapipe image format before being passed to detectors
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)

    try:
        hand_results = hand_detector.detect(image=mp_image)
        pose_results = pose_detector.detect(image=mp_image)

        landmarks = to_landmark_data(hand_results, pose_results)
    except:
        print(f"frame {frame} skipped")
        return frame, None, time.time() - start_time

    return frame, landmarks, time.time() - start_time


def predict_from_video(vid):
    clip = VideoFileClip(vid)

    avg_exec_time = []

    predictions = []
    sequences = []

    sentence = []
    threshold = 0.2
    skip_word = "_"

    results = []
    batch_size = 60

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_frame = {
            executor.submit(
                process_frame,
                frame,
                image,
                threshold,
                skip_word,
            ): frame
            for frame, image in enumerate(clip.iter_frames(fps=clip.fps))
        }

        for future in concurrent.futures.as_completed(future_to_frame):
            frame, landmarks, exec_time = future.result()
            avg_exec_time.append(exec_time)

            if landmarks is not None:
                results.append((frame, landmarks))

    # sort the results by frame number to ensure the order is correct
    results.sort(key=lambda x: x[0])

    for _, landmarks in results:
        sequences.append(landmarks)

        if len(sequences) < batch_size:
            continue

        # collect a batch of sequences
        batch_motion = np.stack(sequences[-batch_size:]).astype(np.float32)
        sequences = sequences[
            -(batch_size - 40) :
        ]  # keep the last 20 sequences for overlap

        # ensure correct input shape by adding an extra dimension for batch size
        batch_motion = np.expand_dims(batch_motion, axis=0)

        # predict the entire batch
        batch_result = model.predict(batch_motion)

        print(batch_result)

        for result in batch_result:
            # len of results is 480 (which is the total frame)?
            predicted = np.argmax(result)

            if (not result[predicted] > threshold) or not (
                ACTIONS[predicted] != skip_word
            ):
                continue

            if not predictions or predicted != predictions[-1]:
                predictions.append(predicted)

    print("===")
    print(predictions)
    print("===")

    for motion in predictions:
        sentence.append(ACTIONS[motion])

    return (
        sentence,
        len(results),
        {
            "avg_exec_time": avg_exec_time,
            "total_exec_time": sum(avg_exec_time),
        },
    )

In [11]:
sentence, frame, exec_time = predict_from_video("./videos/test_7.mp4")

print("=" * 50)
print("Total frame calculated:", frame)
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

# [7] hello, ily, ty
# [8] hello, ily, hello, ily, ty

[[1.7529997e-08 9.9999416e-01 2.5141139e-08 5.8510623e-06]]
[[2.4833531e-08 9.9999154e-01 2.6527182e-08 8.5217362e-06]]
[[1.2964776e-05 9.9575233e-01 4.0977676e-03 1.3703598e-04]]
[[9.999311e-01 7.603475e-09 6.263397e-05 6.314619e-06]]
[[2.0190768e-04 1.0966036e-03 1.6786222e-01 8.3083922e-01]]
[[2.1541487e-06 7.8220285e-07 2.1297728e-06 9.9999499e-01]]
[[1.4918790e-05 1.1484728e-05 8.2987797e-05 9.9989057e-01]]
[[9.9935776e-01 1.2402447e-09 3.1079790e-07 6.4194395e-04]]
[[9.9871862e-01 1.5333198e-06 5.3564180e-04 7.4422621e-04]]
[[3.2106765e-08 3.1373168e-07 9.9999321e-01 6.4372425e-06]]
[[0.42401055 0.00159795 0.02899794 0.5453936 ]]
===
[1, 3, 2, 3]
===
Total frame calculated: 480
Average execution time per frame: 0.27807007332642875
Predicted sentence: ['hello', 'i-love-you', 'thanks', 'i-love-you']


In [12]:
sentence, frame, exec_time = predict_from_video("./videos/test_6.mp4")

print("=" * 50)
print("Total frame calculated:", frame)
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

# [1] ily
# [2] ily
# [3]
# [4]
# [5] ily
# [6] ily
# [7] hello, ily, ty
# [8] hello, ily, hello, ily, ty

[[5.7601414e-06 2.6713902e-04 8.9478322e-05 9.9963760e-01]]
===
[3]
===
Total frame calculated: 72
Average execution time per frame: 0.24871366884973314
Predicted sentence: ['i-love-you']
