In [25]:
import os
import mediapipe as mp
import numpy as np
import tensorflow as tf
from mediapipe.framework.formats import landmark_pb2
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

#### Setting up for Mediapipe

In [26]:
drawer = mp.solutions.drawing_utils
VisionRunningMode = mp.tasks.vision.RunningMode

In [27]:
# base options for hand and pose detection models
hand_base_options = python.BaseOptions(model_asset_path="../tasks/hand_landmarker.task")
pose_base_options = python.BaseOptions(model_asset_path="../tasks/pose_landmarker.task")

In [28]:
# options for hand detection
hand_options = vision.HandLandmarkerOptions(
    base_options=hand_base_options,
    num_hands=2,
    min_hand_detection_confidence=0.6,
    min_hand_presence_confidence=0.6,
    min_tracking_confidence=0.1,
    running_mode=VisionRunningMode.IMAGE,
)

# options for pose detection
pose_options = vision.PoseLandmarkerOptions(
    base_options=pose_base_options,
    output_segmentation_masks=True,
    min_pose_detection_confidence=0.6,
    min_pose_presence_confidence=0.6,
    min_tracking_confidence=0.1,
    running_mode=VisionRunningMode.IMAGE,
)

# create detectors
hand_detector = vision.HandLandmarker.create_from_options(hand_options)
pose_detector = vision.PoseLandmarker.create_from_options(pose_options)

#### Extracting Mediapipe Landmark

In [29]:
empty_hand_landmark = np.zeros((2, 21, 3))  # right hand and left hand
empty_pose_landmark = np.zeros(33 * 3)

def to_landmark_data(
    hand_results: vision.HandLandmarkerResult, pose_results: vision.PoseLandmarkerResult
):
    """
    Extract keypoints from pose and hand results for dataset creation.
    """
    pose_landmark = empty_pose_landmark
    hand_landmark = empty_hand_landmark

    if pose_results.pose_world_landmarks:
        pose_landmark = np.array(
            [[lm.x, lm.y, lm.z] for lm in pose_results.pose_world_landmarks[0]]
        ).flatten()

    # if no hand results are available, return the empty hand keypoints
    # and concatenate it with face and pose keypoints
    if not hand_results:
        return np.concatenate([pose_landmark, hand_landmark.flatten()])

    # iterate over the detected hand landmarks
    for index, hlm in enumerate(hand_results.hand_world_landmarks):
        # determine the hand index (0 for right hand, 1 for left hand) using handedness information
        handedness = hand_results.handedness[index][0].index

        # extract the keypoints for the current hand and assign them to the appropriate index
        hand_landmark[handedness] = np.array([[lm.x, lm.y, lm.z] for lm in hlm])

    return np.concatenate([pose_landmark, hand_landmark.flatten()])

LandmarkList = landmark_pb2.NormalizedLandmarkList  # aliases for landmark types
NormalizedLandmark = landmark_pb2.NormalizedLandmark  # aliases for landmark types


def to_landmark_list(landmarks):
    """
    Create a LandmarkList from a list of landmarks or fill with empty values if no landmarks are provided.
    """
    return LandmarkList(
        landmark=([NormalizedLandmark(x=lm.x, y=lm.y, z=lm.z) for lm in landmarks])
    )


empty_pose_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(33 * 4)]
)

empty_hand_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(21 * 3)]
)


def to_drawing_landmark(hand_results, pose_results):
    """
    Convert pose and hand landmarks to LandmarkList for drawing.
    """
    pose_landmarks = (
        to_landmark_list(pose_results.pose_landmarks[0])
        if pose_results.pose_landmarks
        else empty_pose_landmarks
    )

    hand_landmarks = [empty_hand_landmarks, empty_hand_landmarks]

    if not hand_results:
        return pose_landmarks, None

    # iterate over the detected hand landmarks
    for index, hand_landmark in enumerate(hand_results.hand_landmarks):
        # determine the hand index (0 for right hand, 1 for left hand) using handedness information
        handedness = hand_results.handedness[index][0].index

        # extract the keypoints for the current hand and assign them to the appropriate index
        hand_landmarks[handedness] = to_landmark_list(hand_landmark)

    return pose_landmarks, hand_landmarks


def draw_landmark(image, hand_landmarks, pose_landmarks):
    """
    Draw detected landmarks on the image.
    """
    drawer.draw_landmarks(
        image,
        pose_landmarks,
        mp.solutions.pose.POSE_CONNECTIONS,
        drawer.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=3),
        drawer.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2),
    )

    if not hand_landmarks:
        return

    for hand_landmarks in hand_landmarks:
        drawer.draw_landmarks(
            image,
            hand_landmarks,
            mp.solutions.hands.HAND_CONNECTIONS,
            drawer.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=2),
            drawer.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2),
        )

#### Load Model

In [30]:
# action lables
ACTIONS = np.array([
    "_", "hello", "what's up", "how",
    "thanks", "you", "morning", "afternoon",
    "night", "me", "name", "fine",
    "happy", "yes", "no", "repeat",
    "please", "want", "good bye", "learn",
])

val_path = "../../storage/datasets/raw/"

In [31]:
def load_model(model_version=None):
    model_dir = "../../storage/models/keras"
    prefix = "singa_slr_v_"

    if model_version:
        version = f"{prefix}{model_version}.keras"
        ks_file = os.path.join(model_dir, version)

        model = tf.keras.models.load_model(ks_file)

        return version, model

    model_files = os.listdir(model_dir)

    # filter model files by filename prefix
    versions = [file for file in model_files if file.startswith(prefix)]

    # extract version numbers from filenames
    versions = [file.split("_")[-1] for file in versions]

    # convert version numbers to tuples of integers for comparison
    versions_int = [tuple(map(int, v.split(".")[0])) for v in versions]

    # find the index of the latest version
    latest_index = versions_int.index(max(versions_int))

    # load the latest model
    latest_model_path = model_files[latest_index]

    model = tf.keras.models.load_model(os.path.join(model_dir, latest_model_path))

    return latest_model_path, model


v, model = load_model()

In [32]:
f"using model {v}"

'using model singa_slr_v_014.keras'

In [33]:
model.summary()

#### Model Prediction

In [34]:
import numpy as np
import time
import concurrent.futures
from moviepy.editor import VideoFileClip

In [35]:
def process_frame(frame, image):
    start_time = time.time()

    # convert into mediapipe numpy type support uint8, uint16, or float32
    image = np.fliplr(image)
    image = image.astype(np.uint8)

    # convert cv image to mediapipe image format before being passed to detectors
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)

    hand_results = hand_detector.detect(image=mp_image)
    pose_results = pose_detector.detect(image=mp_image)

    landmarks = to_landmark_data(hand_results, pose_results)

    return frame, landmarks, time.time() - start_time

In [36]:
def predict_from_video(vid):
    clip = VideoFileClip(vid)

    avg_exec_time = []

    predictions = []
    sequences = []

    sentence = []
    threshold = 0.99
    skip_word = "_"

    results = []
    batch_size = 60

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_frame = {
            executor.submit(
                process_frame,
                frame,
                image,
            ): frame
            for frame, image in enumerate(clip.iter_frames(fps=clip.fps))
        }

        for future in concurrent.futures.as_completed(future_to_frame):
            frame, landmarks, exec_time = future.result()
            avg_exec_time.append(exec_time)

            if landmarks is not None:
                results.append((frame, landmarks))

    clip.close()

    start_time = time.time()

    # sort the results by frame number to ensure the order is correct
    results.sort(key=lambda x: x[0])

    for _, landmarks in results:
        sequences.append(landmarks)

        if len(sequences) < batch_size:
            continue

        # ensure correct input shape by adding an extra dimension for batch size
        batch_motion = np.expand_dims(np.stack(sequences[-batch_size:]), axis=0)

        # predict the motion
        result = model.predict(batch_motion, verbose=0)[0]

        # get the predicted class and its confidence
        predicted = np.argmax(result)
        confidence = result[predicted]

        # append to the predictions and accuracies list
        predictions.append(predicted)

        # only keep the last 20 predictions and their accuracies
        # predictions = predictions[-20:]

        predicted_sentence = ACTIONS[predicted]

        # determine most frequent prediction
        most_frequent_prediction = np.bincount(predictions[-10:]).argmax()
        print(most_frequent_prediction, "\t", result[predicted], "\t", predicted_sentence)

        if most_frequent_prediction != predicted:
            continue

        elif confidence < threshold:
            continue

        elif predicted_sentence == skip_word:
            continue

        elif not sentence or predicted_sentence != sentence[-1]:
            # print(confidence, "\t", predicted_sentence, "\t\t", result)
            sentence.append(predicted_sentence)

    end_time = time.time() - start_time

    return (
        sentence,
        len(results),
        {
            "pred_exec_time": end_time,
            "avg_exec_time": avg_exec_time,
            "total_exec_time": sum(avg_exec_time),
        },
    )

#### Video 1

In [37]:
sentence, frame, exec_time = predict_from_video("./demo/1.mp4")
true_sentence_1 = ["hello", "what's up", "how", "you", "morning"]

print("=" * 50)
print("Total frame calculated:", frame)
print("Total prediction execution:", np.mean(exec_time["pred_exec_time"]))
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

1 	 0.9992785 	 hello
1 	 0.99926275 	 hello
1 	 0.9992368 	 hello
1 	 0.9992424 	 hello
1 	 0.99926 	 hello
1 	 0.9991985 	 hello
1 	 0.99917954 	 hello
1 	 0.9992035 	 hello
1 	 0.99924195 	 hello
1 	 0.99919814 	 hello
1 	 0.99920756 	 hello
1 	 0.9992268 	 hello
1 	 0.9992692 	 hello
1 	 0.99922085 	 hello
1 	 0.99922085 	 hello
1 	 0.99923134 	 hello
1 	 0.9992576 	 hello
1 	 0.9991997 	 hello
1 	 0.99920374 	 hello
1 	 0.99919564 	 hello
1 	 0.9992204 	 hello
1 	 0.9991703 	 hello
1 	 0.9991666 	 hello
1 	 0.99914145 	 hello
1 	 0.99912924 	 hello
1 	 0.99907863 	 hello
1 	 0.9990952 	 hello
1 	 0.99907625 	 hello
1 	 0.9990767 	 hello
1 	 0.9990202 	 hello
1 	 0.99901223 	 hello
1 	 0.9989292 	 hello
1 	 0.99887866 	 hello
1 	 0.99879324 	 hello
1 	 0.9987826 	 hello
1 	 0.99867105 	 hello
1 	 0.99863166 	 hello
1 	 0.9984946 	 hello
1 	 0.99836546 	 hello
1 	 0.9980957 	 hello
1 	 0.99792045 	 hello
1 	 0.997609 	 hello
1 	 0.9973469 	 hello
1 	 0.9967796 	 hello
1 	 0.99643743

In [38]:
true_sentence_1 == sentence

True

#### Video 2

In [40]:
sentence, frame, exec_time = predict_from_video("./demo/2.mp4")
true_sentence_2 = ["you", "please", "repeat", "you", "name"]

print("=" * 50)
print("Total frame calculated:", frame)
print("Total prediction execution:", np.mean(exec_time["pred_exec_time"]))
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

5 	 0.999992 	 you
5 	 0.9999925 	 you
5 	 0.99999285 	 you
5 	 0.9999927 	 you
5 	 0.99999297 	 you
5 	 0.9999933 	 you
5 	 0.9999937 	 you
5 	 0.99999297 	 you
5 	 0.9999924 	 you
5 	 0.99999285 	 you
5 	 0.99999285 	 you
5 	 0.99999285 	 you
5 	 0.9999927 	 you
5 	 0.99999344 	 you
5 	 0.9999937 	 you
5 	 0.9999933 	 you
5 	 0.9999931 	 you
5 	 0.99999356 	 you
5 	 0.9999938 	 you
5 	 0.9999939 	 you
5 	 0.9999938 	 you
5 	 0.99999416 	 you
5 	 0.99999416 	 you
5 	 0.9999938 	 you
5 	 0.9999931 	 you
5 	 0.99999356 	 you
5 	 0.9999937 	 you
5 	 0.9999938 	 you
5 	 0.9999937 	 you
5 	 0.9999938 	 you
5 	 0.9999938 	 you
5 	 0.9999938 	 you
5 	 0.9999932 	 you
5 	 0.9999932 	 you
5 	 0.9999931 	 you
5 	 0.9999932 	 you
5 	 0.99999297 	 you
5 	 0.9999926 	 you
5 	 0.99999213 	 you
5 	 0.99999166 	 you
5 	 0.99998975 	 you
5 	 0.9999889 	 you
5 	 0.99998784 	 you
5 	 0.9999875 	 you
5 	 0.9999857 	 you
5 	 0.9999851 	 you
5 	 0.9999832 	 you
5 	 0.9999808 	 you
5 	 0.9999746 	 you
5 	 0

In [41]:
true_sentence_2 == sentence

True

#### Video 3

In [50]:
sentence, frame, exec_time = predict_from_video("./demo/3.mp4")

print("=" * 50)
print("Total frame calculated:", frame)
print("Total prediction execution:", np.mean(exec_time["pred_exec_time"]))
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

18 	 0.9916893 	 good bye
18 	 0.9930228 	 good bye
18 	 0.9943435 	 good bye
18 	 0.9950846 	 good bye
18 	 0.9957282 	 good bye
18 	 0.9965054 	 good bye
18 	 0.9973289 	 good bye
18 	 0.9974897 	 good bye
18 	 0.9974642 	 good bye
18 	 0.9978586 	 good bye
18 	 0.9982773 	 good bye
18 	 0.99847215 	 good bye
18 	 0.99854255 	 good bye
18 	 0.99864537 	 good bye
18 	 0.99881005 	 good bye
18 	 0.9988562 	 good bye
18 	 0.9987594 	 good bye
18 	 0.99878234 	 good bye
18 	 0.99888617 	 good bye
18 	 0.9989293 	 good bye
18 	 0.9988789 	 good bye
18 	 0.9988888 	 good bye
18 	 0.9989513 	 good bye
18 	 0.99897563 	 good bye
18 	 0.9989145 	 good bye
18 	 0.9989114 	 good bye
18 	 0.9989471 	 good bye
18 	 0.998946 	 good bye
18 	 0.99885094 	 good bye
18 	 0.9988532 	 good bye
18 	 0.9988733 	 good bye
18 	 0.99886465 	 good bye
18 	 0.99876714 	 good bye
18 	 0.99874496 	 good bye
18 	 0.99868995 	 good bye
18 	 0.99864465 	 good bye
18 	 0.9984939 	 good bye
18 	 0.9985032 	 good bye
