In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
%pip install tensorflow mediapipe opencv-python




In [5]:
import cv2  # type: ignore
import os
import time
import numpy as np  # type: ignore

import mediapipe as mp  # type: ignore

from matplotlib import pyplot as plt # type: ignore
from mediapipe.tasks import python  # type: ignore
from mediapipe.tasks.python import vision  # type: ignore
from mediapipe.framework.formats import landmark_pb2 # type: ignore

In [6]:
# Shorthand for MediaPipe's drawing helpers; the notebook uses its
# draw_landmarks() and DrawingSpec members when rendering detections.
drawer = mp.solutions.drawing_utils # drawing utilities
# Running-mode enum for the landmarker tasks; the detector options in this
# notebook all select its VIDEO member.
VisionRunningMode = mp.tasks.vision.RunningMode

In [7]:
# Base options pointing each landmarker at its model bundle. The paths are
# relative, so they resolve against the kernel's current working directory.
# NOTE(review): no visible cell downloads these .task files — presumably they
# must already exist under ./tasks; confirm before running on a fresh machine.
face_base_options = python.BaseOptions(model_asset_path="./tasks/face_landmarker.task")
hand_base_options = python.BaseOptions(model_asset_path="./tasks/hand_landmarker.task")
pose_base_options = python.BaseOptions(model_asset_path="./tasks/pose_landmarker.task")

In [8]:
# Each modality gets its own options object and detector. All three run in
# VIDEO mode, so they are driven with detect_for_video() and a timestamp.

# --- face: a single face, with blendshapes and transformation matrix enabled
face_options = vision.FaceLandmarkerOptions(
    running_mode=VisionRunningMode.VIDEO,
    num_faces=1,
    output_face_blendshapes=True,
    output_facial_transformation_matrixes=True,
    base_options=face_base_options,
)
face_detector = vision.FaceLandmarker.create_from_options(face_options)

# --- hands: up to two hands per frame
hand_options = vision.HandLandmarkerOptions(
    running_mode=VisionRunningMode.VIDEO,
    num_hands=2,
    base_options=hand_base_options,
)
hand_detector = vision.HandLandmarker.create_from_options(hand_options)

# --- pose: segmentation masks are requested alongside the landmarks
# NOTE(review): blendshapes, transformation matrices and segmentation masks
# are not read by any visible cell — consider disabling them if unused.
pose_options = vision.PoseLandmarkerOptions(
    running_mode=VisionRunningMode.VIDEO,
    output_segmentation_masks=True,
    base_options=pose_base_options,
)
pose_detector = vision.PoseLandmarker.create_from_options(pose_options)

In [9]:
# Short aliases for the protobuf message types that the drawing helpers below
# build: a list container and the individual normalized landmark it holds.
LandmarkList = landmark_pb2.NormalizedLandmarkList
NormalizedLandmark = landmark_pb2.NormalizedLandmark


def create_landmark_list(landmarks, num_keypoints):
    """Build a ``LandmarkList`` proto from detected landmarks, or a zero-filled one.

    Args:
        landmarks: Sequence of objects exposing ``x``, ``y`` and ``z``
            attributes. May be ``None`` or empty when nothing was detected.
        num_keypoints: Number of placeholder landmarks to emit when
            ``landmarks`` is falsy. Ignored when landmarks are provided.

    Returns:
        A ``LandmarkList`` holding one ``NormalizedLandmark`` per input
        landmark, or ``num_keypoints`` landmarks at ``(0.0, 0.0, 0.0)`` when
        no landmarks were given.
    """
    if landmarks:
        # copy the detected coordinates into protobuf landmarks
        points = [NormalizedLandmark(x=lm.x, y=lm.y, z=lm.z) for lm in landmarks]
    else:
        # only build the zero placeholders when they are actually needed;
        # this function is called several times per frame
        points = [
            NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(num_keypoints)
        ]

    return LandmarkList(landmark=points)


# Landmark counts used for the zero-filled placeholders. These are numbers of
# landmarks, not numbers of flattened coordinate values.
_FACE_LANDMARK_COUNT = 478
_POSE_LANDMARK_COUNT = 33
_HAND_LANDMARK_COUNT = 21


def extract_keypoints_for_drawing(face_results, pose_results, hand_results):
    """Convert face, pose and hand detection results into protos for drawing.

    Only the first detected face and the first detected pose are used. When a
    modality has no detections, a zero-filled placeholder list is produced.

    Args:
        face_results: Detection result exposing ``face_landmarks``.
        pose_results: Detection result exposing ``pose_landmarks``.
        hand_results: Detection result exposing ``hand_landmarks``.

    Returns:
        A tuple ``(face_landmarks, pose_landmarks, hand_landmarks)``. The face
        and pose entries are single ``LandmarkList`` messages. The hand entry
        is a list of ``LandmarkList`` messages: one per detected hand, or two
        placeholders when no hand was detected.
    """
    # first detected face, or None so a placeholder is generated
    face_landmarks_proto = create_landmark_list(
        face_results.face_landmarks[0] if face_results.face_landmarks else None,
        _FACE_LANDMARK_COUNT,
    )

    # first detected pose, or None so a placeholder is generated
    pose_landmarks_proto = create_landmark_list(
        pose_results.pose_landmarks[0] if pose_results.pose_landmarks else None,
        _POSE_LANDMARK_COUNT,
    )

    # one list per detected hand; fall back to two placeholder hands
    detected_hands = hand_results.hand_landmarks or [None, None]
    hand_landmarks_proto = [
        create_landmark_list(hand_landmarks, _HAND_LANDMARK_COUNT)
        for hand_landmarks in detected_hands
    ]

    return face_landmarks_proto, pose_landmarks_proto, hand_landmarks_proto

In [10]:
def extract_keypoints(face_results, pose_results, hand_results):
    """Flatten face, pose and hand landmarks into one feature vector per frame.

    Layout of the returned vector, in order:
      * face: 478 landmarks x (x, y, z)
      * pose: 33 landmarks x (x, y, z, visibility)
      * hands: 2 slots x 21 landmarks x (x, y, z)

    Only the first detected face and pose are used. Any modality without a
    detection contributes zeros, so the vector length is constant.

    Args:
      face_results: Object with a ``face_landmarks`` attribute.
      pose_results: Object with a ``pose_landmarks`` attribute.
      hand_results: Object with ``hand_landmarks`` and ``handedness``
                    attributes, or a falsy value when no hand data exists.

    Returns:
      A single 1-D NumPy array: the face, pose and hand keypoints concatenated.
    """
    # face block, zero-filled when no face was found
    if face_results.face_landmarks:
        face_keypoints = np.array(
            [[p.x, p.y, p.z] for p in face_results.face_landmarks[0]]
        ).flatten()
    else:
        face_keypoints = np.zeros(478 * 3)

    # pose block, zero-filled when no pose was found
    if pose_results.pose_landmarks:
        pose_keypoints = np.array(
            [[p.x, p.y, p.z, p.visibility] for p in pose_results.pose_landmarks[0]]
        ).flatten()
    else:
        pose_keypoints = np.zeros(33 * 4)

    # two hand slots, each 21 landmarks x 3 coordinates; untouched slots stay zero
    hand_keypoints = np.zeros((2, 21, 3))

    if hand_results:
        for idx, landmarks in enumerate(hand_results.hand_landmarks):
            # the slot is chosen by the handedness category index of this hand
            # NOTE(review): the original comment stated 0 = right, 1 = left;
            # confirm against the label MediaPipe reports for each index.
            slot = hand_results.handedness[idx][0].index
            hand_keypoints[slot] = np.array([[p.x, p.y, p.z] for p in landmarks])

    return np.concatenate([face_keypoints, pose_keypoints, hand_keypoints.flatten()])

In [11]:
def draw_detection_landmark(
    image,
    face_landmarks_proto=None,
    pose_landmarks_proto=None,
    hand_landmarks_proto=None,
):
    """Draw face, pose and hand landmarks onto ``image``.

    Args:
        image: Image array passed to ``drawer.draw_landmarks`` as the canvas.
        face_landmarks_proto: Landmark list for the face, or ``None`` to skip.
        pose_landmarks_proto: Landmark list for the pose, or ``None`` to skip.
        hand_landmarks_proto: Iterable of landmark lists, one per hand, or
            ``None`` to skip drawing hands.

    Returns:
        None. Nothing is returned; all drawing is delegated to
        ``drawer.draw_landmarks``.
    """
    # draw landmark face
    if face_landmarks_proto is not None:
        drawer.draw_landmarks(
            image,
            face_landmarks_proto,
            mp.solutions.face_mesh.FACEMESH_CONTOURS,
            drawer.DrawingSpec(color=(80, 60, 20), thickness=1, circle_radius=1),
            drawer.DrawingSpec(color=(80, 146, 241), thickness=1, circle_radius=1),
        )

    # draw landmark pose
    if pose_landmarks_proto is not None:
        drawer.draw_landmarks(
            image,
            pose_landmarks_proto,
            mp.solutions.pose.POSE_CONNECTIONS,
            drawer.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=3),
            drawer.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2),
        )

    # draw landmarks for every hand supplied; the declared default of None
    # previously crashed on len(None), so treat it as "no hands"
    for single_hand_proto in hand_landmarks_proto or []:
        drawer.draw_landmarks(
            image,
            single_hand_proto,
            mp.solutions.hands.HAND_CONNECTIONS,
            drawer.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=2),
            drawer.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2),
        )

In [12]:
def calculate_fps(start_time, frames):
    """Return the average frame rate since ``start_time``.

    Args:
        start_time: Reference moment as returned by ``time.time()``.
        frames: Number of frames processed since ``start_time``.

    Returns:
        ``frames`` divided by the elapsed seconds, or ``0`` when no positive
        amount of time has elapsed.
    """
    elapsed_time = time.time() - start_time
    if elapsed_time > 0:
        return frames / elapsed_time
    return 0


def draw_fps(image, fps):
    """Write the frame rate, rounded to two decimals, near the top-left of ``image``.

    Args:
        image: Image array handed to ``cv2.putText``.
        fps: Frame rate value to display.
    """
    label = f"FPS: {round(fps, 2)}"
    origin = (10, 40)  # position of the text's bottom-left corner, in pixels
    green = (0, 255, 0)
    font_scale, thickness = 1.5, 2

    cv2.putText(
        image, label, origin, cv2.FONT_HERSHEY_SIMPLEX, font_scale, green, thickness
    )

In [13]:
# Fill colours for the per-class confidence bars, one entry per class index.
colors = [
    (245, 117, 16),
    (117, 245, 16),
    (16, 117, 245),
    (117, 117, 16),
    (16, 245, 117),
    (245, 117, 245),
]

def confidence_bar(res, actions, input_frame, colors):
    """Overlay one labelled confidence bar per class on a copy of the frame.

    Args:
        res: Sequence of per-class scores; bar ``i`` is ``int(res[i] * 100)``
            pixels wide.
        actions: Sequence of class labels, indexed in step with ``res``.
        input_frame: Image array; it is copied and left unmodified.
        colors: Non-empty sequence of colour tuples. It is cycled when there
            are more classes than colours, so a short palette no longer
            raises ``IndexError``.

    Returns:
        A copy of ``input_frame`` with the bars and labels drawn on it.
    """
    output_frame = input_frame.copy()

    for num, prob in enumerate(res):
        # each class gets a 30 px tall bar, stacked every 40 px from y = 60
        top = 60 + num * 40

        cv2.rectangle(
            output_frame,
            (0, top),
            (int(prob * 100), top + 30),
            colors[num % len(colors)],
            -1,  # negative thickness draws a filled rectangle
        )

        cv2.putText(
            output_frame,
            actions[num],
            (0, top + 25),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (255, 255, 255),
            2,
            cv2.LINE_AA,
        )

    return output_frame

In [14]:
# Sign vocabulary. The complete list is kept for reference, but only its
# first six entries are active: ACTIONS is the array the classifier's output
# indices are mapped onto.
_SIGN_VOCABULARY = [
    "hello", "thanks", "i-love-you", "see-you-later",
    "I", "Father", "Mother", "Yes",
    "No", "Help", "Please", "Want",
    "What", "Again", "Eat", "Milk",
    "More", "Go To", "Bathroom", "Fine",
    "Like", "Learn", "Sign", "Done",
]

ACTIONS = np.array(_SIGN_VOCABULARY)[:6]

In [15]:
# Show the active label set so the class-index -> label mapping is visible.
ACTIONS

array(['hello', 'thanks', 'i-love-you', 'see-you-later', 'I', 'Father'],
      dtype='<U13')

In [16]:
from tensorflow.keras.models import Sequential  # type: ignore
from tensorflow.keras.layers import BatchNormalization, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, TimeDistributed, Reshape, Bidirectional  # type: ignore
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau  # type: ignore
from tensorflow.keras.regularizers import l2  # type: ignore
from tensorflow.keras.optimizers import Adam # type: ignore

In [17]:
# Model input is (sequence length, features per frame): 30 frames, each a
# 1692-value vector. The feature width matches the layout produced by
# extract_keypoints():
#   face 478 x (x, y, z) + pose 33 x (x, y, z, visibility) + 2 hands x 21 x (x, y, z)
input_shape = (30, 478 * 3 + 33 * 4 + 2 * 21 * 3)

In [18]:
# CNN + bidirectional-LSTM sign classifier, declared as one ordered layer
# stack. Layer names are set explicitly and kept unchanged.
model = Sequential(
    [
        # data normalization
        BatchNormalization(input_shape=input_shape, name="batch_normalization"),
        # three Conv1D -> MaxPooling1D -> Dropout stages; the convolutions slide
        # along the frame axis and the filter count doubles at each stage
        Conv1D(filters=64, kernel_size=3, activation="relu", name="conv1d"),
        MaxPooling1D(pool_size=2, name="max_pooling1d"),
        Dropout(0.3, name="dropout1"),
        Conv1D(filters=128, kernel_size=3, activation="relu", name="conv1d_1"),
        MaxPooling1D(pool_size=2, name="max_pooling1d_1"),
        Dropout(0.3, name="dropout2"),
        Conv1D(filters=256, kernel_size=3, activation="relu", name="conv1d_2"),
        MaxPooling1D(pool_size=2, name="max_pooling1d_2"),
        Dropout(0.4, name="dropout3"),
        # dense layer before LSTM
        Dense(64, activation="relu", name="dense"),
        Dropout(0.5, name="dropout4"),
        # single Bidirectional LSTM layer; only the final output is returned
        Bidirectional(LSTM(512, return_sequences=False), name="bidirectional"),
        # fully connected layers for classification
        Dense(128, activation="relu", name="dense_1"),
        Dropout(0.5, name="dropout5"),
        Dense(64, activation="relu", name="dense_2"),
        Dropout(0.3, name="dropout6"),
        # output layer: softmax over the six sign classes
        Dense(6, activation="softmax", name="dense_3"),
    ]
)

# Load pre-trained weights
model.load_weights("../models/legacy/asl-action-cnn-lstm_1l-6a-es_p30__rlr_f05_p10_lr1e5-2.9M.h5")

# Optional fine-tuning recipe (disabled): freeze all layers except the last few
# for layer in model.layers[:-6]:
#     layer.trainable = False






In [19]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization (Batch  (None, 30, 1692)          6768      
 Normalization)                                                  
                                                                 
 conv1d (Conv1D)             (None, 28, 64)            324928    
                                                                 
 max_pooling1d (MaxPooling1  (None, 14, 64)            0         
 D)                                                              
                                                                 
 dropout1 (Dropout)          (None, 14, 64)            0         
                                                                 
 conv1d_1 (Conv1D)           (None, 12, 128)           24704     
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 6, 128)            0

In [20]:
%pip install graphviz pydot

Note: you may need to restart the kernel to use updated packages.


In [23]:
# Offline sanity check: run the classifier on one recorded sample and print
# the predicted label. Each sample is stored as one .npy file per frame,
# named 0.npy, 1.npy, ...
SAMPLE_ACTION = "i-love-you"
SAMPLE_ID = "5"
sample_dir = os.path.join("./datasets", SAMPLE_ACTION, SAMPLE_ID)

# load as many frames as the model expects along its sequence axis
window = [
    np.load(os.path.join(sample_dir, f"{frame_idx}.npy"))
    for frame_idx in range(input_shape[0])
]

action = np.array(window)

# the model expects a leading batch axis; [0] takes the single result back out
pred = model.predict(np.expand_dims(action, axis=0))[0]

print(ACTIONS[np.argmax(pred)])

i-love-you


In [None]:
# State shared with the webcam loop. Re-running this cell resets it.

# every per-frame keypoint vector captured so far (grows for the whole session)
sequences = []
# the most recent window of frames, re-sliced from `sequences` on every frame
sequence = []

# recognised labels shown in the banner at the top of the video window
sentence = []
# history of predicted class indices, one entry per model call
predictions = []

# number of frames that must be available before the model is called
sequence_length = 30
# minimum score the top class needs before its label is added to `sentence`
threshold = 0.5

In [None]:
# Live recognition loop: grab webcam frames, run the three landmarkers, feed
# the most recent window of keypoints to the classifier and overlay the
# results. Press "q" in the video window to stop.
cap = cv2.VideoCapture(0)

# set capture properties
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 600)  # set width to 600 pixels
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 600)  # set height to 600 pixels
cap.set(cv2.CAP_PROP_FPS, 60)  # set frame rate to 60 FPS

start_time = time.time()
isQuit = False

# number of most recent predictions that take part in the stability vote
vote_window = 10

# last timestamp handed to the detectors, used to keep timestamps increasing
last_timestamp_ms = -1

try:
    while cap.isOpened():
        success, image = cap.read()

        if not success:
            print("Ignoring empty camera frame.")
            continue

        # NOTE: flipping the image would change the keypoint data relative to
        #       what the model was trained on, so the frame is used as captured
        # image = cv2.flip(image, 1) # flip the image horizontally for a selfie-view display.

        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Frame timestamp in milliseconds. A monotonic clock is used instead
        # of CAP_PROP_POS_MSEC, which reports a video-file position and is
        # backend dependent for live cameras. max() keeps the value strictly
        # increasing, as VIDEO running mode requires. The monotonic clock
        # also keeps increasing across re-runs of this cell, so the
        # long-lived detectors never see a timestamp go backwards.
        timestamp_ms = max(int(time.monotonic() * 1000), last_timestamp_ms + 1)
        last_timestamp_ms = timestamp_ms

        # convert cv image to mediapipe image format before being
        # passed to face, pose and hand detector
        annotated_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image_rgb)

        # each detector tracks its own timestamps, so all three can share one
        face_results = face_detector.detect_for_video(
            image=annotated_image, timestamp_ms=timestamp_ms
        )

        hand_results = hand_detector.detect_for_video(
            image=annotated_image, timestamp_ms=timestamp_ms
        )

        pose_results = pose_detector.detect_for_video(
            image=annotated_image, timestamp_ms=timestamp_ms
        )

        keypoints = extract_keypoints(face_results, pose_results, hand_results)
        # NOTE(review): `sequences` keeps every frame of the session and is
        # inspected by a later cell; cap its length if sessions run long.
        sequences.append(keypoints)
        sequence = sequences[-sequence_length:]

        face_proto, pose_proto, hand_proto = extract_keypoints_for_drawing(
            face_results, pose_results, hand_results
        )

        draw_detection_landmark(
            image_rgb,
            face_landmarks_proto=face_proto,
            pose_landmarks_proto=pose_proto,
            hand_landmarks_proto=hand_proto,
        )

        if len(sequence) == sequence_length:
            # predict the action label based on the sequence of keypoints;
            # the window is expanded with a batch dimension for the model
            result = model.predict(np.expand_dims(sequence, axis=0))[0]

            # action class with the highest confidence score
            predicted_idx = np.argmax(result)
            predictions.append(predicted_idx)

            # Accept the prediction only when it is also the most frequent
            # class among the last `vote_window` predictions, which filters
            # out momentary flickers. np.bincount().argmax() gives the most
            # frequent class; np.unique(...)[0] would give the smallest label.
            most_common_idx = np.bincount(predictions[-vote_window:]).argmax()

            if most_common_idx == predicted_idx and result[predicted_idx] > threshold:
                label = ACTIONS[predicted_idx]

                # append only when the label differs from the last one shown
                if not sentence or label != sentence[-1]:
                    sentence.append(label)

            # keep only the three most recent labels so the text fits the banner
            if len(sentence) > 3:
                sentence = sentence[-3:]

            # overlay the per-class confidence bars on the image
            image_rgb = confidence_bar(result, ACTIONS, image_rgb, colors)

        # banner with the recognised labels
        cv2.rectangle(image_rgb, (0, 0), (640, 40), (245, 117, 16), -1)
        cv2.putText(
            image_rgb,
            " ".join(sentence),
            (3, 30),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (255, 255, 255),
            2,
            cv2.LINE_AA,
        )

        # the working image is RGB; OpenCV windows expect BGR
        cv2.imshow(
            "MediaPipe Detection",
            cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR),
        )

        if cv2.waitKey(10) & 0xFF == ord("q"):
            break
finally:
    # always release the camera and close the window, even if a frame fails
    cap.release()
    cv2.destroyAllWindows()





In [None]:
# Peek at the first 30 per-frame keypoint vectors captured by the webcam loop.
sequences[:30]

[array([0., 0., 0., ..., 0., 0., 0.]),
 array([ 0.6738947 ,  1.04895377, -0.01673681, ...,  0.        ,
         0.        ,  0.        ]),
 array([ 0.67149955,  1.03560805, -0.01615892, ...,  0.        ,
         0.        ,  0.        ]),
 array([ 0.66170722,  1.03462029, -0.01478392, ...,  0.        ,
         0.        ,  0.        ]),
 array([ 0.66107965,  1.02624309, -0.01210512, ...,  0.        ,
         0.        ,  0.        ]),
 array([ 0.66148943,  1.03649485, -0.01243832, ...,  0.        ,
         0.        ,  0.        ]),
 array([ 0.65855277,  1.03850925, -0.00988946, ...,  0.        ,
         0.        ,  0.        ]),
 array([ 0.65870565,  1.0386132 , -0.00974403, ...,  0.        ,
         0.        ,  0.        ]),
 array([ 0.65872282,  1.03686655, -0.00771972, ...,  0.        ,
         0.        ,  0.        ]),
 array([ 0.66121763,  1.03527975, -0.006984  , ...,  0.        ,
         0.        ,  0.        ]),
 array([ 0.66111922,  1.03363144, -0.00708904, ..., 

In [None]:
# Inspect the most recent window with a leading batch axis added, i.e. the
# same array the webcam loop passes to model.predict().
np.expand_dims(sequence, axis=0)

array([[[ 0.57280719,  0.52622569, -0.03732424, ...,  0.57391572,
          0.58559418, -0.10670094],
        [ 0.57030427,  0.52614331, -0.03747835, ...,  0.57410204,
          0.58424765, -0.10982773],
        [ 0.5687421 ,  0.52671403, -0.03601618, ...,  0.57232231,
          0.58869779, -0.10868381],
        ...,
        [ 0.60675806,  0.57286114, -0.04001639, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.60902882,  0.57445043, -0.03971909, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.60947293,  0.57736653, -0.04024295, ...,  0.        ,
          0.        ,  0.        ]]])

In [None]:
# Re-run the classifier on the last window left behind by the webcam loop
# and print the label of the highest-scoring class.
last_window_batch = np.expand_dims(sequence, axis=0)
pred = model.predict(last_window_batch)[0]

best_idx = np.argmax(pred)
print(ACTIONS[best_idx])

i-love-you
