In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
import sys
# Add the Drive "model" folder to the import path.
# NOTE(review): presumably this is where hand_landmarks_detector and params
# (imported in a later cell) live; confirm.
sys.path.append('/content/drive/MyDrive/model')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install MediaPipe, which a later cell imports.
# NOTE(review): the version is not pinned, so a rerun may pull a different
# release; consider `%pip install mediapipe==<version>` for reproducibility.
!pip install mediapipe


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from hand_landmarks_detector import mp_process, get_hand
import pandas as pd
import cv2
import mediapipe as mp
import numpy as np
import pickle
import params
import os
from os.path import isfile, join
import time


# Pipeline input: a batch of frames
# Pipeline output: the predicted verb

# When True, detect_landmarks() also calls visualize_hand_landmarks() for
# frames in which hands were detected.
VISUALIZE = False

# Short handles for the MediaPipe hands solution and its drawing helpers.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
# Label table from params; get_key_from_val() searches its values for the
# model prediction and returns the matching key.
labels = params.classes


def read_image_files(dir):
    """Return the names of the regular files directly inside ``dir``, sorted.

    Sub-directories are skipped. ``os.listdir`` yields entries in arbitrary
    order, so the names are sorted to give callers a deterministic sequence
    (the benchmark driver uses the list position as the frame timestamp).

    Note: the sort is lexicographic, so frame names should be zero-padded
    ("frame_02" sorts before "frame_10", but "frame_10" sorts before
    "frame_2").

    Args:
        dir: path of the directory to scan.

    Returns:
        list[str]: file names (not full paths) in ascending order.
    """
    return sorted(f for f in os.listdir(dir) if isfile(join(dir, f)))


def process_verb(image_batch, model, labels):
    """Run the full frames-to-verb pipeline once and time it.

    The batch is converted to landmark windows by process_mediapipe(), the
    model prediction is obtained through model_predict(), and the prediction
    is mapped back to its label key with get_key_from_val().

    Args:
        image_batch: iterable of [timestamp, image] pairs.
        model: object passed through to model_predict().
        labels: dict whose values are compared against the prediction.

    Returns:
        tuple: (elapsed seconds as float, key of ``labels`` matching the
        prediction).
    """
    # Remember to check whether the model is correctly saved!
    # Batch-size validation with check_batches_size() is currently disabled.
    # NOTE(review): model_predict() returns an array; the label lookup below
    # relies on it holding a single prediction -- confirm for larger batches.

    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be distorted by system clock adjustments.
    start = time.perf_counter()
    data = process_mediapipe(image_batch)
    pred = model_predict(data, model)
    verb = get_key_from_val(labels, pred)
    elapsed = time.perf_counter() - start
    return elapsed, verb


def process_mediapipe(image_batch):
    """Turn a batch of frames into float32 landmark windows.

    Each frame is passed to detect_landmarks(); the per-frame rows are
    concatenated, ordered by their index, passed through check_timestamp(),
    stripped of their first column and windowed by data_window().

    Args:
        image_batch: iterable of [timestamp, image] pairs. The batch is meant
            to hold as many frames as the model's time steps.

    Returns:
        numpy.ndarray: the output of data_window() converted to float32.

    Raises:
        ValueError: if ``image_batch`` contains no frames (previously this
            surfaced as an UnboundLocalError on the return statement).
    """
    frames = [detect_landmarks(image, timestamp) for timestamp, image in image_batch]
    if not frames:
        raise ValueError("image_batch is empty; at least one [timestamp, image] pair is required")

    # Sort by the row index (each row is indexed by its timestamp). The axis
    # is left at its default instead of being passed positionally, which
    # pandas 2 rejects.
    hand_landmarks_data = pd.concat(frames).sort_index()
    hand_landmarks_data = check_timestamp(hand_landmarks_data)

    # Drop the first column (the timestamp) so only landmark values remain.
    features = hand_landmarks_data.iloc[:, 1:]
    windows = data_window(features)
    return np.asarray(windows).astype("float32")


def df_to_add(new_ts):
    """Build the zero-padding row used for frames without a detection.

    Args:
        new_ts: value stored under the "timestamp" key.

    Returns:
        dict: "timestamp" first, followed by one "<hand>_<landmark>_<axis>"
        key for every hand (0-1), landmark (0-20) and axis (x, y, z), in that
        nesting order, each mapped to the integer 0.
    """
    df_add = {"timestamp": new_ts}
    for hand in range(2):
        for landmark in range(21):
            for axis in ("x", "y", "z"):
                df_add[f"{hand}_{landmark}_{axis}"] = 0
    return df_add


def check_timestamp(df):
    """Fill gaps in the "timestamp" column with zero-padded rows.

    For every pair of consecutive rows whose timestamps differ by more than
    one, a row from df_to_add() is inserted for each missing timestamp.

    Args:
        df: DataFrame with a "timestamp" column, expected in ascending order.

    Returns:
        pandas.DataFrame: the rows sorted by "timestamp". When rows were
        added the index is reset (as with the former ``ignore_index=True``
        append); otherwise the original index is kept.
    """
    ts = df["timestamp"].to_list()

    # Compare each timestamp with its real predecessor only; the first row
    # has none (the old loop compared it with the last row via ts[-1]).
    missing_rows = [
        df_to_add(new_ts)
        for previous, current in zip(ts, ts[1:])
        for new_ts in range(int(previous) + 1, int(current))
    ]

    if missing_rows:
        # DataFrame.append was removed in pandas 2.0; a single concat also
        # avoids re-copying the frame for every inserted row.
        df = pd.concat([df, pd.DataFrame(missing_rows)], ignore_index=True)

    return df.sort_values(by="timestamp")


def get_key_from_val(dic, val):
    """Return the first key of ``dic`` whose value equals ``val``.

    Every item is compared; the first match in the dict's iteration order is
    returned. An IndexError is raised when no value matches.
    """
    matches = []
    for key, value in dic.items():
        if value == val:
            matches.append(key)
    return matches[0]


def detect_landmarks(image, timestamp):
    """Produce the landmark row for a single frame.

    The frame is run through mp_process(). If no hand landmarks come back, a
    one-row DataFrame built from df_to_add() and indexed by ``timestamp`` is
    returned. Otherwise the result of format_results_to_dataframe() is
    returned, and the landmarks are additionally passed to
    visualize_hand_landmarks() when the module flag VISUALIZE is set.
    """
    results = mp_process(image)

    # No detection: fall back to the zero-padded row.
    if not results.multi_hand_landmarks:
        return pd.DataFrame(df_to_add(timestamp), index=[timestamp])

    frame_row = format_results_to_dataframe(results, timestamp)
    if VISUALIZE:
        visualize_hand_landmarks(image, results)
    return frame_row


def format_results_to_dataframe(results, timestamp):
    """Convert one frame's hand-landmark results into a one-row DataFrame.

    The row always has the same columns in the same order: "timestamp", then
    "<hand>_<landmark>_<axis>" for hand 0-1, landmark 0-20 and axis x/y/z.
    Values for a hand that was not detected stay at 0.0.

    The previous implementation only filled the missing hand when
    ``len(data) == 21 * 3 + 2``, which never holds (one detected hand yields
    1 + 63 = 64 entries), so single-hand frames lacked 63 columns and turned
    into NaN once concatenated with other frames. Its column order also
    depended on which hand was reported first.

    Args:
        results: object exposing ``multi_hand_landmarks`` and
            ``multi_handedness`` (parallel sequences, one entry per hand).
        timestamp: value stored in the "timestamp" column and used as the
            row index.

    Returns:
        pandas.DataFrame: a single row indexed by ``timestamp``.
    """
    data = {"timestamp": timestamp}

    # Start from a zero-filled row in canonical column order.
    for hand_id in (0, 1):
        for idx_dp in range(21):
            for axis in ("x", "y", "z"):
                data[f"{hand_id}_{idx_dp}_{axis}"] = 0.0

    for idx_hl, hand_landmarks in enumerate(results.multi_hand_landmarks):
        classification = list(results.multi_handedness[idx_hl].classification)
        # 0: Left, 1: Right (per the original comment; get_hand is defined in
        # hand_landmarks_detector).
        hand = int(get_hand(classification))
        for idx_dp, data_point in enumerate(hand_landmarks.landmark):
            data[f"{hand}_{idx_dp}_x"] = data_point.x
            data[f"{hand}_{idx_dp}_y"] = data_point.y
            data[f"{hand}_{idx_dp}_z"] = data_point.z

    return pd.DataFrame(data, index=[timestamp])


def visualize_hand_landmarks(
    image,
    results,
    debug=False,
):
    """Draw every detected hand onto a copy of ``image``.

    The input image itself is not modified; drawing happens on a copy. With
    ``debug`` enabled, the landmarks and the index-finger-tip position scaled
    by the image width/height are printed for each hand, and the annotated
    copy is shown in an OpenCV window until Escape (key code 27) is pressed.
    Nothing is returned.
    """
    height, width, _ = image.shape
    canvas = image.copy()

    for hand in results.multi_hand_landmarks:
        if debug:
            print("hand_landmarks:", hand)
            tip = hand.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP]
            print(
                "Index finger tip coordinates: (",
                f"{tip.x * width}, {tip.y * height})",
            )

        mp_drawing.draw_landmarks(
            canvas,
            hand,
            mp_hands.HAND_CONNECTIONS,
            mp_drawing_styles.get_default_hand_landmarks_style(),
            mp_drawing_styles.get_default_hand_connections_style(),
        )

        # Debug preview: keep refreshing the window until Escape is pressed.
        while debug == True:
            cv2.imshow("mediapipe_hand_image", canvas)
            if cv2.waitKey(1) == 27:
                cv2.destroyAllWindows()
                break



In [7]:
# Number of consecutive rows in each window built by data_window();
# check_batches_size() also compares the batch length against it.
time_steps = 10
# Offset, in rows, between the starts of consecutive windows in data_window().
stride = 1

def data_window(X, window_size=None, step=None):
    """Cut ``X`` into overlapping windows along its first axis.

    With a window of 10 and a step of 1 the windows cover rows
    [0..9], [1..10], ..., [m-10..m-1], turning 2D data into a 3D stack.

    The previous loop iterated ``range(len(X) // stride - 1)`` and therefore
    dropped the last valid window whenever the window was short relative to
    the stride (e.g. a window of 1). The start positions are now derived
    directly from the data length.

    Args:
        X: sliceable sequence of rows (e.g. a DataFrame or array).
        window_size: rows per window; defaults to the module-level
            ``time_steps``.
        step: offset between window starts; defaults to the module-level
            ``stride``.

    Returns:
        numpy.ndarray: object-dtype array holding every full window; empty
        when ``X`` has fewer rows than ``window_size``.

    Raises:
        ValueError: if ``window_size`` or ``step`` is smaller than 1.
    """
    if window_size is None:
        window_size = time_steps
    if step is None:
        step = stride
    if window_size < 1 or step < 1:
        raise ValueError("window_size and step must both be at least 1")

    # Every start for which a complete window still fits inside X.
    starts = range(0, len(X) - window_size + 1, step)
    windows = [X[start : start + window_size] for start in starts]

    return np.array(windows, dtype=object)

def check_batches_size(image_batch):
    """Check that the batch holds exactly ``time_steps`` frames.

    Args:
        image_batch: sized collection of frames.

    Returns:
        bool: True when the length matches the module-level ``time_steps``;
        otherwise a message is printed and False is returned (the failure
        branch used to fall through and return None implicitly).
    """
    if len(image_batch) == time_steps:
        return True

    print(
        len(image_batch),
        "is not fit for required image batch length for current model:",
        time_steps,
    )
    return False


model_name = "/content/drive/MyDrive/model/finalized_model_5_bags_LSTM_128.sav"

# SECURITY: unpickling runs arbitrary code embedded in the file, so only load
# model files that come from a trusted source.
# The context manager closes the file handle once the model is loaded.
with open(model_name, "rb") as model_file:
    model = pickle.load(model_file)

def model_predict(data, model):
    """Return the index of the highest score in each row of the model output.

    Args:
        data: batch handed to ``model.predict_on_batch``.
        model: object providing ``predict_on_batch``.

    Returns:
        numpy.ndarray: argmax along axis 1 of the scores returned by the model.
    """
    scores = model.predict_on_batch(data)
    best_class = np.argmax(scores, axis=1)
    return np.array(best_class)

# NOTE(review): `dir` shadows the Python builtin; the name is kept because
# other cells may rely on it.
dir = "/content/drive/MyDrive/model/test"

# The position in this list becomes the frame timestamp, so the order must be
# deterministic; os.listdir() returns names in arbitrary order. The sort is
# lexicographic, so file names should be zero-padded.
images = sorted(read_image_files(dir))
image_batch = []
for idx, file in enumerate(images):
    image_path = os.path.join(dir, file)
    image = cv2.imread(image_path)
    # cv2.imread signals an unreadable or non-image file by returning None.
    if image is None:
        raise ValueError(f"Could not read image file: {image_path}")
    image_batch.append([idx, image])

# Benchmark: 10 rounds of 50 pipeline runs each.
t_mean = []
t_std = []
for j in range(10):
    print(j)
    t_long = []
    for i in range(50):
        t, verb = process_verb(image_batch, model, labels)
        print(t)
        t_long.append(t)
    # Discard the first run of each round before computing the statistics.
    t_long.pop(0)
    t_mean.append(np.mean(t_long))
    t_std.append(np.std(t_long))

# Report the round with the lowest mean time together with its own spread.
best_round = int(np.argmin(t_mean))
print("The current action is:", verb)
print("The average inference time is:", t_mean[best_round])
print(
    "The standard deviation of inference time is:",
    t_std[best_round],
)


0




1.144623041152954
0.8180015087127686
0.801224946975708
0.7959396839141846
0.8042192459106445
0.8067631721496582
0.7869927883148193
0.8047242164611816
0.7977995872497559
0.8184542655944824
0.8020343780517578
0.7962634563446045
0.8007934093475342
0.8059303760528564
0.8068444728851318
0.8352100849151611
0.8239288330078125
0.840569257736206
0.8007462024688721
0.8146641254425049
0.8070127964019775
0.794121503829956
0.7937288284301758
0.7988967895507812
0.788905143737793
0.7950749397277832
0.7976233959197998
0.8050639629364014
0.8414957523345947
0.8208420276641846
0.8190960884094238
0.7991864681243896
0.8006644248962402
0.8106977939605713
0.7990317344665527
0.7936975955963135
0.7989068031311035
0.7979562282562256
0.8193089962005615
0.8186347484588623
0.8086185455322266
0.811629056930542
0.803473711013794
0.8136312961578369
0.8059897422790527
0.8015453815460205
0.8190069198608398
0.8286910057067871
0.8365647792816162
0.8291637897491455
1
0.812152624130249
0.8072469234466553
0.8094496726989746