In [81]:
import cv2 as cv
import matplotlib.pyplot as plt
import os
import time
import mediapipe as mp
import numpy as np

In [82]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_face_mesh = mp.solutions.face_mesh 

In [83]:
def mediapipe_detection(image, model):
    image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv.cvtColor(image, cv.COLOR_RGB2BGR)
    return image, results

In [84]:
def draw_landmarks(image, result):
    if result.face_landmarks:
        mp_drawing.draw_landmarks(image, result.face_landmarks, mp_face_mesh.FACEMESH_TESSELATION)
    if result.pose_landmarks:
        mp_drawing.draw_landmarks(image, result.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
    if result.left_hand_landmarks:
        mp_drawing.draw_landmarks(image, result.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    if result.right_hand_landmarks:
        mp_drawing.draw_landmarks(image, result.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)


In [85]:
def draw_styled_landmarks(image, results):
    # Draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION, 
                             mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
                             mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             ) 
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             ) 


In [86]:
cap = cv.VideoCapture('sign.mp4')
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        ret, frame = cap.read()
        if not ret:
            print("Video ended or cannot read the frame.")
            break

        image, results = mediapipe_detection(frame, holistic)
            

        draw_styled_landmarks(image, results)

        cv.imshow('OpenCV feed', image)
        if cv.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv.destroyAllWindows()

I0000 00:00:1753188588.188032    9003 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1753188588.234091   91946 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.172.08), renderer: NVIDIA GeForce RTX 4070 SUPER/PCIe/SSE2
W0000 00:00:1753188588.268923   91922 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753188588.285603   91920 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753188588.286683   91934 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753188588.287262   91931 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000

In [87]:
results.right_hand_landmarks

In [88]:
frame

array([[[129, 127, 131],
        [123, 121, 125],
        [126, 124, 128],
        ...,
        [111, 115, 122],
        [111, 115, 122],
        [111, 115, 122]],

       [[130, 128, 132],
        [128, 126, 130],
        [131, 129, 133],
        ...,
        [112, 116, 123],
        [112, 116, 123],
        [112, 116, 123]],

       [[130, 128, 132],
        [133, 131, 135],
        [136, 134, 138],
        ...,
        [112, 116, 123],
        [112, 116, 123],
        [112, 116, 123]],

       ...,

       [[164, 164, 158],
        [163, 163, 157],
        [162, 162, 156],
        ...,
        [ 89, 102, 122],
        [ 86, 100, 116],
        [ 82,  96, 112]],

       [[157, 157, 151],
        [158, 158, 152],
        [161, 161, 155],
        ...,
        [ 91, 104, 124],
        [ 87, 101, 117],
        [ 83,  97, 113]],

       [[150, 150, 144],
        [152, 152, 146],
        [157, 157, 151],
        ...,
        [ 92, 105, 125],
        [ 88, 102, 118],
        [ 83,  97, 113]]

## Extracting Keypoint Values

In [89]:
results.pose_landmarks.landmark[0]

x: 0.266805142
y: 0.374002576
z: -1.24380577
visibility: 0.999544501

In [90]:
results.pose_landmarks.landmark[0].x

0.26680514216423035

In [91]:
results.pose_landmarks.landmark[0].y

0.3740025758743286

In [92]:
results.pose_landmarks.landmark[0].z

-1.2438057661056519

In [93]:
pose = []

for res in results.pose_landmarks.landmark:
   test = np.array([res.x, res.y, res.z, res.visibility])
   pose.append(test)

In [94]:
pose

[array([ 0.26680514,  0.37400258, -1.24380577,  0.9995445 ]),
 array([ 0.3214334 ,  0.3387368 , -1.26272428,  0.99940246]),
 array([ 0.34576905,  0.33926901, -1.26304495,  0.99945599]),
 array([ 0.37306494,  0.34020573, -1.26372147,  0.99952441]),
 array([ 0.27460417,  0.34077409, -1.12944484,  0.99908406]),
 array([ 0.26408359,  0.34251899, -1.12879312,  0.99904829]),
 array([ 0.25496683,  0.34405476, -1.12969458,  0.99892431]),
 array([ 0.47234085,  0.37020892, -1.06335783,  0.99963379]),
 array([ 0.31090054,  0.36999232, -0.46520954,  0.99900341]),
 array([ 0.31168774,  0.41561049, -1.17152596,  0.99978572]),
 array([ 0.25536445,  0.41506419, -0.99469882,  0.99953556]),
 array([ 0.75988311,  0.62633216, -0.89728701,  0.99895972]),
 array([ 0.14279129,  0.60651755, -0.13084136,  0.99665022]),
 array([ 0.89386326,  0.91334403, -0.97746664,  0.59083718]),
 array([-0.00314824,  0.85304242, -0.101729  ,  0.23863578]),
 array([ 1.02429926,  1.1715461 , -1.31900537,  0.10110939]),
 array([

In [95]:
pose = np.array(pose)

In [96]:
pose.shape

(33, 4)

In [97]:
pose.flatten().shape

(132,)

In [98]:
pose = pose.flatten()

In [99]:
pose.shape

(132,)

In [100]:
pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(132)
face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(1404)
lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)

In [101]:
face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(1404)

In [102]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

In [103]:
extract_keypoints(results).shape

(1662,)

In [104]:
# Path for exported data, numpy arrays
DATA_PATH = os.path.join('MP_Data') 

# Actions that we try to detect
actions = np.array(['hello', 'thanks', 'iloveyou'])

# Thirty videos worth of data
no_sequences = 30

# Videos are going to be 30 frames in length
sequence_length = 30



In [105]:
DATA_PATH = 'MP_Data'

# Automatically gather class labels from folder names
actions = np.array(sorted([folder for folder in os.listdir(DATA_PATH) if os.path.isdir(os.path.join(DATA_PATH, folder))]))
print("Detected actions:", actions)

Detected actions: ['bye' 'copy' 'disgust' 'eternity' 'hello' 'human' 'love' 'thanks']


In [106]:
# with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
#     for action in actions:
#         action_path = os.path.join(DATA_PATH, action)
#         video_files = [f for f in os.listdir(action_path) if f.endswith('.mp4')]

#         for sequence, video_file in enumerate(video_files):
#             cap = cv.VideoCapture(os.path.join(action_path, video_file))
#             frame_num = 0

#             while cap.isOpened() and frame_num < sequence_length:
#                 ret, frame = cap.read()
#                 if not ret:
#                     break

#                 _, results = mediapipe_detection(frame, holistic)
#                 keypoints = extract_keypoints(results)

#                 # Save keypoints as .npy
#                 sequence_dir = os.path.join(DATA_PATH, action, str(sequence))
#                 os.makedirs(sequence_dir, exist_ok=True)
#                 npy_path = os.path.join(sequence_dir, f'{frame_num}.npy')
#                 np.save(npy_path, keypoints)

#                 frame_num += 1
#             cap.release()

In [107]:
label_map = {label: num for num, label in enumerate(actions)}

In [108]:
label_map

{'bye': 0,
 'copy': 1,
 'disgust': 2,
 'eternity': 3,
 'hello': 4,
 'human': 5,
 'love': 6,
 'thanks': 7}

In [109]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

In [110]:
sequences, labels = [], []
for action in actions:
    for sequence in range(4):
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])


In [111]:
np.array(sequences).shape

(32, 30, 1662)

In [112]:
X = np.array(sequences)

In [113]:
y = to_categorical(labels).astype(int)

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [115]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, RNN, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import TensorBoard

In [116]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [None]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30,1662)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))


  super().__init__(**kwargs)


In [125]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [126]:
model.summary()

In [127]:
model.fit(X_train, y_train, epochs=500, callbacks=[tb_callback])

Epoch 1/500


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - categorical_accuracy: 0.1071 - loss: 2.0849
Epoch 2/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - categorical_accuracy: 0.1786 - loss: 2.1684
Epoch 3/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - categorical_accuracy: 0.1071 - loss: 8.2103
Epoch 4/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - categorical_accuracy: 0.1071 - loss: 5.8980
Epoch 5/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - categorical_accuracy: 0.1071 - loss: 25.6444
Epoch 6/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - categorical_accuracy: 0.1429 - loss: 5.8439
Epoch 7/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - categorical_accuracy: 0.1071 - loss: 19.8496
Epoch 8/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - categorical_accu

<keras.src.callbacks.history.History at 0x76586c108350>

In [128]:
model.summary()

In [129]:
model.save('action2.h5')



In [130]:
model.save('action2.keras')