**Importing Dependencies**

In [1]:
import cv2
import mediapipe as mp
import numpy as np
import os 
import tensorflow as tf

2024-07-17 17:12:37.650091: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-17 17:12:37.666339: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-17 17:12:37.670899: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-17 17:12:37.681617: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


**Keypoints using MP Holistic**

In [2]:
# Holistic and drawing class
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [3]:
# Hand detection
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

    return image, results

In [4]:
# Drawing on frame
def draw_landmarks(image, results):
    # Define custom styles
    dot_color = (0, 255, 0)  # Green for landmarks
    connection_color = (255, 0, 0)  # Red for connections
    dot_radius = 5  # Size of the landmarks
    connection_thickness = 2  # Thickness of the connections

    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)

In [9]:
cap = cv2.VideoCapture(0)

# Instantiating object
holistic_model = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)

while cap.isOpened():
    # Read Frame
    ret, frame = cap.read()

    # Make detections
    image, results = mediapipe_detection(frame, holistic_model)

    # Draw landmarks
    draw_landmarks(image, results)

    # Show to screen
    cv2.imshow('Video Capture', image)

    # Break gracefully
    if cv2.waitKey(10) & 0XFF == ord('q'):
        break
cap.release()
cv2.destroyAllWindows()

I0000 00:00:1721215264.210285   66002 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1721215264.213228   66274 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.1.3-manjaro1.1), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1721215264.308486   66259 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1721215264.332736   66263 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1721215264.334617   66262 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1721215264.334617   66268 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signatur

**Extracting Keypoints Values**

In [5]:
def extract_keypoints(results):
    # Pose has 33 points, left and right hand have 21 points

    pose = np.array([[res.x, res.y, res.z, res.visibility]
                     for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    left_hand = np.array([[res.x, res.y, res.z] 
                          for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    right_hand = np.array([[res.x, res.y, res.z] 
                           for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)

    return np.concatenate([pose, left_hand, right_hand])

**Setup Folders for Collection**

In [7]:
DATA_PATH = os.path.join('MP_Data')
actions = np.array(['hello', 'thumbs_up', 'thumbs_down', 'peace'])

# Thirty vidoes worth of data
no_sequence = 30
# Vidoes will be 30 frames in length
sequence_length = 30

In [28]:
for action in actions:
    for sequence in range(no_sequence):
        try:
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

**Collect Keypoints Value for Training and Testing**

In [29]:
cap = cv2.VideoCapture(0)

for action in actions:
    for sequence in range(no_sequence):
        for frame_num in range(sequence_length):

            ret, frame = cap.read()

            image, results = mediapipe_detection(frame, holistic_model)
            draw_landmarks(image, results)

            # Applying collection logic
            if frame_num == 0:
                cv2.putText(image, 'Starting collection', (120,200),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0),4, cv2.LINE_AA)
                cv2.putText(image, 'Collecting image for {} Video number {}'.format(action, sequence+1), (15,12),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1, cv2.LINE_AA)
                cv2.waitKey(2000)
            else:
                cv2.putText(image, 'Collecting image for {} Video number {}'.format(action, sequence+1), (15,12),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1, cv2.LINE_AA)
                
            # Exporting keypoints
            keypoints = extract_keypoints(results)
            npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
            np.save(npy_path, keypoints)
            
            cv2.imshow('Video Capture', image)

            # break
            if cv2.waitKey(10) & 0XFF == ord('q'):
                break

cap.release()
cv2.destroyAllWindows()



**Preprocess Data and create Label Features**

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
label_map = {label:num for num, label in enumerate(actions)}
label_map

{'hello': 0, 'thumbs_up': 1, 'thumbs_down': 2, 'peace': 3}

In [10]:
sequences, labels = [], []

for action in actions:
    for sequence in range(no_sequence):
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [11]:
np.array(sequences).shape

(120, 30, 258)

In [12]:
np.array(labels).shape

(120,)

In [13]:
X = np.array(sequences)
X

array([[[ 0.6198765 ,  0.65162748, -0.58890265, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.59742361,  0.64712679, -0.72822201, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.59767437,  0.64659399, -0.84026116, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.59111786,  0.63951457, -0.84893578, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.58782494,  0.6405884 , -0.84724158, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.58478832,  0.64103496, -0.66632855, ...,  0.08518297,
          0.66604555, -0.03015602]],

       [[ 0.58198464,  0.64252603, -0.68322998, ...,  0.1578989 ,
          0.57778209, -0.02647312],
        [ 0.57817966,  0.6461066 , -0.68392503, ...,  0.19311811,
          0.53886515, -0.00820633],
        [ 0.57640952,  0.64842302, -0.68213433, ...,  0.23435362,
          0.51180464, -0.02328515],
        ...,
        [ 0.59573913,  0.64977443, -0.63857746, ...,  

In [14]:
y = tf.keras.utils.to_categorical(labels).astype(int)
y

array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1,

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [16]:
X_train.shape

(108, 30, 258)

In [17]:
X_test.shape

(12, 30, 258)

**Training The Neural Network**

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
from tensorflow.keras.callbacks import TensorBoard

In [19]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Go to the directory and in terminal type :
# tensorboard --logdir=.

In [20]:
model = Sequential()

model.add(Input(shape=(830, 25)))
model.add(LSTM(64, return_sequences=True, activation='relu'))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

I0000 00:00:1721216660.234722   80236 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1721216660.259608   80236 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1721216660.260030   80236 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1721216660.261674   80236 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

In [21]:
model.summary()

In [22]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [40]:
model.fit(tf.expand_dims(X_train, axis=-1), y_train, epochs=100, callbacks=[tb_callback])

Epoch 1/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - categorical_accuracy: 0.6631 - loss: 0.7892
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - categorical_accuracy: 0.6735 - loss: 0.7404
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - categorical_accuracy: 0.7242 - loss: 0.6991
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - categorical_accuracy: 0.7465 - loss: 0.6369
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - categorical_accuracy: 0.7200 - loss: 0.5936
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - categorical_accuracy: 0.7737 - loss: 0.6189
Epoch 7/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - categorical_accuracy: 0.7432 - loss: 0.6639
Epoch 8/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - cate

<keras.src.callbacks.history.History at 0x7e703c26a380>

**Making Prediction**

In [41]:
y_pred = model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step


In [42]:
actions[np.argmax(y_pred[0])]

'thumbs_up'

In [43]:
actions[np.argmax(y_test[0])]

'thumbs_up'

**Saving Model**

In [44]:
model.save('model0.h5')



**Evaluation Model**

In [45]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [46]:
yhat = model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step


In [47]:
y_true = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

In [48]:
multilabel_confusion_matrix(y_true, yhat)

array([[[ 8,  0],
        [ 0,  4]],

       [[ 8,  0],
        [ 0,  4]],

       [[ 9,  0],
        [ 0,  3]],

       [[11,  0],
        [ 0,  1]]])

In [49]:
accuracy_score(y_true, yhat)

1.0

**Test in real time**

In [52]:
model = tf.keras.models.load_model('model0.h5')



In [53]:
sequence = []
sentence = []
threshold = 0.4


cap = cv2.VideoCapture(0)

# Instantiating object
holistic_model = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)

while cap.isOpened():
    # Read Frame
    ret, frame = cap.read()

    # Make detections
    image, results = mediapipe_detection(frame, holistic_model)

    # Draw landmarks
    draw_landmarks(image, results)

    # Prediction logic
    keypoints = extract_keypoints(results)
    sequence.append(keypoints)
    sequence = sequence[-30:]

    if len(sequence) == 30:
        res = model.predict(np.expand_dims(sequence, axis=0))[0]
        print(actions[np.argmax(res)])

    # Visualize logic 
    if np.argmax(res) < len(actions):
        if res[np.argmax(res)] > threshold:
            if len(sentence) > 0:
                if actions[np.argmax(res)] != sentence[-1]:
                    sentence.append(actions[np.argmax(res)])
            else:
                sentence.append(actions[np.argmax(res)])
        
    if len(sentence) > 5:
        sentence = sentence[-5:]
    
    if sentence:
        cv2.rectangle(image, (0,0), (640,40), (245,117,16), -1)
        cv2.putText(image, sentence[-1], (3,30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2,cv2.LINE_AA)


    # Show to screen
    cv2.imshow('Video Capture', image)

    # Break gracefully
    if cv2.waitKey(10) & 0XFF == ord('q'):
        break
cap.release()
cv2.destroyAllWindows()

I0000 00:00:1721218084.608626   80236 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1721218084.609260  119755 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.1.3-manjaro1.1), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
W0000 00:00:1721218084.660836  119741 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1721218084.677580  119749 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1721218084.679522  119741 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1721218084.679522  119744 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W00

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 669ms/step
thumbs_up
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
thumbs_up
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
thumbs_up
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
thumbs_up
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
thumbs_up
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
thumbs_up
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
thumbs_up
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
thumbs_up
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
thumbs_up
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
thumbs_up
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
thumbs_up
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
thumbs_up
[1m1/1[0m [3

In [68]:
cap.release()
cv2.destroyAllWindows()