In [1]:
import numpy as np
import cv2
import time
import os
from matplotlib import pyplot as plt
import mediapipe as mp

In [2]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [3]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable= False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

In [4]:
def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION, 
                              mp_drawing.DrawingSpec(color=(128, 0, 0), thickness=1, circle_radius=1), 
                              mp_drawing.DrawingSpec(color=(192, 192, 192), thickness=1, circle_radius=1))
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

In [209]:
# Open the webcam
cap = cv2.VideoCapture(0)

# Set mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
    
        # Read the feed
        ret, frame = cap.read()
        if not ret:
            print("Failed to grab frame")
            break
    
        # Make detections 
        image, results = mediapipe_detection(frame, holistic)

        # Draw landmarks
        draw_landmarks(image, results)
        
    
        # Show the frame on the screen
        cv2.imshow("OpenCV Feed", image)
    
        # Check if 'q' key is pressed to exit
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    
    # Release the webcam and close windows
    cap.release()
    cv2.destroyAllWindows()

<font size="5.5">Extract Keypoint Values</font>

In [10]:
print(len(results.face_landmarks.landmark))
print(len(results.pose_landmarks.landmark))
print(len(results.left_hand_landmarks.landmark))

468
33
21


In [5]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*3)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    left_hand = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    right_hand = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, left_hand, right_hand])

In [12]:
print(extract_keypoints(results).shape)
print(33*3 + 468*3 + 21*3 + 21*3)

(1629,)
1629


<font size="5">Setup Folders for Collection</font>



In [6]:
# Path for extracted data, numpy array
DATA_PATH = os.path.join("MP_data")

# Actions that we try to detect
actions = np.array(["hello", "thanks", "iloveyou"])

# 30 videos worth of data 
num_sequences = 50

# Videos are going to be 30 frames in length
sequence_length = 30

In [7]:
# Create directories

for action in actions:
    for sequence in range(num_sequences):
        try:
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

<font size="5.5">Collecting Dataset</font>


In [8]:
# Open the webcam
cap = cv2.VideoCapture(0)

# Set mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:

    for action in actions:
        for sequence in range(num_sequences):
            for frame_num in range(sequence_length):

                # Read Feed
                ret, frame = cap.read()

                # Made Detections
                image, results = mediapipe_detection(frame, holistic)

                # Draw Landmarks
                draw_landmarks(image, results)

                # Wait Logic
                if frame_num == 0: 
                    cv2.putText(image, 'STARTING COLLECTION', (120,200), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen
                    cv2.imshow('OpenCV Feed', image)
                    cv2.waitKey(1000)
                else: 
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen
                    cv2.imshow('OpenCV Feed', image)
                    
                # Export Keypoints
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)

                # Show the frame on the screen
                cv2.imshow("OpenCV Feed", image)
            
                # Check if 'q' key is pressed to exit
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break

    # Release the webcam and close windows
    cap.release()
    cv2.destroyAllWindows()



<font size="5.5">Preprocess Data and Create Labels</font>

In [7]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [8]:
label_map = {label:num for num, label in enumerate(actions)}
sequences, labels = [], []

for action in actions:
    for sequence in range(num_sequences):
        window = []
        for frame_num in range(sequence_length):
            window.append(np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num))))
        sequences.append(window)
        labels.append(label_map[action])

In [9]:
X = np.array(sequences)
y = to_categorical(labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(127, 30, 1629) (23, 30, 1629) (127, 3) (23, 3)


<font size="5.5">Build and Train LSTM Neural Network</font>


In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [14]:
log_dir = os.path.join("Logs")
tb_callback = TensorBoard(log_dir=log_dir)

In [30]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation="relu", input_shape=(30,1629)))
model.add(LSTM(128, return_sequences=True, activation="relu"))
model.add(LSTM(64, return_sequences=False, activation="relu"))
model.add(Dense(64, activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(actions.shape[0], activation="softmax"))

model.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["categorical_accuracy"])
model.summary()

In [31]:
model.fit(X_train, y_train, epochs=300, callbacks=[tb_callback])

Epoch 1/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 54ms/step - categorical_accuracy: 0.3083 - loss: 1.3729
Epoch 2/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - categorical_accuracy: 0.3481 - loss: 1.2681
Epoch 3/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - categorical_accuracy: 0.3824 - loss: 1.3576
Epoch 4/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - categorical_accuracy: 0.4428 - loss: 1.0420
Epoch 5/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - categorical_accuracy: 0.6624 - loss: 0.8979
Epoch 6/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - categorical_accuracy: 0.3700 - loss: 1.3667
Epoch 7/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - categorical_accuracy: 0.5514 - loss: 0.8834
Epoch 8/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - cate

<keras.src.callbacks.history.History at 0x275c1a436a0>

In [42]:
model.save('model_v1_ep300.keras')

In [43]:
model.load_weights('model_v1_ep300.keras')

<font size="5.5">Evaluation using Confusion Matrix and Accuracy</font>

In [44]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [45]:
yhat = model.predict(X_test)

yhat = np.argmax(yhat, axis=1).tolist()
ytrue = np.argmax(y_test, axis=1).tolist()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step


In [46]:
multilabel_confusion_matrix(ytrue, yhat)

array([[[19,  0],
        [ 0,  4]],

       [[13,  0],
        [ 0, 10]],

       [[14,  0],
        [ 0,  9]]], dtype=int64)

In [47]:
accuracy_score(ytrue, yhat)

1.0

<font size="5.5">Test in Real Time</font>

In [48]:
colors = [(245,117,16), (117,245,16), (16,117,245)]

def prob_visualization(res, actions, frame, colors):
    for num, prob in enumerate(res):
        cv2.rectangle(frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
    
    return frame

In [53]:
sequence = []
sentence = []
predictions = []
threshold = 0.7
cap = cv2.VideoCapture(0)

# Set Mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Reed feed
        ret, frame = cap.read()

        # Make datections
        image, results = mediapipe_detection(frame, holistic)

        # Draw landmarks
        #draw_landmarks(image, results)

        # Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]

        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            predictions.append(np.argmax(res))

            # Visualize logic
            if np.unique(predictions[-10:])[0]==np.argmax(res):
                if res[np.argmax(res)] > threshold:
                    if len(sentence) > 0:
                        current_action = actions[np.argmax(res)]
                        if current_action != sentence[-1]:
                            sentence.append(current_action)
                    else:
                        sentence.append(actions[np.argmax(res)])
    
            if len(sentence) > 5:
                sentence = sentence[-5:]
    
            # Visualize probabilities
            image = prob_visualization(res, actions, frame, colors)
    
        cv2.rectangle(image, (0,0), (640,40), (245, 177, 16), -1)
        cv2.putText(image, " ".join(sentence), (3,30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
            
        # Show to the screen 
        cv2.imshow("OpenCV Feed", image)

        # Check if 'q' key is pressed to exit
        if cv2.waitKey(10) & 0xFF == ord("q"):
            break

    # Release the webcam and close windows
    cap.release()
    cv2.destroyAllWindows()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16m

In [52]:
!pip3 freeze > requirements.txt