In [None]:
#Plan

# Using media pipe holistic to extract keypoints
# Using Tensorflow and Keras to build an LSTM model

# Collecting data from hand, body, and face and saving them as numpy arrays

# Train deep neural network with LSTM layers to detect sequences

# Perform real time sign language using OpenCV

In [None]:
#1 Installing Dependencies

%pip install tensorflow OpenCV-python mediapipe scikit-learn matplotlib numpy
%pip install sklearn

In [9]:
# imports

import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import mediapipe as mp

2. Detecting keypoints using mediapipe holistic

In [10]:
mp_holistic = mp.solutions.holistic # The holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [11]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False                  
    results = model.process(image)    
    image.flags.writeable = True             
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

#open cv uses BGR instead of RGB
# mediapipe uses RGB

In [12]:
def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS) # Draw face connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS) # Draw pose connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw right hand connections

In [13]:
def draw_styled_landmarks(image, results):
    # Draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS, 
                             mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
                             mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             ) 
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

In [14]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    left_hand = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    right_hand = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, left_hand, right_hand])

Set Up Folders for data collection

In [None]:
DATA_PATH = os.path.join('MP_Data')
print(DATA_PATH)

#Actions that we try to detect was supplied with numbers for our dataset, can be given names for training custom signs
actions = np.array([str(i) for i in range(1, 66)])

print(actions)


# Number of videos to collect
no_sequences = 10  # Number of videos to collect
video_num_length = 5 # Variations of the sign

# Number of frames
# sequence_length = 30 
# Folder start
start_folder = 0


In [None]:
for action in actions:
    #print(os.path.exists(DATA_PATH))
    #print(action)
    action_path = os.path.join(DATA_PATH, str(action))
    print(action_path)
    #print(action_path)

    # Check if the directory exists
    if not os.path.exists(action_path):
        # If it doesn't exist, create it
        os.makedirs(action_path)
        dirmax = 0
        
    else:
        # If it exists, find the maximum directory number
        #print(os.listdir(action_path))
        #print(np.array(os.listdir(action_path)).astype(int))
        dirmax = np.max(np.array(os.listdir(action_path)).astype(int))
    
    for sequence in range(1, no_sequences+1):
        for video_num in range(1, video_num_length+1):
            try: 
                os.makedirs(os.path.join(action_path, str(dirmax+sequence)+str(video_num)))
            except:
                pass

Collecting Keypoints for Training and Testing

In [None]:
# Method to train the data using a live feed, allows creating a custom dataset using a live stream



# cap = cv2.VideoCapture(0) # Accessing video capture device 0 (usually the webcam)
# with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    
#     for action in actions:

#         #Loop through videos
#           for sequence in range(start_folder, start_folder+no_sequences):
#             for frame_num in range(sequence_length):
#                  # Reading frames
#                 ret, frame = cap.read()

#                 # Using mediapipe to detect landmarks
#                 image, results = mediapipe_detection(frame, holistic)

#                 # Draw landmarks 
#                 draw_styled_landmarks(image, results)

#                 # Apply wait logic
#                 if frame_num == 0:
#                     cv2.putText(image, 'STARTING COLLECTION', (120,200), 
#                                 cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
#                     cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
#                                 cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
#                     cv2.imshow('OpenCV Feed', image)
#                     cv2.waitKey(2000)
#                 else:
#                     cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
#                                 cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
#                     cv2.imshow('OpenCV Feed', image)
       
#                 keypoints = extract_keypoints(results)
#                 npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
#                 np.save(npy_path, keypoints)
              
#                 if cv2.waitKey(10) & 0xFF == ord('q'):
#                     break
# cap.release()
# cv2.destroyAllWindows()

In [None]:
import cv2
import os
import numpy as np
import mediapipe as mp

# Function to process a single video
def process_video(video_path, action, sequence,video_num):
    cap = cv2.VideoCapture(video_path)

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print("TOTAL FRAMES: "+str(total_frames))
    frame_num = 0
    frame_indices = np.linspace(0, total_frames-1, num=30, dtype=int)
    print(frame_indices)

    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()

            frame = cv2.resize(frame, (640, 480))

            image, results = mediapipe_detection(frame, holistic)

            draw_styled_landmarks(image, results)

            cv2.imshow('Mediapipe Feed', image)
            cv2.waitKey(1)  # Wait for 1 millisecond

            keypoints = extract_keypoints(results)
            npy_path = os.path.join(DATA_PATH, action, str(sequence)+str(video_num),str(frame_num))
            print(npy_path,keypoints)
            np.save(npy_path, keypoints)
                
            frame_num += 1
            print(action)
            print("sequ:"+sequence)

    cap.release()


dataset_path = os.path.join(r'lsa64_raw\all_cut') 
print("dataset_path: ", dataset_path)  #all_cut
# Main loop to process all videos in the dataset

counter =0
for video_file in os.listdir(dataset_path):
    if(counter%100==0):
        print("counter: ", counter)
    if video_file.endswith(".mp4"):  # Assuming all video files have .mp4 extension
    # Extract action, sequence, and video number from the video file name
        action = int(video_file[:3])
        #sequence = (video_file[4:11])
        sequence = int(video_file[4:7])
        video_num = int(video_file[8:11])
    # if (action == 9):
    #     break
    # if(action<5):
    #     continue
    video_path = os.path.join(dataset_path, video_file)
    #process_video(video_path, action, sequence, video_num)
    process_video(video_path, str(action), str(sequence),str(video_num))

cv2.destroyAllWindows()

In [23]:
label_map = {label:num for num, label in enumerate(actions)}

In [None]:
counter = 0
max_c = 0 
sequences, labels = [], []
for action in actions:
    for sequence in range(1,10+1): #4 sequences
        for video_num in range(1,video_num_length+1): #50 per sequence
            window = []
            frame_dir = os.path.join(DATA_PATH, action, str(sequence)+str(video_num))
            frame_files = sorted(os.listdir(frame_dir), key=lambda x: int(os.path.splitext(x)[0]))  # Sort the frames
            for frame_file in frame_files:  # Process the frames in order
                print(os.path.join(frame_dir, frame_file))
                res = np.load(os.path.join(frame_dir, frame_file))
                print(res)
                window.append(res)
            
            if(len(window) == 0):
                break
            sequences.append(window)
            labels.append(label_map[str(action)])
            counter+=1

print(len(labels)) 
print(len(sequences))
print("Counter"+str(counter))
print("max counter"+str(max_c))

In [None]:
np.save('sequences.npy', sequences) # save the sequences for later use
np.save('labels.npy', labels) # save the labels for later use

In [None]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

X = np.array(sequences)
print('--------------')
X.shape
print(X.shape)

# Convert the integer labels to one-hot encoding
y = to_categorical(labels).astype(int)

print(y.shape)
print(y)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)    # Split the data into training and testing sets

Feature Extraction with Data Augmentation

The below feature extraction approach augments the data by applying a set of translation, rotation, blurring, brightness variations and scaling to prevent overfitting on the dataset.

In [22]:
import os
import numpy as np

DATA_PATH = os.path.join('augmented_Data')
print(DATA_PATH)
#Actions that we try to detect
actions = np.array([str(i) for i in range(1, 66)])


print(actions)
# Number of videos to collect
no_sequences = 10 
video_num_length = 5

# Number of frames
#sequence_length = 30 
# Folder start
start_folder = 0


augmented_Data
['1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13' '14' '15' '16'
 '17' '18' '19' '20' '21' '22' '23' '24' '25' '26' '27' '28' '29' '30'
 '31' '32' '33' '34' '35' '36' '37' '38' '39' '40' '41' '42' '43' '44'
 '45' '46' '47' '48' '49' '50' '51' '52' '53' '54' '55' '56' '57' '58'
 '59' '60' '61' '62' '63' '64' '65']


In [None]:
for action in actions:
    action_path = os.path.join(DATA_PATH, str(action))
    print(action_path)

    if not os.path.exists(action_path):
        os.makedirs(action_path)
        dirmax = 0
    else:
        dirmax = np.max(np.array(os.listdir(action_path)).astype(int))
    
    for sequence in range(1, no_sequences+1):
        for video_num in range(1, video_num_length+1):
            video_dir = os.path.join(action_path, str(dirmax+sequence)+str(video_num))
            try: 
                os.makedirs(video_dir)
                os.makedirs(os.path.join(video_dir, 'original'))
                os.makedirs(os.path.join(video_dir, 'augmented'))
            except:
                pass

In [None]:
import cv2
import os
import numpy as np
import mediapipe as mp

# Function to process a single video
from imgaug import augmenters as iaa

# Define the augmentation pipeline
seq = iaa.Sequential([
    iaa.Affine(rotate=(-11, 11)), # rotate the image
    iaa.Affine(translate_percent={"x": (-0.12, 0.12), "y": (-0.12, 0.12)}), # translate the image
    iaa.Affine(scale=(1, 1.22)), # zoom in or out on the image
    iaa.Multiply((0.8, 1.2)), # vary brightness
    iaa.GaussianBlur(sigma=(0, 0.5)), # apply gaussian blur
    # add more augmenters as needed
])

# Function to process a single video
import shutil

def process_video(video_path, action, sequence, video_num):
    cap = cv2.VideoCapture(video_path)

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print("TOTAL FRAMES: "+str(total_frames))
    frame_num = 0
    frame_indices = np.linspace(0, total_frames-1, num=30, dtype=int)
    print(frame_indices)

    # Get a deterministic augmenter
    seq_det = seq.to_deterministic()

    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()

            frame = cv2.resize(frame, (640, 480))

            # Apply the deterministic augmentation
            frame = seq_det.augment_image(frame)

            # Process the augmented frame
            image, results = mediapipe_detection(frame, holistic)

            draw_styled_landmarks(image, results)

            cv2.imshow('Mediapipe Feed', image)
            cv2.waitKey(1)  # Wait for 1 millisecond

            # Copy the keypoints from the original frames
            original_npy_path = os.path.join('MP_Data', action, str(sequence)+str(video_num), str(frame_num)+'.npy')
            new_npy_path = os.path.join(DATA_PATH, action, str(sequence)+str(video_num), 'original', str(frame_num)+'.npy')
            shutil.copyfile(original_npy_path, new_npy_path)

            keypoints = extract_keypoints(results)
            npy_path = os.path.join(DATA_PATH, action, str(sequence)+str(video_num), 'augmented', str(frame_num))
            print(npy_path,keypoints)
            np.save(npy_path, keypoints)
                
            frame_num += 1
            print(action)
            print("sequ:"+sequence)

    cap.release()


dataset_path = os.path.join(r'lsa64_raw\all_cut') 
print("dataset_path: ", dataset_path)  #all_cut
# Main loop to process all videos in the dataset


counter =0
for video_file in os.listdir(dataset_path):
    if(counter%100==0):
        print("counter: ", counter)
    if video_file.endswith(".mp4"):  # Assuming all video files have .mp4 extension
    # Extract action, sequence, and video number from the video file name
        action = int(video_file[:3])
        #sequence = (video_file[4:11])
        sequence = int(video_file[4:7])
        video_num = int(video_file[8:11])
    # if(action<5):
    #     continue
    video_path = os.path.join(dataset_path, video_file)
    #process_video(video_path, action, sequence, video_num)
    process_video(video_path, str(action), str(sequence),str(video_num))

cv2.destroyAllWindows()

In [20]:
label_map = {label:num for num, label in enumerate(actions)}

In [None]:
#Extracting X, y from the augmented data and original data


counter = 0
max_c = 0 

sequences_augmented, labels_augmented = [], []
for action in actions:
    for sequence in range(1,10+1): #4 sequences
        for video_num in range(1,video_num_length+1): #50 per sequence
            # Load original frames
            window = []
            frame_dir = os.path.join(DATA_PATH, action, str(sequence)+str(video_num), 'original')
            frame_files = sorted(os.listdir(frame_dir), key=lambda x: int(os.path.splitext(x)[0]))  # Sort the frames
            for frame_file in frame_files:  # Process the frames in order
                print(os.path.join(frame_dir, frame_file))
                res = np.load(os.path.join(frame_dir, frame_file))
                print(res)
                window.append(res)
            if(len(window) > 0):
                sequences_augmented.append(window)
                labels_augmented.append(label_map[str(action)])
                counter+=1

            # Load augmented frames
            window = []
            frame_dir = os.path.join(DATA_PATH, action, str(sequence)+str(video_num), 'augmented')
            frame_files = sorted(os.listdir(frame_dir), key=lambda x: int(os.path.splitext(x)[0]))  # Sort the frames
            for frame_file in frame_files:  # Process the frames in order
                print(os.path.join(frame_dir, frame_file))
                res = np.load(os.path.join(frame_dir, frame_file))
                print(res)
                window.append(res)
            if(len(window) > 0):
                sequences_augmented.append(window)
                labels_augmented.append(label_map[str(action)])
                counter+=1


print(len(labels_augmented)) 
print(len(sequences_augmented))
print("Counter"+str(counter))
print("max counter"+str(max_c))

In [None]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

X = np.array(sequences_augmented)
print('--------------')
X.shape
print(X.shape)

# Convert the integer labels to one-hot encoding
y = to_categorical(labels_augmented).astype(int)

print(y.shape)
print(y)

In [None]:
np.save('sequences_augmented.npy', sequences_augmented) # save the sequences for later use
np.save('labels_augmented.npy', labels_augmented) # save the labels for later use

Testing 

In [32]:
from keras.models import load_model

# Load the model from a .h5 file
model = load_model('actions/actions_cnn.h5')



Testing with continuous stream 

In [33]:
colors = [(245,117,16), (117,245,16), (16,117,245)]
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
        
    return output_frame

In [25]:
sequence = [] # collect frames to make our predictions
sentence = [] # store the sentence that we are going to predict
threshold = 0.5 # threshold for prediction (only render results above this threshold)
predictions = [] # This will prevent detecting signs by mistake as we transition between signs

cap = cv2.VideoCapture(0) # Accessing video capture device 0 (usually the webcam)
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        #Loop through videos
            ret, frame = cap.read()

            frame = cv2.resize(frame, (640, 480))

            # Using mediapipe to detect landmarks
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks 
            draw_styled_landmarks(image, results)

            # 2. Prediction logic
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-30:] # store the last 30 frames
            # sequence_2d = np.reshape(sequence, (1, seq_length * num_features))

            if len(sequence) == 30:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                # res = grid.predict(np.expand_dims(sequence, axis=0))[0]
                predictions.append(np.argmax(res))
                print(actions[np.argmax(res)])

            # 3. Vizualization logic

            # Render the predictions
                if np.unique(predictions[-10:])[0]==np.argmax(res):
                    if res[np.argmax(res)] > threshold:
                        if len(sentence) > 0: 
                            if actions[np.argmax(res)] != sentence[-1]:
                                sentence.append(actions[np.argmax(res)])
                        else:
                            sentence.append(actions[np.argmax(res)])
                if len(sentence) > 5:
                    sentence = sentence[-5:]

            cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3,30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            
            cv2.imshow('OpenCV Feed', image) 
              
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
cap.release()
cv2.destroyAllWindows()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
15
[1m1/1[0m [32m━━━━━━━━

Testing with video Input

In [None]:
import cv2
import numpy as np

# Create a VideoCapture object and specify video file to read
cap = cv2.VideoCapture(r'./all_cut/001_001_001.mp4')

# Check if video opened successfully
if not cap.isOpened():
    print("Error opening video file")

total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
frame_indices = np.linspace(0, total_frames-1, num=30, dtype=int)

sequence = [] # Initialize sequence list to store frames

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()

        # Check if frame was successfully read
        if not ret:
            print(f"Failed to read frame at index {idx}")
            continue

        frame = cv2.resize(frame, (640, 480))

        # Using mediapipe to detect landmarks
        image, results = mediapipe_detection(frame, holistic)

        # Draw landmarks 
        draw_styled_landmarks(image, results)

        # Extract keypoints
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)

        # Display the resulting frame
        cv2.imshow('Frame', image)

        # Press Q on keyboard to exit
        if cv2.waitKey(25) & 0xFF == ord('q'):
            break

    # Make prediction on the captured frames
    res = model.predict(np.expand_dims(sequence, axis=0))[0]
    print(f'Predicted action: {actions[np.argmax(res)]}')

# Release the video capture object
cap.release()

# Close all OpenCV windows
cv2.destroyAllWindows()

Testing with 2s livefeed stream to accurately retrieve 30 frames

In [34]:
import cv2
import numpy as np
import time

stop = False
cap = cv2.VideoCapture(0) # Accessing video capture device 0 (usually the webcam)

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while not stop and cap.isOpened():
        time.sleep(3) # Wait for 3 seconds before starting to capture frames

        total_frames = int(2 * cap.get(cv2.CAP_PROP_FPS)) # Total frames to capture in 2 seconds
        frame_indices = np.linspace(0, total_frames - 1, num=30, dtype=int) # Get 30 evenly spaced frame indices

        sequence = [] # Initialize sequence list to store frames

        for i in range(total_frames): # Capture frames for 2 seconds
            # Reading frames
            ret, frame = cap.read()

            if i in frame_indices: # Only process the frame if it's one of the 30 frames we want
                # Resize the frame
                frame = cv2.resize(frame, (640, 480))

                # Using mediapipe to detect landmarks
                image, results = mediapipe_detection(frame, holistic)

                # Draw landmarks 
                draw_styled_landmarks(image, results)

                # Extract keypoints
                keypoints = extract_keypoints(results)
                sequence.append(keypoints)

                # Show image
                cv2.imshow('OpenCV Feed', image)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break

        # Make prediction on the captured frames
        res = model.predict(np.expand_dims(sequence, axis=0))[0]
        print(f'Predicted action: {actions[np.argmax(res)]}')

        while True:
            if cv2.waitKey(10) & 0xFF == ord('q'):
                stop = True
                break
                print('q pressed')
            if cv2.waitKey(10) & 0xFF == ord('a'):
                break

cap.release()
cv2.destroyAllWindows()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 530ms/step
Predicted action: 23
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Predicted action: 14
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Predicted action: 52
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Predicted action: 21
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Predicted action: 26
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Predicted action: 3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Predicted action: 2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Predicted action: 27
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Predicted action: 52
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Predicted action: 33
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step

In [36]:
cap.release()
cv2.destroyAllWindows()