In [2]:
import os
import time
import numpy as np

import scipy.stats
import random

import matplotlib.pyplot as plt

from IPython.display import display, Image

import cv2
import mediapipe as mp
from sklearn.model_selection import train_test_split

from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import load_model


In [3]:
# Root folder of the project; every other location is derived from it.
base_dir = "D:/SignBuddy"

# Folder that holds the exported keypoint data.
DATA_Path = os.path.join(base_dir, 'MP_Data')

# Folder that holds the videos used to try out the detection code.
test_videos_path = os.path.join(base_dir, 'test_videos')

In [4]:
# Short aliases for the two MediaPipe solution modules used throughout the notebook.

# Drawing helpers: used to render detected landmarks and their connections onto frames.
mp_drawing = mp.solutions.drawing_utils

# Holistic solution: provides the combined face / hands / body-pose landmark model
# as well as the POSE_CONNECTIONS and HAND_CONNECTIONS sets used when drawing.
mp_holistic = mp.solutions.holistic

# Run a MediaPipe model on one OpenCV frame and hand back the frame together with the detections.
def mediapipe_detection(image, model):
    """Run ``model`` on a BGR frame.

    Args:
        image: Frame in OpenCV's BGR channel order.
        model: Object exposing ``process(rgb_image)`` (e.g. a Holistic instance).

    Returns:
        Tuple ``(bgr_image, results)``: the frame converted back to BGR and
        whatever ``model.process`` returned.
    """
    # The model is fed RGB, while OpenCV frames are BGR.
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Mark the buffer read-only for the duration of the inference call,
    # then make it writeable again.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    # Convert back to BGR so the caller can keep using OpenCV on the result.
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

# Overlay the pose and both hands (landmarks plus connections) on an image using default styling.
def draw_landmarks(image, results):
    """Draw pose, left-hand and right-hand landmarks from ``results`` onto ``image``.

    The image is modified by ``mp_drawing.draw_landmarks``; nothing is returned.
    """
    # (attribute on `results`, connection set to draw), in drawing order.
    body_parts = (
        ("pose_landmarks", mp_holistic.POSE_CONNECTIONS),
        ("left_hand_landmarks", mp_holistic.HAND_CONNECTIONS),
        ("right_hand_landmarks", mp_holistic.HAND_CONNECTIONS),
    )
    for attr_name, connections in body_parts:
        mp_drawing.draw_landmarks(image, getattr(results, attr_name), connections)

# Overlay the pose and both hands on an image, giving each body part its own colours
# (fixed line thickness, larger circles for landmarks than for connections).
def draw_styled_landmarks(image, results):
    """Draw pose, left-hand and right-hand landmarks with custom styling.

    Args:
        image: Image to draw on; it is modified by ``mp_drawing.draw_landmarks``.
        results: Holistic results exposing ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    # (attribute on `results`, connection set, landmark colour, connection colour)
    part_styles = (
        ("pose_landmarks", mp_holistic.POSE_CONNECTIONS, (80,22,10), (80,44,121)),
        ("left_hand_landmarks", mp_holistic.HAND_CONNECTIONS, (121,22,76), (121,44,250)),
        ("right_hand_landmarks", mp_holistic.HAND_CONNECTIONS, (245,117,66), (245,66,230)),
    )

    for attr_name, connections, landmark_color, connection_color in part_styles:
        landmark_spec = mp_drawing.DrawingSpec(color=landmark_color, thickness=2, circle_radius=4)
        connection_spec = mp_drawing.DrawingSpec(color=connection_color, thickness=2, circle_radius=2)
        mp_drawing.draw_landmarks(
            image, getattr(results, attr_name), connections, landmark_spec, connection_spec
        )

In [5]:
# TODO: Change the name of the test video to ensure the code is running
video_name = "signsletters.MP4"
video_path = os.path.join(test_videos_path, video_name)

# Try to open the video
cap = cv2.VideoCapture(video_path)

# Fail fast with a clear message when the file is missing or cannot be decoded.
if not cap.isOpened():
    raise FileNotFoundError(f"Error: Cannot open video file '{video_path}'. Please check if the file exists and the path is correct.")

# Detections for the most recently processed frame. Stays None when the video
# yields no frames; the next cell reads this variable.
results = None

# Run the Holistic model (mp_holistic is defined in the setup cell) over every frame.
# try/finally guarantees the capture handle is released even if a frame fails.
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()

            # Check `ret` BEFORE touching the frame: at end of stream read()
            # returns (False, None) and cv2.flip(None, ...) would raise.
            if not ret:
                break

            # flipCode=0 flips the frame vertically (around the x-axis).
            frame = cv2.flip(frame, 0)

            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # # Draw landmarks on the frame
            # draw_styled_landmarks(image, results)

            # # Convert frame to RGB (to display in Colab)
            # image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            # # Display the frame
            # plt.figure(figsize=(10, 10))
            # plt.imshow(image_rgb)
            # plt.axis('off')
            # display(plt.gcf())
            # plt.close()

            # Pause for a short moment to simulate frame rate
            cv2.waitKey(1)
finally:
    cap.release()

In [6]:
# Flatten the landmarks detected in the last processed frame (`results`) into one
# 1-D array per body part. Each expression is guarded: when a landmark group was
# not detected, a zero vector of the matching length is used instead, so this cell
# does not fail on frames where, e.g., no pose or no hand is visible.
# (The same logic is packaged as extract_keypoints() below.)

# Pose: x, y, z and visibility per landmark; zero fallback of 33 * 4 = 132 values.
pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)

# Face: x, y, z per landmark; zero fallback of 468 * 3 = 1404 values.
face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)

# Left hand: x, y, z per landmark; zero fallback of 21 * 3 = 63 values.
lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)

# Right hand: x, y, z per landmark; zero fallback of 21 * 3 = 63 values.
rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)

# Turn one MediaPipe Holistic result into a single flat feature vector
# laid out as [pose | face | left hand | right hand].
def extract_keypoints(results):
    """Flatten pose, face and hand landmarks of ``results`` into one 1-D array.

    Pose landmarks contribute (x, y, z, visibility); face and hand landmarks
    contribute (x, y, z). A group that is missing (falsy) is replaced by zeros
    sized for 33 pose, 468 face and 21 hand landmarks respectively.

    Returns:
        1-D numpy array: the concatenation pose + face + left hand + right hand.
    """
    xyz = ("x", "y", "z")

    def _flatten(group, fields, expected_count):
        # Missing group -> zero placeholder of the expected length.
        if not group:
            return np.zeros(expected_count * len(fields))
        rows = [[getattr(point, name) for name in fields] for point in group.landmark]
        return np.array(rows).flatten()

    parts = [
        _flatten(results.pose_landmarks, xyz + ("visibility",), 33),
        _flatten(results.face_landmarks, xyz, 468),
        _flatten(results.left_hand_landmarks, xyz, 21),
        _flatten(results.right_hand_landmarks, xyz, 21),
    ]
    return np.concatenate(parts)

In [7]:
# TODO: Define the set of actions (sign language gestures) to detect
# Currently the 26 lowercase letters 'a'..'z'.
actions = np.array(list('abcdefghijklmnopqrstuvwxyz'))

# Path for exported data (numpy arrays will be saved here)
DATA_PATH = os.path.join(base_dir, 'MP_Data')
VIDEO_PATH = os.path.join(base_dir, 'videos')

# Create the data directory if it doesn't exist
os.makedirs(DATA_PATH, exist_ok=True)

# Number of sequences (videos) to record per action
no_sequences = 120

# Number of frames per video sequence
sequence_length = 30

# Starting index for naming folders that store each sequence
# NOTE(review): not used in this cell; presumably read by a later data-collection cell — confirm.
start_folder = 30

# Create folder structure for each action and each sequence.
# NOTE: this cell is intentionally additive — every run appends `no_sequences`
# fresh, empty sequence folders per action after the highest existing number.
for action in actions:
    # Paths for this action's keypoint data and its videos.
    # NOTE(review): `video_path` rebinds the name used for the test video in an earlier cell.
    action_path = os.path.join(DATA_PATH, action)
    video_path = os.path.join(VIDEO_PATH, action)

    # exist_ok=True makes creation safe to repeat and avoids a check-then-create race.
    os.makedirs(action_path, exist_ok=True)
    os.makedirs(video_path, exist_ok=True)

    # Find existing numbered folders (sequences) for this action
    existing_dirs = [d for d in os.listdir(action_path) if d.isdigit()]

    # Determine where numbering continues so existing sequences are never reused.
    # With existing folders the first new index is max + 1 (starting at max itself
    # would collide with a folder that already exists and yield one folder too few).
    if existing_dirs:
        dirmax = max(int(d) for d in existing_dirs)
        first_new_sequence = dirmax + 1
    else:
        dirmax = 0
        first_new_sequence = 0

    # Create subfolders for each new sequence
    for sequence in range(no_sequences):
        seq_path = os.path.join(action_path, str(first_new_sequence + sequence))
        os.makedirs(seq_path, exist_ok=True)