In [None]:
'''
Originally based off of the 'Sign Language Detection Using Action Recognition' Python tutorial by Nicholas Renotte
Tutorial Video Link: 'https://www.youtube.com/watch?v=doDUihpj6ro'
'''

In [None]:
# Installs Dependencies

# Allows for the easier visualization of images
#!pip3 install matplotlib

# Enables the extraction of keypoints from images
#!pip3 install mediapipe

# Computer vision library which can work with computer webcams
#!pip3 install opencv-python

# Primarily for dataset splitting in this scenario
#!pip3 install scikit-learn

#!pip3 install tensorflow
#!pip3 install tensorflow-gpu  # NOTE: deprecated on PyPI; the regular 'tensorflow' package above is the supported install

In [None]:
# Imports Dependencies
import cv2
import mediapipe as mp
import numpy as np
import os
import time

from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix

from sklearn.model_selection import train_test_split

from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [None]:
# Keypoints Using MediaPipe Holistic

# Handle to MediaPipe's holistic solution module, which provides the Holistic model that carries out detection
mp_holistic = mp.solutions.holistic

# Drawing utilities used to draw the holistic model's detections from an image back onto it
mp_drawing = mp.solutions.drawing_utils

In [None]:
# Takes in modelized landmark data and draws it back onto an image
def draw_landmarks(image, results):
    """Draw the face, pose and hand landmarks held in ``results`` onto ``image``.

    Nothing is returned; callers keep using the same ``image`` object after the call.

    Args:
        image: Image array to draw on (as returned by ``mediapipe_detection``).
        results: Detection results exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """

    # Older MediaPipe releases expose FACE_CONNECTIONS on the holistic module, while newer
    #     ones only provide the FACEMESH_* sets, so fall back when the old name is missing
    face_connections = getattr(mp_holistic, 'FACE_CONNECTIONS', None)
    if face_connections is None:
        face_connections = mp_holistic.FACEMESH_CONTOURS

    # Draws facial connections
    mp_drawing.draw_landmarks(image,
                              results.face_landmarks,
                              face_connections,

                              # Joints
                              mp_drawing.DrawingSpec(color = (80, 110, 10),
                                                     thickness = 1,
                                                     circle_radius = 1),

                              # Lines (each channel must lie within 0-255, so 255 replaces the former 256)
                              mp_drawing.DrawingSpec(color = (80, 255, 121),
                                                     thickness = 1,
                                                     circle_radius = 1))

    # Draws overall body pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)

    # Draws left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

    # Draws right hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

In [None]:
# Analyzes an image and returns modelized landmarks of said image
def mediapipe_detection(image, model):
    """Run ``model`` on a BGR frame and return the frame together with the model's results.

    Args:
        image: Frame in OpenCV's BGR channel order (e.g. as read from ``cv2.VideoCapture``).
        model: Object exposing ``process(rgb_image)``; this notebook passes a MediaPipe
            ``Holistic`` instance.

    Returns:
        Tuple ``(image, results)``: the frame converted back to BGR, and whatever
        ``model.process`` returned for it.
    """

    # Color conversion from Blue, Green, Red to Red, Green, Blue, as the model is fed RGB input
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Marks the image read-only while the model processes it (a performance hint, it is restored below)
    image.flags.writeable = False

    # Carries out a prediction upon the image with the given pre-existing model
    results = model.process(image)

    # Renders the image writeable once more so landmarks can be drawn onto it
    image.flags.writeable = True

    # Reconverts the image back to its original color scheme; cvtColor returns a new array,
    #     so the result has to be assigned back (formerly written as 'image.cv2.cvtColor',
    #     which raised an AttributeError)
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

    # Returns the image along with the given model predictions
    return image, results

In [None]:
'''
Has CV2 continuously read frames from the given camera in a loop, therefore appearing
    as if presenting live video, before halting the process upon hitting the 'q' key on the keyboard
'''
image_capture = cv2.VideoCapture(0)

try:

    # Sets the mediapipe model
    with mp_holistic.Holistic(min_detection_confidence = 0.5,
                              min_tracking_confidence = 0.5) as holistic:

        while image_capture.isOpened():

            # Reads in the frame from the given camera
            result, frame = image_capture.read()

            # Stops when the camera fails to deliver a frame, as there is then nothing to process
            if not result:
                break

            # Carries out keypoint detection upon the current frame
            image, results = mediapipe_detection(frame, holistic)

            # Draws keypoint landmarks
            draw_landmarks(image, results)

            # Displays the current frame to the user via a GUI window
            cv2.imshow('OpenCV Video Camera Feed', image)

            # Breaks out of the loop gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break

finally:

    # Always frees the camera and closes the GUI window, even when an error occurred above
    image_capture.release()
    cv2.destroyAllWindows()

In [None]:
# Extracts Keypoint Values

'''
Each landmark is an array of coordinates, varying in terms of how many values it holds depending upon the sort
    of landmark, so this and the fact that the model may fail to detect them for whatever reason, such
    as a hand out of frame, must be taken into consideration when recording these values, along with
    the fact that they must be flattened into a single 1 dimensional array for their purpose as input
    into a neural network later on
Essentially concatenates all keypoints into a single, flat numpy array for sign language detection learning, 
    the neural network decoding these values to learn what is what
'''
def extract_keypoints(results):
    """Flatten the pose, face and hand landmarks in ``results`` into one 1-D numpy array.

    The parts are concatenated in the order pose, face, left hand, right hand. A part that
    is missing (falsy) in ``results`` is replaced by zeros of the length that part would
    normally occupy: 33 * 4 for the pose, 468 * 3 for the face and 21 * 3 for each hand.

    Args:
        results: Object exposing ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``; each is either falsy or
            has a ``landmark`` iterable of points.

    Returns:
        1-D numpy array holding all keypoint values.
    """

    def flatten_part(landmark_list, fields, landmark_count):
        # Nothing detected for this part: substitute a zero vector of the expected length
        if not landmark_list:
            return np.zeros(landmark_count * len(fields))

        rows = [[getattr(point, field) for field in fields] for point in landmark_list.landmark]
        return np.array(rows).flatten()

    xyz = ('x', 'y', 'z')

    parts = [
        # Pose points additionally carry a visibility score
        flatten_part(results.pose_landmarks, xyz + ('visibility',), 33),
        flatten_part(results.face_landmarks, xyz, 468),
        flatten_part(results.left_hand_landmarks, xyz, 21),
        flatten_part(results.right_hand_landmarks, xyz, 21),
    ]

    return np.concatenate(parts)

In [None]:
# Sanity check: length of one flattened keypoint vector, i.e. face (468 * 3) + pose (33 * 4) + each hand (21 * 3) = 1662
(468 * 3) + (33 * 4) + (21 * 3) + (21 * 3)

In [None]:
# Sets Up Folders For Collection

# Root directory for the exported numpy array data (relative to the notebook's working directory)
DATA_PATH = os.path.join('../../Data/SignLanguageRecognition')

# Actions to be detected; the position of each action in this array is later used as its numeric label
actions = np.array(['hello', 'thanks', 'i_love_you'])

'''
30 frames of sequential keypoints used in order to attempt to classify an action 
    via genuine action detection rather than through the use of a single frame
    in terms of computer vision
'''

# 30 videos worth of data are recorded for every action
number_of_sequences = 30

# Each video is 30 frames in length
sequence_length = 30

In [None]:
# Creates one folder per action and video, into which each frame's extracted keypoint data is later saved
for action in actions:
    for sequence in range(number_of_sequences):

        # os.path.join builds the folder path (os.path itself is a module and cannot be called);
        #     exist_ok lets the cell be re-run without failing on folders that already exist,
        #     while any genuine error (e.g. missing permissions) is no longer silently swallowed
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok = True)

In [None]:
# Collects Keypoint Values For Training & Testing

image_capture = cv2.VideoCapture(0)

# Becomes True once the user presses 'q' or the camera stops delivering frames, so that
#     every nested loop below unwinds instead of only the innermost one
stop_collection = False

try:

    # Sets the mediapipe model
    with mp_holistic.Holistic(min_detection_confidence = 0.5,
                              min_tracking_confidence = 0.5) as holistic:

        # Loops through the available actions
        for action in actions:

            # Loops through the available videos or sequences
            for sequence in range(number_of_sequences):

                # Loops through the video's entire length, set as the sequence length or number of frames
                for frame_number in range(sequence_length):

                    result, frame = image_capture.read()

                    # A failed read yields no frame to process, so collection is aborted
                    if not result:
                        stop_collection = True
                        break

                    image, results = mediapipe_detection(frame, holistic)

                    draw_landmarks(image, results)

                    '''
                    Applies collection logic
                    Collection breaks used between each sequence in order to allow time to reset and
                        reposition in order to properly collect each action from start to finish
                    '''
                    is_first_frame = frame_number == 0

                    '''
                    Variables include the image, text, text position, font type, font size, font color,
                        line thickness and line type
                    '''
                    if is_first_frame:
                        cv2.putText(image, 'STARTING COLLECTION', (120, 200),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)

                    cv2.putText(image, f'Collecting frames for {action} Video Number {sequence}', (15, 12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)

                    cv2.imshow('OpenCV Video Camera Feed', image)

                    # For each video a 2 second break will occur upon the first frame of a new video
                    if is_first_frame:
                        cv2.waitKey(2000)

                    # Exports the new keypoints (np.save appends the '.npy' extension to the path)
                    keypoints = extract_keypoints(results)
                    numpy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_number))
                    np.save(numpy_path, keypoints)

                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        stop_collection = True
                        break

                if stop_collection:
                    break

            if stop_collection:
                break

finally:

    # Always frees the camera and closes the GUI window, even when an error occurred above
    image_capture.release()
    cv2.destroyAllWindows()

In [None]:
# Preprocesses Data & Creates Labels & Features

'''
Maps every action to its index number within the original action numpy array; these
    numbers serve as the labels of the saved data for supervised learning purposes
'''
label_map = dict(zip(actions, range(len(actions))))

In [None]:
# Shows the action-to-number mapping for a quick visual check
print(label_map)

In [None]:
'''
A single giant array of values containing all of the data is desired
End up with 90 arrays of videos, each video with 30 frames, each frame having 1662 values representing
    the flattened keypoints for the face, pose and left and right hand coordinates
'''

sequences, labels = [], []

for action in actions:
    for sequence in range(number_of_sequences):

        # Gathers every frame of the current video, in frame order
        window = []

        for frame_number in range(sequence_length):

            # Frames were exported via np.save, which appended the '.npy' extension to the frame number
            result = np.load(os.path.join(DATA_PATH, action, str(sequence), f'{frame_number}.npy'))
            window.append(result)

        # One entry per video: its frames as the features and its action's number as the label
        sequences.append(window)
        labels.append(label_map[action])

In [None]:
# Shows how many labels were gathered (one per loaded video)
print(np.array(labels).shape)

In [None]:
# Features: all loaded videos stacked into a single numpy array
x = np.array(sequences)

In [None]:
# Shows the shape of the feature array for a quick visual check
print(x.shape)

In [None]:
# Targets: the numeric labels converted to categorical (one-hot) integer vectors such as [1, 0, 0]
y = to_categorical(labels).astype(int)

In [None]:
# Holds out 5% of the videos for testing; the fixed random_state makes the split, and therefore
#     the evaluation further below, reproducible across re-runs of the notebook
x_training, x_testing, y_training, y_testing = train_test_split(x, y, test_size = 0.05, random_state = 42)

In [None]:
# Builds & Trains LSTM Neural Network

# Folder (relative to the notebook's working directory) handed to TensorBoard as its log directory
logs_directory = os.path.join('Logs')

# Callback passed to model.fit further below
tensorboard_callback = TensorBoard(log_dir = logs_directory)

In [None]:
'''
Many professional neural networks use CNN layers or pre-trained models followed by LSTM layers, 
    the reasons being that they require less data to produce fairly accurate results, are faster
    to train due to being far less dense in terms of the added layers and therefore connections
    between the available neurons, and because of the simpler neural network, also faster when
    carrying out action detection in real time
'''

# The network is declared as one ordered list of layers:
#     - three stacked LSTM layers; an LSTM must return its sequences whenever the layer stacked
#         after it is also an LSTM, so only the last one returns a single output
#     - the input shape is 30 image frames per prediction, each frame having 1662 keypoint values
#     - two fully connected layers
#     - a softmax output with one value per available action; the values lie within 0 to 1 and
#         add up to 1, the maximum of them being taken as the final answer
#         Example output: [0.7, 0.2, 0.1] each value being for each action available
model = Sequential([
    LSTM(64, return_sequences = True, activation = 'relu', input_shape = (30, 1662)),
    LSTM(128, return_sequences = True, activation = 'relu'),
    LSTM(64, return_sequences = False, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(32, activation = 'relu'),
    Dense(actions.shape[0], activation = 'softmax'),
])

In [None]:
# Illustrates how a model output is decoded: the index of the highest value selects the action
example_result = [0.7, 0.2, 0.1]
print(actions[np.argmax(example_result)])

In [None]:
'''
For multiclass classification models, the categorical crossentropy loss function must be used
For binary classification models, the binary crossentropy loss function is preferred
'''
# Configures training: Adam as the optimizer, categorical crossentropy as the loss and
#     categorical accuracy as the reported metric
model.compile(optimizer = 'Adam', 
              loss = 'categorical_crossentropy', 
              metrics = ['categorical_accuracy'])

In [None]:
# Prints an overview of the model for inspection
model.summary()

In [None]:
# Trains on the training split for 200 epochs, with the TensorBoard callback attached
model.fit(x_training, y_training, epochs = 200, callbacks = [tensorboard_callback])

In [None]:
# Makes Predictions

# Model outputs for every held-out test video
y_hat = model.predict(x_testing)

In [None]:
# Shows the action predicted for the first test video (the closing brackets were formerly
#     swapped, which made this line a syntax error)
print(actions[np.argmax(y_hat[0])])

In [None]:
# Saves Model Weights
# NOTE(review): model.save is called here rather than a weights-only save, so the file presumably
#     holds the whole model and not just its weights - confirm that this is intended
saved_model_weights_folder = '../../Data/Models'
model.save(os.path.join(saved_model_weights_folder, 'sign_language_detection.h5'))

In [None]:
# Loads the weights back from the same file path that the model was saved to
model.load_weights(os.path.join(saved_model_weights_folder, 'sign_language_detection.h5'))

In [None]:
# Evaluates The Model Via Confusion Matrix & Accuracy

# Converting results from [1, 0, 0], [0, 1, 0] and [0, 0, 1] to 0, 1 and 2
y_true = np.argmax(y_testing, axis = 1).tolist()

# Predicts afresh instead of overwriting y_hat in terms of itself, so that re-running this
#     cell does not apply argmax to the already-converted labels and fail
y_hat = np.argmax(model.predict(x_testing), axis = 1).tolist()

In [None]:
# Compares the true labels against the predicted labels of the test split
multilabel_confusion_matrix(y_true, y_hat)

In [None]:
def probability_visualization(result, actions, input_frame, colors):
    """Render one horizontal probability bar per action onto a copy of ``input_frame``.

    Args:
        result: Iterable of probabilities, one per action, in the same order as ``actions``.
        actions: Sequence of action names used to label the bars.
        input_frame: Image to annotate; it is copied and left untouched.
        colors: Sequence of colors indexed by action position; it needs at least as many
            entries as ``result``.

    Returns:
        The annotated copy of ``input_frame``.
    """
    output_frame = input_frame.copy()

    for number, probability in enumerate(result):

        # Filled bar (thickness of -1) whose width in pixels equals the probability as a percentage
        cv2.rectangle(output_frame,
                      (0, 60 + number * 40),
                      (int(probability * 100), 90 + number * 40),
                      colors[number],
                      -1)

        # Name of the action drawn on top of its bar
        cv2.putText(output_frame,
                    actions[number],
                    (0, 85 + number * 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255),
                    2, cv2.LINE_AA)

    # Hands the annotated copy back; without this the caller received None and the drawing was lost
    return output_frame

In [None]:
# Tests Model In Real Time

# Keypoints of the most recent frames, fed to the model once 30 of them are available
sequence = []

# Recently detected actions, rendered at the top of the video feed
sentence = []

# Index of the predicted action for every frame on which a prediction was made
predictions = []

# Minimum probability the winning action needs before it is accepted
threshold = 0.4

image_capture = cv2.VideoCapture(0)

try:

    with mp_holistic.Holistic(min_detection_confidence = 0.5,
                              min_tracking_confidence = 0.5) as holistic:

        while image_capture.isOpened():

            result, frame = image_capture.read()

            # Stops when the camera fails to deliver a frame, as there is then nothing to process
            if not result:
                break

            image, results = mediapipe_detection(frame, holistic)

            draw_landmarks(image, results)

            # Prediction Logic
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-30:]

            # No prediction exists until 30 frames were gathered, so everything that relies
            #     on one stays inside this branch
            if len(sequence) == 30:
                prediction = model.predict(np.expand_dims(sequence, axis = 0), verbose = 0)[0]
                predicted_index = int(np.argmax(prediction))
                print(actions[predicted_index])
                predictions.append(predicted_index)

                # Visualization Logic

                # Checks the last 10 predictions to avoid false detections in the middle of an action
                recent_predictions = predictions[-10:]
                is_stable = (len(recent_predictions) == 10
                             and all(index == predicted_index for index in recent_predictions))

                # If the prediction is stable and its probability is above the threshold
                if is_stable and prediction[predicted_index] > threshold:

                    # Continuous detection is ongoing, so only append if a new action is being
                    #     taken, i.e. the current action is not the last (still ongoing) action
                    if not sentence or actions[predicted_index] != sentence[-1]:
                        sentence.append(actions[predicted_index])

            if len(sentence) > 5:
                sentence = sentence[-5:]

            # Renders the visualization on the screen

            # Variables are starting point, size of box, box color and filling out said box
            cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)

            # Joined with spaces so that consecutive actions do not run together on screen
            cv2.putText(image, ' '.join(sentence), (3, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            cv2.imshow('OpenCV Video Camera Feed', image)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break

finally:

    # Always frees the camera and closes the GUI window, even when an error occurred above
    image_capture.release()
    cv2.destroyAllWindows()