# 1. Import and Install Dependencies

#### In step 1. we are going to make sure we installed all the nesseray libraries that we need

In [1]:
!pip install tensorflow==2.4.1 opencv-python mediapipe sklearn matplotlib



In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

## 2. Keypoints using MP Holistic

#### In step 2. we're going to make sure that we can access our webcam using opencv and then what we're going to apply a secondary layer in which we're going to make detections using mediapipe

In [2]:
#setting up media pipe holistics

mp_holistic = mp.solutions.holistic #bringing in a holistic model to make our detections
mp_drawing = mp.solutions.drawing_utils #our drawing utilities to draw the previous made detections

In [3]:
# media pip holistic detection function
# first grab the image we convert it from bgr to rgb then set it to unwritable so this saves a little bit of memory
# then we make our detection convert it or set it back to writable 
# then convert it from rgb to bgr so by default when we get a feed from opencv
# it reads that feed in the channel format of bgr so blue green red 
# but when we actually go to make a detection using mediapipe we need it to be in the format of rgb 

def mediapipe_detection(image, model):
    image= cv2.cvtColor(image, cv2.COLOR_BGR2RGB) #color conversion from bgr to rgb
    image.flags.writeable = False #set our image writable status to false 
    results = model.process(image) #detecting using media pip to make predictions
    image.flags.writeable = True #image is now writable again
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) #color conversion from rgb to bgr
    return image, results

In [4]:
#we're going to grab the results from the detecting model 
#and render them onto the image so we can actually see our different landmarks

def draw_landmarks(image, results):
    #helper function that comes with mediapipe that makes it easier to draw landmarks onto an image 
    #mp_holistic shows what landmark is connected to what other landmarks
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACE_CONNECTIONS)#draw face connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)#draw pose connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)#draw left hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)#draw right hand connections

In [5]:
#same as the drawing function but with some updates on the formatting of the landmarks
#specifiying the colors,thickniss and radious

def draw_styled_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACE_CONNECTIONS,
                              mp_drawing.DrawingSpec(color= (80, 110, 10), thickness=1, circle_radius=1),
                              mp_drawing.DrawingSpec(color= (80, 256, 121), thickness=1, circle_radius=1),
                             )
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                              mp_drawing.DrawingSpec(color= (80, 22, 10), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color= (80, 44, 121), thickness=2, circle_radius=2),                              
                             )
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color= (121, 22, 76), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color= (121, 44, 250), thickness=2, circle_radius=2),                              
                             )
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color= (245, 117, 66), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color= (245, 66, 230), thickness=2, circle_radius=2),                              
                             )       

In [7]:
#we're doing to set up a video capture then loop through every single frame and render it to the screen 
#so the looping frames is going to look like a video as the basic idea of a video is, just multiple frames stacked together 

cap = cv2.VideoCapture(0) #accessing our webcam

#with statement to be able to access media pip holistic model
#the model will make an initial detection and then from there it'll track the key points
                           #our intial detection          #tracking confidance
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    
    while cap.isOpened(): #double checking that we're still accessing our webcam

        ret, frame = cap.read() #reading the feed of the webcam

        image, results= mediapipe_detection(frame, holistic) #make prediction using media pip
        print(results)
        
        draw_styled_landmarks(image, results)#draw styled landmarks with different colors function above

        cv2.imshow('Project Feed', image) #show to the frame (image) to the screen

        if cv2.waitKey(10) & 0xFF == ord('q'): #breaking the loop by waiting for a key ('q') to be pressed
            break

    cap.release() #release our webcam
    cv2.destroyAllWindows()#closing down our frame
    cv2.waitKey(1)

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [8]:
draw_landmarks(frame, results)

In [11]:
plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

## 3. Extract Keypoint Values

#### In step 3. extracting the keypoint values into a format that we're able to use

In [6]:
def extract_keypoints(results):
    #convert it all into one big array to be in this particular format when pass it to our lstm model
    pose= np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    #it is going to give an error if we don't have our hand in the frame so we are going to replace the blank array with 0s one 
    #so if we have results it will extrct those values else it will replace it with zero array
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    #oncatenate pose face left hand and right hand to use those key points to actually do our sign language detection
    return np.concatenate([pose, face, lh, rh])

## 4. Setup Folders for Collection

#### Step 4. setting up our folders for our array collection so to output a result of these key points so our key points are effectively going to form our frame values to use those extracted key points to go and decode our sign language 

In [7]:
DATA_PATH = os.path.join('Signs_Data') #path for the exported data(numpy arrays)

actions = np.array(['Hello', 'Goodbye', 'Nice To Meet You', 'Thanks', 'House'])#actions that we try to detect

no_sequences = 50 #collecting 50 videos of data for each action

sequence_length = 25 #25 frames,25 different sets of key points to be able to classify that action

In [11]:
#store our data
#create one folder for each action
#and within each folder we are going to have a folder for each sequance of action so 50 folders

#loop through all of our different actions
for action in actions:
    #loop through the 50 different videos that we're going to be collecting or 50 different frame sets
    for sequence in range(no_sequences):
        try:
            #create a new folder called mp_data and then it's going to create a sub folder per action
            #then it'll create a sequence folder
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

## 5. Collect Keypoint Values for Training and Testing

#### Step 5. it will take a snapshot at each point in time so we're going to loop through each one of our actions and collect our actions and then loop through and collect a set of frames per video (25 frames per video) then collect 50 videos and we're going to do that three times for each action

#### it will say starting collection we'll then get two seconds to get into position then perform our action for 30 frames it'll then go to starting collection again so we're going to do that 30 times per action and that will give us 30 frames for 30 sequences for each individual of our three different actions 

In [None]:
cap = cv2.VideoCapture(0)

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    #loop through our actions
    for action in actions:
        #loop through our sequences(videos:50)
        for sequence in range(no_sequences):
             #loop through our sequences length(video length:25)
            for frame_num in range(sequence_length):
                #read the feed
                ret, frame = cap.read()
                #make detections
                image, results= mediapipe_detection(frame, holistic)

                #draw formatted landmarks
                draw_styled_landmarks(image, results)
                #for each video if we're at frame 0 we're going to take a break(2000: 2 sec)
                if frame_num == 0:
                    #outputting text to our screen
                    #print started collecting
                    cv2.putText(image, 'STARTING COLLECTION', (120,200), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)
                    #print the action we are collecting for and the number of video we are at
                    cv2.putText(image, 'Collecting Frames for {} Video Number {}'.format(action, sequence), (15,12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)  
                    cv2.imshow('Project Feed', image)
                    #break at 2secs
                    cv2.waitKey(2000)
                    
                else:
                    cv2.putText(image, 'Collecting Frames for {} Video Number {}'.format(action, sequence), (15,12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)  
                    cv2.imshow('Project Feed', image)
                
                #apply keypoint extraction    
                Keypoints = extract_keypoints(results)
                #save the keypoints values to the folders
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                np.save(npy_path, Keypoints)
                #exit when ('q') is pressed
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
    #release the webcame
    cap.release()
    #close the frame window
    cv2.destroyAllWindows()
    cv2.waitKey(1)

## 6. Preprocess Data and Create Labels and Features

#### Step 6. import train test split from scikit learn to allow us to create a training and a testing sets then we're going to import the two categorical function from keras utilities to help us with our labels

In [8]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [9]:
#create a label array or a label dictionary to represent each one of our different actions
label_map = {label:num for num, label in enumerate(actions)}

In [10]:
label_map

{'Hello': 0, 'Goodbye': 1, 'Nice To Meet You': 2, 'Thanks': 3, 'House': 4}

In [11]:
#creating two blank arrays for sequences and labels
#sequences is going to represent our feature data or our x data
#labels is effectively going to represent our labels or our y data
#use our features and train a model to represent the relationship between our labels
sequences, labels = [], []
#loop through each of our actions
for action in actions:
    #loop through each of our sequences(videos:50)
    for sequence in range(no_sequences):
        #blank array to represent all of the different frames that we got for that particular sequence
        window = []
        #loop through each one of the frames(25 frames length)
        for frame_num in range(sequence_length):
            #load up that frame and add it to the window
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)
            
            
        sequences.append(window)
        labels.append(label_map[action])


In [12]:
np.array(sequences).shape

(250, 25, 1662)

In [13]:
np.array(labels).shape

(250,)

In [14]:
X = np.array(sequences)

In [15]:
X.shape

(250, 25, 1662)

In [16]:
y = to_categorical(labels).astype(int)

In [17]:
y

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1]])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05)

In [19]:
y_train.shape

(237, 5)

## 7. Build and Train LSTM Neural Network

#### Step 7. training our lstm neural network

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [26]:
#tensorboard callback is a web app that's offered as part of the tensorflow package
#that allows you to monitor your neural network training and accuracy
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir = log_dir)

In [21]:
#neural network set up

#first instantiating the model 
#using Sequential makes it easy to build up your model
model = Sequential()

#adding three sets of lstm layers
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(25,1662)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))

#adding three sets of dense layers
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

In [22]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics= ['categorical_accuracy'])

In [29]:
model.fit(X_train, y_train, epochs=300, callbacks=[tb_callback])

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

KeyboardInterrupt: 

In [30]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 25, 64)            442112    
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 128)           98816     
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 165       
Total params: 596,741
Trainable params: 596,741
Non-trainable params: 0
__________________________________________________

## 8. Make predictions

#### Step 8. making predictions

In [24]:
res = model.predict(X_test)

In [25]:
actions[np.argmax(res[2])]

'Goodbye'

In [26]:
actions[np.argmax(y_test[2])]

'Goodbye'

## 9. Save Weights

#### Step 9. save our model 

In [36]:
#save model
model.save('sign_language_model.h5')

In [None]:
#delete model
del model

In [23]:
#reloading model
model.load_weights('sign_language_model.h5')

## 10. Evaluation using Confusion Matrix and Accuracy

#### Step 10. evaluation to see how this model is performing. import a couple of metrics from scikit learn to evaluate the performance of the model


In [37]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [38]:
yhat = model.predict(X_train)

In [39]:
ytrue = np.argmax(y_train, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

In [40]:
multilabel_confusion_matrix(ytrue, yhat)

array([[[188,   0],
        [  5,  44]],

       [[187,   5],
        [  0,  45]],

       [[189,   0],
        [  0,  48]],

       [[191,   0],
        [  0,  46]],

       [[188,   0],
        [  0,  49]]])

In [41]:
accuracy_score(ytrue, yhat)

0.9789029535864979

## 11. Test in Real Time

#### Step 11. perform a real time detector

In [27]:
#coloring the action strings
colors = [(245,117,16), (117,245,16), (16,117,245), (204,204,255), (255, 255, 102)]
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    #all of our different probabilities
    for num, prob in enumerate(res):
        #drawing a dynamic rectangle
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        #output text
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
       
    #cv2.rectangle(output_frame, (0,60+np.argmax(res)*40), (int(prob*100), 90+num*40), colors[num], -1)
    return output_frame

In [28]:
#collect our 25 frames in order to be able to generate a prediction
sequence = []
#allow us to concatenate our history of detections together 
sentence = []
#
predictions = []
#render results if they're above a certain threshold
threshold = 0.8

cap = cv2.VideoCapture(0)

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        #read feed
        ret, frame = cap.read()
        #make predictions
        image, results = mediapipe_detection(frame, holistic)
        #draw formated landmarks
        draw_styled_landmarks(image, results)
        #prediction logic
        #extracting keypoints
        keypoints = extract_keypoints(results)
        #appending our key points to the end of the sequance
        sequence.append(keypoints)
        #to grab our last 25 frames to be able to generate our prediction 
        sequence = sequence[-25:]
        #if the length of length of the sequence=25 then and only then will we run a prediction model
        if len(sequence) == 25:
            #generate prediction model
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))
            
            if np.unique(predictions[-10:])[0]==np.argmax(res): 
                #visualization logic
                #check if our result is above the threshold
                if res[np.argmax(res)] > threshold: 
                    #checking whether or not we've more than a certain number of words or whether or not we've got words
                    if len(sentence) > 0:
                        #checking if the current action does not equal the last sentence in our string to avoid double
                        if actions[np.argmax(res)] != sentence[-1]:
                            #append the current detected action onto our sequatnce array
                            sentence.append(actions[np.argmax(res)])
                    else:
                        #append the initial action
                        sentence.append(actions[np.argmax(res)])
            if len(sentence) > 5: 
                #grabbing the last five values don't end up with this giant array to render
                sentence = sentence[-5:]

                
            image = prob_viz(res, actions, image, colors)
            
        #show the screen
        cv2.imshow('Project Feed', image)
        #break if ('q') is pressed
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
            
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)

Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Goodbye
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
Hello
House
House
House
House
House
House
Nice To Meet You
Nice To Meet You
Nice To Meet You
Nice To Meet You
Goodbye
Hello
Hello
Hello
Goodbye
Hello
Hello
Hello
Hello
Hello
House
House
House
House
House
Goodbye
Goodbye
Hello
Hello
Hello
Hello
Hello
Hello
House
Goodbye
Goodbye
Hello
Hello
House
House
House
House
House
House
House
House
House
Nice To Meet You
Goodbye
Goodbye
Goodbye
Hello
Hello
Hello
Hello
House
House
House
House
House
House
House
House
House
House
House
House
House
Goodbye
Nice To Meet You
Nice To Meet You
Nice To Mee