# 1. Importing Libraries

In [1]:
import cv2 #opencv
import numpy as np
import os   #helps with path
from matplotlib import pyplot as plt #to use plt.imshow()
import time              #to measure time between frames 
import mediapipe as mp   


# 2 . drawingutil and hands module

In [2]:
# Short aliases for the two MediaPipe solution modules used by the cells below:
# the drawing helpers and the hand-tracking solution.
mpDraw, mpHands = mp.solutions.drawing_utils, mp.solutions.hands

# 3. Function to detect points on hands and then drawing on the hands

In [3]:
def mediapipe_detection(image, model):
    """Run ``model`` on an OpenCV frame and hand back the frame plus the results.

    The frame is converted from BGR (OpenCV's default channel order) to RGB
    before ``model.process`` is called, and converted back to BGR afterwards.

    Args:
        image: BGR image array.
        model: object exposing ``process(rgb_image)``.

    Returns:
        Tuple ``(bgr_image, results)`` where ``results`` is whatever
        ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Lock the buffer while the model runs, then unlock it again.
    rgb_frame.flags.writeable = False
    detections = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), detections

In [4]:
def draw_landmarks(image, results):
    """Draw every detected hand (landmarks and connections) onto ``image`` in place.

    Args:
        image: image array that is drawn on.
        results: object with a ``multi_hand_landmarks`` attribute; nothing is
            drawn when that attribute is empty or ``None``.
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        return
    # The same two drawing specs are used for every hand.
    landmark_style = mpDraw.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4)
    connection_style = mpDraw.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
    for hand in detected_hands:
        mpDraw.draw_landmarks(image, hand, mpHands.HAND_CONNECTIONS,
                              landmark_style, connection_style)

# 4. function to get handedness ie left or right and to extract point to numpy array

In [5]:
def get_label(index,results):
    """Return the handedness label for ``index``, or ``None`` if there is none.

    Every entry of ``results.multi_handedness`` is scanned; an entry matches
    when its first classification has ``.index == index`` and the last match
    wins. If nothing matches and ``index`` is 0 or 1, the other of those two
    indices is tried exactly once as a fallback.

    Args:
        index: value compared against ``classification[0].index``.
        results: object with a ``multi_handedness`` attribute (may be ``None``
            or empty).

    Returns:
        The matching ``label`` string, or a falsy value (``None``) when neither
        the requested index nor its fallback matches.
    """
    # NOTE(review): callers pass the hand's position in multi_hand_landmarks,
    # while `classification[0].index` is presumably the handedness class id --
    # verify against the MediaPipe documentation that this pairing is intended.
    def _lookup(wanted):
        found = None
        # `or ()` tolerates results without any handedness information.
        for handedness in results.multi_handedness or ():
            top = handedness.classification[0]
            if top.index == wanted:
                found = top.label
        return found

    label = _lookup(index)
    if not label and index in (0, 1):
        # One bounded fallback attempt. The previous implementation called
        # itself with the other index, which called back again, and so on
        # until RecursionError whenever neither index was present.
        label = _lookup(1 - index)
    return label
def extract_keypoints(results):
    """Flatten the detected hands into one fixed-length feature vector.

    The vector has 2 * 21 * 3 = 126 entries: the (x, y, z) values of the hand
    labelled 'Right' first, then those of the hand labelled 'Left'. A hand that
    is not present contributes zeros.

    Args:
        results: object with ``multi_hand_landmarks`` (and whatever
            ``get_label`` reads from it).

    Returns:
        1-D numpy array of length 126.
    """
    right_hand = np.zeros(21*3)
    left_hand = np.zeros(21*3)
    for position, hand in enumerate(results.multi_hand_landmarks or []):
        side = get_label(position, results)
        if side not in ('Right', 'Left'):
            continue
        coords = np.array([[pt.x, pt.y, pt.z] for pt in hand.landmark]).flatten()
        if side == 'Right':
            right_hand = coords
        else:
            left_hand = coords
    return np.concatenate([right_hand, left_hand])

# 5. Number of videos of dataset and videolength

In [6]:
# Number of frame sequences ("videos") built per category
no_videos = 200

# Each video is 30 frames long
video_length = 30

# 6. Defining the dataset path and the path where the processed data is stored (the p1Data folder in the current directory)
Dataset link: <a href="https://drive.google.com/drive/folders/1RZaXXy3pr7YLSv1jKFXDgmNC2nzkgV1P?usp=sharing">DATASET</a>

In [8]:
# Record and show the working directory; the relative paths used below
# ('KgData/...', 'p1Data') are resolved against it.
cwd = os.getcwd()
print(cwd)

C:\Users\91963\Desktop\HandSignRecognition\DEMO


In [11]:
# Root folder of the training images, one sub-folder per sign.
# Built from components so os.path.join supplies the separators (a
# single-argument join with hard-coded '/' does nothing).
datapath = os.path.join('KgData', 'asl4g', 'train')
os.listdir(datapath)

['A',
 'B',
 'C',
 'D',
 'del',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'nothing',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'space',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z']

In [12]:
# Folder (relative to the working directory) where the extracted keypoint
# arrays are written as .npy files
DATA_PATH = os.path.join('p1Data')

In [13]:
# Show what is currently in the working directory.
os.listdir(cwd)

['.ipynb_checkpoints', 'HandGestureRecognition.ipynb', 'KgData']

# 7. defining the categories of signs 

In [7]:
# Signs used in this notebook: letters A-J plus 'del', 'space' and 'nothing'.
# This is a subset of the folders listed under `datapath`; the order defines the class ids.
categories = np.array(['A','B','C','D','E','F','G','H','I','J','del','space','nothing'])

# 8. making directory to store the processed data in p1Data folder

In [39]:
# Create one output folder per (category, video index): p1Data/<category>/<i>.
for category in categories:
    for i in range(no_videos):
        # exist_ok=True keeps the cell re-runnable without swallowing genuine
        # failures (permissions, invalid path) the way a bare `except: pass` did.
        os.makedirs(os.path.join(DATA_PATH, category, str(i)), exist_ok=True)

In [25]:
# os.walk()

# 9. Checking that the data is fetched properly, and checking the camera feed

In [73]:
# Report how many files each selected category folder contains.
for category in categories:
    file_count = len(os.listdir(os.path.join(datapath, category)))
    print(category, " of Length", file_count)

A  of Length 8458
B  of Length 8309
C  of Length 8146
D  of Length 7629
E  of Length 7744
F  of Length 8031
G  of Length 7844
H  of Length 7906
I  of Length 7953
J  of Length 7503
del  of Length 6836
space  of Length 7071
nothing  of Length 7030


In [130]:
# Build two rows of labelled thumbnails (one sample image per category) and
# preview each thumbnail for one second; press 'q' to stop early.
catArray1 = []
catArray2 = []
for position, category in enumerate(categories):
    path = os.path.join(datapath, category)
    sample_file = os.listdir(path)[1]   # second entry of the directory listing
    img = cv2.resize(cv2.imread(os.path.join(path, sample_file)), (180,200))
    cv2.putText(img, category, (10,50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2, cv2.LINE_AA)
    # Categories alternate between the two rows, starting with the first row.
    row = catArray1 if position % 2 == 0 else catArray2
    row.append(img)
    cv2.imshow("frame", img)
    if cv2.waitKey(1000) & 0xFF == ord('q'):
        break
cv2.destroyAllWindows()
    
    

In [131]:
# Join each row of thumbnails side by side and show both strips until a key is pressed.
cata, catb = cv2.hconcat(catArray1), cv2.hconcat(catArray2)
for window_title, strip in (('categories1', cata), ('categories2', catb)):
    cv2.imshow(window_title, strip)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [76]:
# Visual sanity check: show every image next to its landmark overlay, one
# second per image; press 'q' to stop.
stop_requested = False
# A single detector is created and closed by the context manager instead of
# building a new, never-released Hands() graph for every image.
# static_image_mode=True treats each image independently, as the per-image
# detectors did.
with mpHands.Hands(static_image_mode=True) as hands_model:
    for category in categories:
        path = os.path.join(datapath,category)
        for imgpath in os.listdir(path):
            img_array = cv2.imread(os.path.join(path,imgpath))
            if img_array is None:
                # Not a readable image file; skip it instead of crashing in resize.
                continue
            img_array = cv2.resize(img_array,(480,640))
            image, results = mediapipe_detection(img_array, hands_model)
            draw_landmarks(image, results)
            cv2.imshow("frame without detection",img_array)
            cv2.imshow("frame with detection",image)
            if cv2.waitKey(1000) & 0xFF == ord('q'):
                stop_requested = True
                break
        if stop_requested:
            # Leave the category loop too; previously 'q' only skipped to the
            # next category.
            break
cv2.destroyAllWindows()

In [33]:
# Quick webcam check: show the raw and the horizontally mirrored feed until 'q' is pressed.
cap = cv2.VideoCapture(0) #creating video capture object
try:
    while cap.isOpened():
        #reading feed current frame
        ret,frame = cap.read()
        if not ret:
            # No frame delivered (camera busy or unplugged): frame is None and
            # cv2.flip would raise, so stop cleanly instead.
            break
        flipframe = cv2.flip(frame,1)
        cv2.imshow("Open Cv1",flipframe)
        cv2.imshow('o2',frame)
        if cv2.waitKey(10) & 0xFF == ord('q'): #wait 10 ms and check whether q was pressed
            break
finally:
    # Always free the camera and close the windows, even after an error.
    cap.release() #release our webcam
    cv2.destroyAllWindows()

# 10. Extracting keypoints from every frame with the extract_keypoints function. For every category we create 200 videos, and every video contains 30 frames.

In [40]:
# For every category, consume its images in directory-listing order and group
# them into `no_videos` sequences of `video_length` frames. The keypoints of
# each frame are saved to p1Data/<category>/<video>/<frame>.npy.
# Press 'q' in the preview window to abort the export.
stop_requested = False
with mpHands.Hands() as hands_model:
    for category in categories:
        path = os.path.join(datapath,category)
        adl = os.listdir(path)   # file names of this category
        cdl = 0                  # index of the next file to consume
        for i in range(no_videos):
            for j in range(video_length):
                image = cv2.imread(os.path.join(path,adl[cdl]))
                image = cv2.resize(image,(640,480))
                image, results = mediapipe_detection(image, hands_model)
                draw_landmarks(image, results)
                cdl += 1
                cv2.putText(image,'f no {} Vno {} c {}'.format(j,i,category), (10,20),
                                   cv2.FONT_HERSHEY_SIMPLEX,0.5, (150, 0, 255), 1, cv2.LINE_AA)
                cv2.imshow("frame",image)
                # Export keypoints (np.save appends the .npy extension)
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH, category, str(i), str(j))
                np.save(npy_path, keypoints)

                if cv2.waitKey(1) & 0xFF == ord('q'):
                    stop_requested = True
                    break
            if stop_requested:
                break
        if stop_requested:
            # Stop the whole export; previously 'q' abandoned the current
            # category half-written and carried on with the next one.
            break

cv2.destroyAllWindows()

# 11. converting categories using one hot encoding and splitting data into train and test 

In [41]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
# Map every category name to its position in `categories` (used as the class id).
label_map = dict(zip(categories, range(len(categories))))

In [42]:
# Display the category -> class id mapping.
label_map

{'A': 0,
 'B': 1,
 'C': 2,
 'D': 3,
 'E': 4,
 'F': 5,
 'G': 6,
 'H': 7,
 'I': 8,
 'J': 9,
 'del': 10,
 'space': 11,
 'nothing': 12}

In [43]:
# Read the saved keypoints back: one list of `video_length` frame vectors per
# video, plus the class id of the video's category.
videos, labels = [], []
for category in categories:
    for video in range(no_videos):
        video_dir = os.path.join(DATA_PATH, category, str(video))
        window = [
            np.load(os.path.join(video_dir, "{}.npy".format(frame_num)))
            for frame_num in range(video_length)
        ]
        videos.append(window)
        labels.append(label_map[category])

In [44]:
# Stack all videos into one array (videos x frames x keypoint values).
X = np.array(videos)

In [45]:
# 13 categories * 200 videos = 2600 sequences of 30 frames with 126 values each.
X.shape

(2600, 30, 126)

In [46]:
# One-hot encode the integer class ids (one column per category).
y = to_categorical(labels).astype(int)

In [47]:
# Hold out 20% of the sequences for testing. The fixed random_state makes the
# split, and therefore every metric computed from it, reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [48]:
# Test labels: 20% of the 2600 sequences, 13 one-hot columns.
y_test.shape

(520, 13)

In [49]:
# Training inputs: the remaining 80% of the sequences.
X_train.shape

(2080, 30, 126)

# 12. Building model and training

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [9]:
import tensorflow as tf
# Training stops as soon as the training accuracy exceeds this value.
ACCURACY_THRESHOLD = 0.95
class myCallback(tf.keras.callbacks.Callback):
    """Keras callback that stops training once accuracy passes ACCURACY_THRESHOLD."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's 'categorical_accuracy' and request a stop if it is high enough.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: metric dict supplied by Keras; may be ``None``.
        """
        # `logs=None` avoids a shared mutable default; a missing metric is
        # skipped instead of raising TypeError on `None > float`.
        accuracy = (logs or {}).get('categorical_accuracy')
        if accuracy is not None and accuracy > ACCURACY_THRESHOLD:
            print("\nReached %2.2f%% accuracy, so stopping training!!" %(ACCURACY_THRESHOLD*100))
            self.model.stop_training = True

# Instantiate a callback object
callbacks = myCallback()

In [10]:
# TensorBoard callback writing its event files to the 'trainLogsdLstm' folder.
log_dir = os.path.join('trainLogsdLstm')
tb_callback = TensorBoard(log_dir=log_dir)

In [11]:
# Callbacks passed to model.fit: the accuracy-based early stop and TensorBoard logging.
mycallbacks = [callbacks,tb_callback]

In [12]:
# Three stacked LSTM layers over the (30 frames x 126 keypoint values) input,
# followed by a dense head ending in one softmax unit per category.
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=(30,126)),
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(64, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(categories.shape[0], activation='softmax'),
])
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])



In [58]:
# Train for at most 500 epochs; myCallback ends training early once the
# training categorical accuracy exceeds ACCURACY_THRESHOLD. No validation
# data is passed here.
model.fit(X_train, y_train, epochs=500, callbacks=mycallbacks)

Epoch 1/500
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
E

<tensorflow.python.keras.callbacks.History at 0x1f735c0a0c8>

In [59]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 30, 64)            48896     
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 128)           98816     
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2

# 13. Make prediction 

In [60]:
# Class probabilities for every held-out sequence (one row per sequence).
res = model.predict(X_test)

In [61]:
# Predicted category of the first test sequence.
categories[np.argmax(res[0])]

'F'

In [62]:
# True category of the first test sequence, for comparison with the prediction.
categories[np.argmax(y_test[0])]

'F'

# 14. Save Model

In [63]:
# Persist the trained model to 'model2.h5' in the working directory.
model.save('model2.h5')

# Load model

In [13]:
# Load previously trained weights into the already-built `model`.
# NOTE(review): the save cell writes 'model2.h5' while this reads 'model1.h5'
# -- confirm which checkpoint is intended.
model.load_weights('model1.h5')

# 15. Evaluation using Confusion Matrix and Accuracy


In [64]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [65]:
# Predicted class probabilities for the test set.
yhat = model.predict(X_test)

In [66]:
# Collapse the one-hot ground truth and the predicted probability rows to class ids.
# NOTE: `yhat` is overwritten in place, so this cell is not re-runnable on its
# own -- a second run would call argmax(axis=1) on a flat list of ints.
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

In [67]:
# One 2x2 matrix per class, laid out as [[TN, FP], [FN, TP]].
multilabel_confusion_matrix(ytrue, yhat)

array([[[473,   6],
        [  4,  37]],

       [[476,   2],
        [  4,  38]],

       [[475,   2],
        [  5,  38]],

       [[475,   3],
        [  0,  42]],

       [[476,   2],
        [  4,  38]],

       [[485,   2],
        [  1,  32]],

       [[483,   3],
        [  0,  34]],

       [[472,   1],
        [  1,  46]],

       [[474,   1],
        [  7,  38]],

       [[483,   0],
        [  5,  32]],

       [[472,   1],
        [  8,  39]],

       [[487,   2],
        [  3,  28]],

       [[467,  17],
        [  0,  36]]], dtype=int64)

In [68]:
# Fraction of test sequences whose predicted class equals the true class.
accuracy_score(ytrue, yhat)

0.9192307692307692

# 16. Testing in real Time

In [14]:
# OpenCV colour tuples for the probability bars drawn by prob_viz: 13 entries, one per category.
colors = [(245,117,16), (117,245,16), (16,117,245),(166,107,245),(106,167,245),(106,117,205),(161,107,25),(130,187,245),(106,117,205),(106,100,245),(196,107,245), (117,245,16), (16,117,245)]
def prob_viz(res, actions, input_frame, colors):
    """Draw one horizontal probability bar per class on a copy of ``input_frame``.

    Bar ``num`` starts at y = 60 + 40 * num, is 30 px high and ``int(prob*100)``
    px wide; the class name from ``actions`` is written over it.

    Args:
        res: sequence of class probabilities.
        actions: class names, indexed like ``res``.
        input_frame: image array; it is copied, not modified.
        colors: bar colours. The palette is cycled when there are more classes
            than colours, instead of raising IndexError.

    Returns:
        The annotated copy of ``input_frame``.
    """
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        top = 60 + num*40
        bar_color = colors[num % len(colors)]
        cv2.rectangle(output_frame, (0, top), (int(prob*100), top + 30), bar_color, -1)
        cv2.putText(output_frame, actions[num], (0, top + 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

    return output_frame

In [15]:

# Background image for the probability-bar window.
back = cv2.imread('board.jpg')
if back is None:
    # cv2.imread returns None instead of raising when the file is missing or
    # unreadable; fail here with a clear message rather than inside cv2.resize.
    raise FileNotFoundError("board.jpg could not be read from the working directory")
back = cv2.resize(back,(800,800))

In [2]:
# Real-time recognition from the webcam. The cells defining mpHands, the
# helper functions, `categories`, `model`, `colors` and `back` must have been
# run first.

# 1. New detection variables
sequence = []      # rolling window of the most recent keypoint vectors
sentence = []      # accepted categories, consecutive repeats suppressed
threshold = 0.8    # minimum winning probability for a prediction to be accepted

cap = cv2.VideoCapture(0)
try:
    # Set mediapipe model
    with mpHands.Hands() as hand:
        while cap.isOpened():
            board = back
            # Read feed
            ret, frame = cap.read()
            if not ret:
                # No frame delivered: frame is None and the colour conversion
                # would raise, so stop cleanly.
                break
            # Make detections
            image, results = mediapipe_detection(frame,hand)

            # Draw landmarks
            draw_landmarks(image, results)
            # 2. Prediction logic
            keypoints = extract_keypoints(results)

            sequence.append(keypoints)
            sequence = sequence[-30:]   # keep only the last 30 frames

            if len(sequence) == 30:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                best = np.argmax(res)
                # 3. Viz logic: accept confident predictions that differ from
                # the last accepted category.
                if res[best] > threshold:
                    if not sentence or categories[best] != sentence[-1]:
                        sentence.append(categories[best])

                if len(sentence) > 30:
                    sentence = sentence[-25:]

                # Viz probabilities
                board = prob_viz(res, categories,board, colors)

            cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, ''.join(sentence).replace('-',' '), (3,30),
                           cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Show to screen
            cv2.imshow('OpenCV Feed', image)
            cv2.imshow('viz',board)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the windows, even after an error.
    cap.release()
    cv2.destroyAllWindows()

NameError: name 'mpHands' is not defined