In [None]:
# 1. Do this the first time only to install dependencies
!pip install tensorflow opencv-python scikit-learn numpy mediapipe

In [None]:
# 2
import cv2
import tensorflow as tf
import mediapipe as mp
import os
import numpy as np
from matplotlib import pyplot as plt
import time

## Initializing Detection Model and Drawing Util

In [None]:
# 3
# Short aliases for the MediaPipe Holistic solution module and the MediaPipe
# drawing helpers; every later cell refers to them through these names.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

## Defining detection model and drawing tool

In [None]:
# 4
def mediapipe_detection(image, model):
    """Run a MediaPipe model on a single OpenCV frame.

    Args:
        image: Frame in OpenCV's BGR channel order (e.g. from ``cap.read()``).
        model: Object exposing ``process(rgb_image)``, such as the Holistic
            instance created in the capture cells.

    Returns:
        A tuple ``(image, results)``: the frame converted back to BGR so it
        can be drawn on and shown with OpenCV, and whatever
        ``model.process`` returned.
    """
    # The model is fed RGB, while OpenCV captures BGR.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Mark the frame read-only while the model processes it, then restore
    # write access so callers can draw on the returned image.
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    # Convert back for OpenCV display. RGB2BGR swaps the same two channels as
    # BGR2RGB, but states the actual direction of this conversion.
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

In [None]:
# 5
def draw_landmarks(image, results):
    """Draw the left- and right-hand landmarks and their connections on `image`.

    The image is modified in place; nothing is returned.
    """
    # Left hand first, then right hand, each with the hand connection set.
    for hand_landmarks in (results.left_hand_landmarks, results.right_hand_landmarks):
        mp_drawing.draw_landmarks(image, hand_landmarks, mp_holistic.HAND_CONNECTIONS)

## Execution Test

In [None]:
# Webcam smoke test: show the live feed with hand landmarks until Q is pressed.
cap = cv2.VideoCapture(0)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while True:
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered (camera missing, busy or unplugged);
                # passing the empty frame on would crash in cvtColor.
                print("Could not read a frame from the webcam; stopping.")
                break

            image, results = mediapipe_detection(frame, holistic)
            draw_landmarks(image, results)

            cv2.imshow("OpenCV Feed", image)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if a frame failed.
    cap.release()
    cv2.destroyAllWindows()

## Extracting keypoints

In [None]:
# 6
def extract_keypoints(results):
    """Flatten both hands' landmarks into one feature vector.

    Each detected hand contributes its landmarks' (x, y, z) values in order;
    a hand that was not detected contributes 21 * 3 zeros. The left-hand
    values come first, followed by the right-hand values.
    """
    def _hand_vector(hand):
        # Missing hand -> zero placeholder of the size of one hand.
        if not hand:
            return np.zeros(21 * 3)
        coords = [[point.x, point.y, point.z] for point in hand.landmark]
        return np.array(coords).flatten()

    left = _hand_vector(results.left_hand_landmarks)
    right = _hand_vector(results.right_hand_landmarks)
    return np.concatenate([left, right])

## Directories for collecting Data

In [None]:
# 7
# Root folder under which the keypoint files are written and read back.
data_dir = os.path.join('data')

# J and Z are not included (motion-based letters).
letters = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y'])  
# Number of sequences per letter that the preprocessing cell loads.
# Original note: (25 lh, 25 rh) = total(50) -- presumably 25 recordings per hand; TODO confirm.
no_sequences = 25
# Number of frames in each recorded sequence.
sequence_length = 10

In [None]:
# Create data/<letter>/<sequence>/ for every letter and sequence index.
# exist_ok=True makes the cell safe to re-run while still surfacing real
# failures (permissions, invalid path) that a bare `except: pass` would hide.
#
# To prepare folders for a second batch (e.g. left hand [25] then right
# hand [25]), replace the range below with an explicit one such as range(25, 50).
for letter in letters:
    for sequence in range(no_sequences):
        os.makedirs(os.path.join(data_dir, letter, str(sequence)), exist_ok=True)

## Collecting data from the webcam

In [None]:
# Record `no_sequences` sequences of `sequence_length` frames for every letter
# and save the keypoints of each frame to data/<letter>/<sequence>/<frame>.npy.
# Press S to start recording a letter, Q to abort the whole collection.
cap = cv2.VideoCapture(0)
stop_requested = False  # set when Q is pressed or the camera stops delivering frames

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for letter in letters:
            # Live preview until the user presses S. Frames shown here are not
            # saved, so the first recorded frame is captured after the user is
            # ready rather than being a stale frame from before the key press.
            while not stop_requested:
                ret, frame = cap.read()
                if not ret:
                    print("Could not read a frame from the webcam; stopping collection.")
                    stop_requested = True
                    break

                image, results = mediapipe_detection(frame, holistic)
                draw_landmarks(image, results)
                cv2.putText(image, 'START Collection for letter \'{}\', press S.'.format(letter), (120, 200),
                            cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
                cv2.imshow("OpenCV Feed", image)

                key = cv2.waitKey(1) & 0xFF
                if key == ord('s'):
                    break
                if key == ord('q'):
                    stop_requested = True
            if stop_requested:
                break

            # To record a second batch into other folders (e.g. the other
            # hand), replace this range with an explicit one matching the
            # folders created earlier.
            for sequence in range(no_sequences):
                for frame_number in range(sequence_length):
                    ret, frame = cap.read()
                    if not ret:
                        print("Could not read a frame from the webcam; stopping collection.")
                        stop_requested = True
                        break

                    image, results = mediapipe_detection(frame, holistic)
                    draw_landmarks(image, results)
                    cv2.putText(image, 'Collecting for letter \'{}\' video number {}'.format(letter, sequence), (120, 200),
                                cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)

                    # np.save appends the .npy extension to the path.
                    keypoints = extract_keypoints(results)
                    np_path = os.path.join(data_dir, letter, str(sequence), str(frame_number))
                    np.save(np_path, keypoints)

                    cv2.imshow("OpenCV Feed", image)

                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        # NOTE: the sequence being recorded is left incomplete on disk.
                        stop_requested = True
                        break
                if stop_requested:
                    break
            if stop_requested:
                break
finally:
    # Always free the camera and close the window, even after an error.
    cap.release()
    cv2.destroyAllWindows()

In [None]:
# Manual cleanup: releases the capture object and closes all OpenCV windows.
# Useful when the collection cell was interrupted before reaching its own cleanup.
cap.release()
cv2.destroyAllWindows()

## Preprocess the collected data

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [None]:
# Map each letter to its position in `letters`; that position is the class id.
label_map = dict(zip(letters, range(len(letters))))

In [None]:
# Rebuild the dataset from disk: one window (list of per-frame keypoint arrays)
# per recorded sequence, together with the class id of its letter.
sequences, labels = [], []
for letter in letters:
    letter_dir = os.path.join(data_dir, letter)
    for sequence in range(no_sequences):
        frame_paths = (
            os.path.join(letter_dir, str(sequence), '{}.npy'.format(frame_number))
            for frame_number in range(sequence_length)
        )
        sequences.append([np.load(path) for path in frame_paths])
        labels.append(label_map[letter])

In [None]:
# Sanity check; expected (len(letters) * no_sequences, sequence_length, 126) if every file loaded.
np.array(sequences).shape

In [None]:
# Sanity check: one label per loaded sequence.
np.array(labels).shape

In [None]:
# Feature array built from the loaded windows.
X = np.array(sequences)
X.shape

In [None]:
# One-hot encode the integer class ids and cast the result to int.
y = to_categorical(labels).astype(int)
y

In [None]:
# Hold out 5% of the sequences for testing.
# random_state makes the split reproducible across kernel restarts; stratify
# keeps the class proportions, so the small test set covers the letters
# instead of leaving some out by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42, stratify=labels
)

In [None]:
# Size of the held-out set: (number of test sequences, number of classes).
y_test.shape

## Build and Train LSTM model

In [None]:
# 8
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [None]:
# 9
# TensorBoard callback that writes its logs to the local 'logs' folder.
log_dir = os.path.join('logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [None]:
# 10
# Per-frame feature size produced by extract_keypoints:
# 2 hands x 21 landmarks x 3 coordinates = 126.
n_features = 2 * 21 * 3

# Three stacked LSTM layers followed by a dense classifier with one softmax
# output per letter. The input shape follows the configured sequence length
# instead of repeating it as a literal.
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=(sequence_length, n_features)),
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(64, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(len(letters), activation='softmax'),
])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# 11
# Categorical cross-entropy matches the one-hot labels and softmax output.
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
# 12
# Load previously saved weights when they exist. On a first run there is no
# asl.h5 yet, so skip loading instead of crashing before training can start.
if os.path.exists("asl.h5"):
    model.load_weights("asl.h5")
else:
    print("asl.h5 not found: the model keeps its initial weights. Train it before running recognition.")

In [None]:
# Train on the training split for 2000 epochs, logging to TensorBoard.
# NOTE(review): no validation data or early stopping is configured here.
model.fit(X_train, y_train, epochs=2000, callbacks=[tb_callback])

## Make predictions

In [None]:
# Predictions for every held-out sequence, one row per sequence.
res = model.predict(X_test)

In [None]:
# Predicted letter for test sample 29.
# NOTE(review): index 29 assumes the test split holds at least 30 samples.
letters[np.argmax(res[29])]

In [None]:
# True letter for the same test sample, for comparison with the prediction.
letters[np.argmax(y_test[29])]

In [None]:
# Class id with the highest predicted value for the first test sample.
np.argmax(res[0])

## Save Model

In [None]:
# Save the model twice: asl.h5 is the file the load_weights cell reads back,
# asl.keras is a second copy under the .keras extension.
model.save('asl.h5')
model.save('asl.keras')

## Evaluate the Model

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [None]:
# Model output for the held-out set; converted to class ids in the next cell.
yhat = model.predict(X_test)

In [None]:
# Collapse one-hot labels and predicted rows to class ids.
# Note: yhat is overwritten in place, so re-running this cell without
# re-running the predict cell above will fail.
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

In [None]:
# Confusion matrices computed from the true and predicted class ids.
multilabel_confusion_matrix(ytrue, yhat)

In [None]:
# Fraction of held-out sequences classified correctly.
accuracy_score(ytrue, yhat)

## Test Model

In [None]:
# 13 Main execution part

# Live recognition: classify a sliding window of the latest frames and show
# the recognised letter in a banner on the webcam feed. Press Q to quit.
predictions = []       # most recent predicted class ids
sequence = []          # sliding window of per-frame keypoint vectors
sentence = []
threshold = 0.8        # minimum softmax score required to display a letter
stable_window = 10     # number of recent predictions that must agree
letter = 'no input'    # banner text until the first stable, confident prediction

cap = cv2.VideoCapture(0)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered; stop instead of crashing in cvtColor.
                print("Could not read a frame from the webcam; stopping.")
                break

            image, results = mediapipe_detection(frame, holistic)
            draw_landmarks(image, results)

            # Keep only the most recent `sequence_length` frames.
            sequence.append(extract_keypoints(results))
            sequence = sequence[-sequence_length:]

            if len(sequence) == sequence_length:
                # verbose=0 keeps Keras from printing a progress bar per frame.
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                best = int(np.argmax(res))
                predictions.append(best)
                predictions = predictions[-stable_window:]  # bound memory use

                # Update the banner only when every recent prediction agrees
                # with the current one. np.unique(...)[0] would only compare
                # against the smallest class id seen recently.
                if all(previous == best for previous in predictions):
                    letter = letters[best] if res[best] > threshold else 'no input'

            # Banner spans the actual frame width rather than a fixed 640 px.
            cv2.rectangle(image, (0, 0), (image.shape[1], 40), (245, 117, 16), -1)
            cv2.putText(image, str(letter), (3, 30), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            cv2.imshow("ASL Recognition", image)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even after an error.
    cap.release()
    cv2.destroyAllWindows()

In [None]:
## 14 --- Use this in case webcam is still running after closing the program 
# Manual cleanup: releases the capture object and closes all OpenCV windows.
cap.release()
cv2.destroyAllWindows()