Importing Dependencies

In [None]:
import cv2
import time
import mediapipe as mp
import numpy as np
import matplotlib.pyplot as plt
import os

import pandas as pd
import torch
import torch.nn as nn
import tensorflow as tf
from tensorflow import keras

from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras.utils import to_categorical

from tqdm import tqdm
from keras import regularizers

Defining necessary functions

In [None]:
#Handland takes the mediapipe output landmarks, collates all the coordinates and flattens it to create an array of size 63

def handland(results):
    """Flatten the landmarks of the first detected hand into a 1-D array.

    Args:
        results: object exposing ``multi_hand_landmarks`` (as returned by
            ``hands.process``); each entry has a ``landmark`` sequence whose
            items carry ``x``, ``y`` and ``z`` attributes.

    Returns:
        A 1-D numpy array ``[x0, y0, z0, x1, y1, z1, ...]`` for the first hand
        (three values per landmark), or ``np.zeros(63)`` when no hand was
        detected.
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        # No hand in the frame: keep the feature size the models expect.
        return np.zeros(63)

    coords = [[point.x, point.y, point.z] for point in detected_hands[0].landmark]
    return np.array(coords).flatten()

def handscore(results):
    """Return the handedness classification score of the first detected hand.

    Args:
        results: object exposing ``multi_handedness`` (as returned by
            ``hands.process``); each entry has a ``classification`` sequence
            whose items carry a ``score`` attribute.

    Returns:
        The ``score`` of the first classification of the first hand, or
        ``0.0`` when ``multi_handedness`` is ``None`` or empty (no hand in
        the frame), mirroring how ``handland`` falls back to zeros.
    """
    handedness = results.multi_handedness
    if not handedness:
        # Without this guard, indexing None would raise a TypeError.
        return 0.0
    return handedness[0].classification[0].score

#Choose is basically the combined model. We have three different custom models and the combination of the three seems to be working better.

def choose(a, b, c):
    """Combine the predictions of the three models into a single letter.

    Args:
        a: prediction of model 1 (a one-character string, or '' for none).
        b: prediction of model 2.
        c: prediction of model 3.

    Returns:
        '' when ``a`` is empty; otherwise ``b`` when models 2 and 3 agree on a
        non-empty prediction; otherwise ``a``.
    """
    if a == '':
        return ''
    # Models 2 and 3 out-vote model 1 only when they agree with each other.
    # Comparing the strings directly avoids the ord()/chr() round-trip, which
    # raised a TypeError whenever b or c was an empty string.
    if b != '' and b == c:
        return b
    return a


#Augmentation Function

from PIL import Image as PILImage
def augment_image(image, rotation_angles=(-15, -5, 5, 15), blur_kernel=(5, 5)):
    """Create augmented variants of an image.

    Args:
        image: image array; assumed to be in OpenCV's BGR channel order
            (TODO confirm against the caller).
        rotation_angles: angles passed to ``PIL.Image.rotate`` one by one.
            The default matches the original hard-coded
            ``np.arange(-15, 16, 10)``.
        blur_kernel: Gaussian kernel size ``(width, height)`` passed to
            ``cv2.GaussianBlur``.

    Returns:
        A list of arrays: the channel-swapped original, one rotated copy per
        angle, and one Gaussian-blurred copy, in that order.
    """
    # Convert image from BGR to RGB. COLOR_BGR2RGB and COLOR_RGB2BGR perform
    # the same channel swap; the name now matches the intended direction.
    original_image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The unmodified (channel-swapped) image is always the first entry.
    augmented_images = [original_image_rgb]

    # Horizontal flipping is left disabled, as in the original code:
    # flipped_image = cv2.flip(original_image_rgb, 1)
    # augmented_images.append(flipped_image)

    # Rotate the image by each requested angle. The PIL image is built once
    # because rotate() returns a new image and leaves the source untouched.
    pil_image = PILImage.fromarray(original_image_rgb)
    for angle in rotation_angles:
        augmented_images.append(np.array(pil_image.rotate(angle)))

    # Gaussian-blurred copy; sigma 0 lets OpenCV derive it from the kernel size.
    augmented_images.append(cv2.GaussianBlur(original_image_rgb, blur_kernel, 0))

    return augmented_images

Mediapipe Functions

In [None]:
# Aliases for the MediaPipe solution modules used in this notebook.
mp_holistic = mp.solutions.holistic
mp_hands = mp.solutions.hands

# Drawing helpers used to render the detected hand landmarks and their
# connections on the image.
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# Holistic model instance, created with low detection/tracking thresholds.
holistic_model = mp_holistic.Holistic(
    min_detection_confidence=0.1,
    min_tracking_confidence=0.1,
)

Loading the model weights

In [None]:
#We load the weights for all the 3 custom models here

#The efficiency of the models decreases as the model number increases, i.e. Model 1 is supposed to be the best and Model 3 the worst
#Model 1 is trained on the custom train dataset we created
#Model 2 is trained on the augmented dataset we obtained from the YOLO train data
#Model 3 is trained on a much bigger dataset + augmentation, but the quality of that dataset was not up to the mark and it often used wrong signs

def baseline_model():
  """Build and compile the classifier shared by all three models.

  Architecture: 63 input features -> Dense(46, relu, L1 regularisation 0.01)
  -> Dense(26, softmax). Compiled with categorical cross-entropy, the Adam
  optimizer and the accuracy metric.

  Returns:
      A compiled Keras ``Sequential`` model with freshly initialised weights.
  """
  hidden_layer = Dense(
      46,
      input_dim=63,
      activation='relu',
      kernel_regularizer=regularizers.l1(0.01),
  )
  output_layer = Dense(26, activation='softmax')

  model = Sequential([hidden_layer, output_layer])
  model.compile(
      loss='categorical_crossentropy',
      optimizer='adam',
      metrics=['accuracy'],
  )
  return model

# Build three identical networks, then load each one's weights file.
model1, model2, model3 = (baseline_model() for _ in range(3))

for _net, _weights_file in (
    (model1, "wcdata_2layers.h5"),
    (model2, "augdata_2layer.h5"),
    (model3, "bigdataASLNN26.h5"),
):
  _net.load_weights(_weights_file)

Streaming from webcam and using the combined model to predict the sign

In [None]:
# This version displays the output of the combined model (`sout`). To show a
# single model instead, pass sout1, sout2 or sout3 as the second argument of
# cv2.putText.

L = []    # combined prediction for every frame in which a hand was detected
Con = []  # handedness score for the same frames

# For webcam input:
cap = cv2.VideoCapture(0)
try:
  with mp_hands.Hands(
      max_num_hands=1,
      model_complexity=0,
      min_detection_confidence=0.5,
      min_tracking_confidence=0.5) as hands:
    previousTime = 0
    currentTime = 0
    while cap.isOpened():
      success, image = cap.read()
      if not success:
        print("Ignoring empty camera frame.")
        # If loading a video, use 'break' instead of 'continue'.
        continue

      # Mirror the frame, mark it read-only, and convert BGR -> RGB before
      # handing it to MediaPipe.
      image = cv2.flip(image, 1)
      image.flags.writeable = False
      image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
      results = hands.process(image)

      if results.multi_hand_landmarks:
        # One row of 63 landmark coordinates, the input size of the models.
        H = np.reshape(handland(results), (1, 63))
        # Class index 0..25 -> 'A'..'Z' (65 is ord('A')).
        sout1 = chr(np.argmax(model1(H)) + 65)
        sout2 = chr(np.argmax(model2(H)) + 65)
        sout3 = chr(np.argmax(model3(H)) + 65)
        # Combined prediction; `sout` was previously used without being
        # defined, which raised a NameError on the first detected hand.
        sout = choose(sout1, sout2, sout3)
        Con.append(handscore(results))
        L.append(sout)
      else:
        sout1 = ''
        sout2 = ''
        sout3 = ''
        sout = ''

      # Calculating the FPS (computed only; it is not drawn on the image).
      currentTime = time.time()
      elapsed = currentTime - previousTime
      fps = 1 / elapsed if elapsed > 0 else 0.0
      previousTime = currentTime

      # Displaying the prediction on the image. The frame is flipped again
      # right before imshow, so the text is drawn on a flipped copy to make
      # it read correctly in the displayed window.
      image = cv2.flip(image, 1)
      cv2.putText(image, sout, (10, 70), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2)
      image = cv2.flip(image, 1)

      # Draw the hand annotations on the image.
      image.flags.writeable = True
      image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
      if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
          mp_drawing.draw_landmarks(
              image,
              hand_landmarks,
              mp_hands.HAND_CONNECTIONS,
              mp_drawing_styles.get_default_hand_landmarks_style(),
              mp_drawing_styles.get_default_hand_connections_style())

      # Flip the image horizontally once more before showing it.
      cv2.imshow('MediaPipe Hands', cv2.flip(image, 1))
      if cv2.waitKey(5) & 0xFF == ord('q'):
        break
finally:
  # Always free the camera and close the window, even if the loop raised.
  cap.release()
  cv2.destroyAllWindows()