Real-time Hand Gesture Recognition with Speech and Text Feedback using a Custom Model

In [2]:
#Import necessary libraries

import cv2
import numpy as np
import tensorflow as tf
from gtts import gTTS
import pygame

# Real-time gesture recognition loop: reads webcam frames, classifies the
# thresholded ROI with the loaded Keras model, and reports every change of the
# predicted label as speech, as a line in a text file and on stdout.

# Load the pre-trained model
model = tf.keras.models.load_model('/Users/sujaykaushik/Documents/Msc DataScience/Project Dissertation/Extra  files/OwnModel_15Oct_A_Z_1_9.h5')

# Define the coordinates of the region of interest (ROI) box
roi_x, roi_y, roi_width, roi_height = 100, 100, 250, 250

# Labels indexed by the model's argmax output. Built once here instead of on
# every frame.
# NOTE(review): assumes the model's output units follow this exact order --
# confirm against the label encoding used at training time.
gestures = ['0','1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

# Initialize previous gesture label
prev_gesture_label = None

# Initialize the webcam
cap = cv2.VideoCapture(0)

# Everything after acquiring the webcam runs under try/finally so the camera
# and the OpenCV windows are released even when the loop is interrupted
# (e.g. KeyboardInterrupt) or a frame fails to process.
try:
    # Initialize pygame for audio playback
    pygame.mixer.init()

    # Create a separate window for displaying the hand region
    cv2.namedWindow('Hand Region', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('Hand Region', roi_width, roi_height)

    # The context manager flushes and closes the output file on every exit path.
    with open("recognized_output_2.txt", "w") as text_file:
        while True:
            # Capture a frame from the webcam
            ret, frame = cap.read()

            # Stop when no frame could be read
            if not ret:
                break

            # Convert the frame to grayscale
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # Apply binary thresholding to convert to black and white
            _, bw_image = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY)

            # Invert so that pixels at or below the threshold become white
            inverted_bw_image = cv2.bitwise_not(bw_image)

            # Extract the hand region within the ROI
            hand_roi = inverted_bw_image[roi_y:roi_y + roi_height, roi_x:roi_x + roi_width]

            # Convert hand_roi to a 3-channel image so it can be pasted into
            # an image shaped like the frame
            hand_roi_colored = cv2.cvtColor(hand_roi, cv2.COLOR_GRAY2BGR)

            # Create an empty black image for the bounding box part
            bounding_box_image = np.zeros_like(frame)

            # Overlay the hand_roi on the bounding_box_image
            bounding_box_image[roi_y:roi_y + roi_height, roi_x:roi_x + roi_width] = hand_roi_colored

            # Display the bounding box part as binary black and white
            cv2.imshow('Bounding Box Binary Image', bounding_box_image)

            # Display the hand region in a separate window
            cv2.imshow('Hand Region', hand_roi_colored)

            # Preprocess the hand_roi image for prediction
            resized_hand_roi = cv2.resize(hand_roi, (32, 32))  # Resize to (32, 32)
            rgb_hand_roi = cv2.cvtColor(resized_hand_roi, cv2.COLOR_GRAY2RGB)  # Convert to RGB

            # Normalize the RGB values to the range [0, 1]
            normalized_hand_roi = rgb_hand_roi / 255.0

            # Perform gesture prediction on a batch of one image.
            # verbose=0 keeps Keras from printing progress output per frame.
            predictions = model.predict(np.expand_dims(normalized_hand_roi, axis=0), verbose=0)
            predicted_gesture_index = np.argmax(predictions)

            # Get the predicted gesture label
            predicted_gesture_label = gestures[predicted_gesture_index]

            # Only report when the label differs from the previous frame's
            if predicted_gesture_label != prev_gesture_label:
                # Convert predicted gesture label to speech
                tts = gTTS(text=predicted_gesture_label, lang='en')
                tts.save('predicted_audio.mp3')

                # Play the saved audio file using pygame
                pygame.mixer.music.load('predicted_audio.mp3')
                pygame.mixer.music.play()

                # Write the predicted gesture to the text file
                text_file.write(predicted_gesture_label + '\n')
                print(f"Recognized Gesture: {predicted_gesture_label}")

            # Update the previous gesture label
            prev_gesture_label = predicted_gesture_label

            # Draw the ROI box on the frame
            cv2.rectangle(frame, (roi_x, roi_y), (roi_x + roi_width, roi_y + roi_height), (0, 255, 0), 2)

            # Display the predicted gesture on the frame
            cv2.putText(frame, predicted_gesture_label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Display the frame with ROI and predicted gesture
            cv2.imshow('Gesture Prediction', frame)

            # Press 'q' to quit the capture loop
            key = cv2.waitKey(1)
            if key == ord('q'):
                break
finally:
    # Release the webcam and close the windows
    cap.release()
    cv2.destroyAllWindows()








Recognized Gesture: J
Recognized Gesture: V
Recognized Gesture: J
Recognized Gesture: 2
Recognized Gesture: V
Recognized Gesture: 8
Recognized Gesture: V
Recognized Gesture: 4
Recognized Gesture: K
Recognized Gesture: 5
Recognized Gesture: 3
Recognized Gesture: K
Recognized Gesture: 3
Recognized Gesture: 5
Recognized Gesture: 2
Recognized Gesture: 5
Recognized Gesture: 4
Recognized Gesture: 5
Recognized Gesture: 4
Recognized Gesture: 3
Recognized Gesture: 4
Recognized Gesture: 3
Recognized Gesture: 4
Recognized Gesture: U
Recognized Gesture: 1
Recognized Gesture: 6
Recognized Gesture: 1
Recognized Gesture: 4
Recognized Gesture: U
Recognized Gesture: 4
Recognized Gesture: 3
Recognized Gesture: 4
Recognized Gesture: 3
Recognized Gesture: 6
Recognized Gesture: L
Recognized Gesture: R
Recognized Gesture: U
Recognized Gesture: 6
Recognized Gesture: R
Recognized Gesture: 6
Recognized Gesture: U
Recognized Gesture: 6
Recognized Gesture: Z
Recognized Gesture: W
Recognized Gesture: H
Recognized

Real-time Hand Gesture Recognition with Speech and Text Feedback using the VGG-16 Model

In [3]:
#Import necessary libraries

import cv2
import numpy as np
import tensorflow as tf
from gtts import gTTS
import pygame

# Real-time gesture recognition loop: reads webcam frames, classifies the
# thresholded ROI with the loaded Keras model, and reports every change of the
# predicted label as speech, as a line in a text file and on stdout.

# Load the pre-trained model
model = tf.keras.models.load_model('/Users/sujaykaushik/Documents/Msc DataScience/Project Dissertation/Extra  files/VGG16_Dec17.h5')

# Define the coordinates of the region of interest (ROI) box
roi_x, roi_y, roi_width, roi_height = 100, 100, 250, 250

# Labels indexed by the model's argmax output. Built once here instead of on
# every frame.
# NOTE(review): this list has 35 entries and no '0'. If the model has an
# output unit for '0', every label is shifted by one and the last class index
# raises IndexError -- confirm against the label encoding used at training time.
gestures = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

# Initialize previous gesture label
prev_gesture_label = None

# Initialize the webcam
cap = cv2.VideoCapture(0)

# Everything after acquiring the webcam runs under try/finally so the camera
# and the OpenCV windows are released even when the loop is interrupted
# (e.g. KeyboardInterrupt) or a frame fails to process.
try:
    # Initialize pygame for audio playback
    pygame.mixer.init()

    # Create a separate window for displaying the hand region
    cv2.namedWindow('Hand Region', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('Hand Region', roi_width, roi_height)

    # The context manager flushes and closes the output file on every exit path.
    with open("recognized_output_2.txt", "w") as text_file:
        while True:
            # Capture a frame from the webcam
            ret, frame = cap.read()

            # Stop when no frame could be read
            if not ret:
                break

            # Convert the frame to grayscale
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # Apply binary thresholding to convert to black and white
            _, bw_image = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY)

            # Invert so that pixels at or below the threshold become white
            inverted_bw_image = cv2.bitwise_not(bw_image)

            # Extract the hand region within the ROI
            hand_roi = inverted_bw_image[roi_y:roi_y + roi_height, roi_x:roi_x + roi_width]

            # Convert hand_roi to a 3-channel image so it can be pasted into
            # an image shaped like the frame
            hand_roi_colored = cv2.cvtColor(hand_roi, cv2.COLOR_GRAY2BGR)

            # Create an empty black image for the bounding box part
            bounding_box_image = np.zeros_like(frame)

            # Overlay the hand_roi on the bounding_box_image
            bounding_box_image[roi_y:roi_y + roi_height, roi_x:roi_x + roi_width] = hand_roi_colored

            # Display the bounding box part as binary black and white
            cv2.imshow('Bounding Box Binary Image', bounding_box_image)

            # Display the hand region in a separate window
            cv2.imshow('Hand Region', hand_roi_colored)

            # Preprocess the hand_roi image for prediction
            resized_hand_roi = cv2.resize(hand_roi, (32, 32))  # Resize to (32, 32)
            rgb_hand_roi = cv2.cvtColor(resized_hand_roi, cv2.COLOR_GRAY2RGB)  # Convert to RGB

            # Normalize the RGB values to the range [0, 1]
            normalized_hand_roi = rgb_hand_roi / 255.0

            # Perform gesture prediction on a batch of one image.
            # verbose=0 keeps Keras from printing progress output per frame.
            predictions = model.predict(np.expand_dims(normalized_hand_roi, axis=0), verbose=0)
            predicted_gesture_index = np.argmax(predictions)

            # Get the predicted gesture label
            predicted_gesture_label = gestures[predicted_gesture_index]

            # Only report when the label differs from the previous frame's
            if predicted_gesture_label != prev_gesture_label:
                # Convert predicted gesture label to speech
                tts = gTTS(text=predicted_gesture_label, lang='en')
                tts.save('predicted_audio.mp3')

                # Play the saved audio file using pygame
                pygame.mixer.music.load('predicted_audio.mp3')
                pygame.mixer.music.play()

                # Write the predicted gesture to the text file
                text_file.write(predicted_gesture_label + '\n')
                print(f"Recognized Gesture: {predicted_gesture_label}")

            # Update the previous gesture label
            prev_gesture_label = predicted_gesture_label

            # Draw the ROI box on the frame
            cv2.rectangle(frame, (roi_x, roi_y), (roi_x + roi_width, roi_y + roi_height), (0, 255, 0), 2)

            # Display the predicted gesture on the frame
            cv2.putText(frame, predicted_gesture_label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Display the frame with ROI and predicted gesture
            cv2.imshow('Gesture Prediction', frame)

            # Press 'q' to quit the capture loop
            key = cv2.waitKey(1)
            if key == ord('q'):
                break
finally:
    # Release the webcam and close the windows
    cap.release()
    cv2.destroyAllWindows()








Recognized Gesture: V
Recognized Gesture: C
Recognized Gesture: V
Recognized Gesture: J
Recognized Gesture: V
Recognized Gesture: 1
Recognized Gesture: V
Recognized Gesture: 2
Recognized Gesture: V


In [2]:
#Import necessary libraries

import cv2
import numpy as np
import tensorflow as tf
from gtts import gTTS
import pygame

# Real-time gesture recognition loop: reads webcam frames, classifies the
# thresholded ROI with the loaded Keras model, and reports every change of the
# predicted label as speech, as a line in a text file and on stdout.

# Load the pre-trained model
model = tf.keras.models.load_model('/Users/sujaykaushik/Documents/Msc DataScience/Project Dissertation/All Files (Code & Dataset)/MyModel/Model.h5')

# Define the coordinates of the region of interest (ROI) box
roi_x, roi_y, roi_width, roi_height = 100, 100, 250, 250

# Labels indexed by the model's argmax output. Built once here instead of on
# every frame.
# NOTE(review): assumes the model's output units follow this exact order --
# confirm against the label encoding used at training time.
gestures = ['0','1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

# Initialize previous gesture label
prev_gesture_label = None

# Initialize the webcam (device index 1)
cap = cv2.VideoCapture(1)

# Everything after acquiring the webcam runs under try/finally so the camera
# and the OpenCV windows are released even when the loop is interrupted
# (e.g. KeyboardInterrupt) or a frame fails to process.
try:
    # Initialize pygame for audio playback
    pygame.mixer.init()

    # Create a separate window for displaying the hand region
    cv2.namedWindow('Hand Region', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('Hand Region', roi_width, roi_height)

    # The context manager flushes and closes the output file on every exit path.
    with open("recognized_output_2.txt", "w") as text_file:
        while True:
            # Capture a frame from the webcam
            ret, frame = cap.read()

            # Stop when no frame could be read
            if not ret:
                break

            # Convert the frame to grayscale
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # Apply binary thresholding to convert to black and white
            _, bw_image = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY)

            # Invert so that pixels at or below the threshold become white
            inverted_bw_image = cv2.bitwise_not(bw_image)

            # Extract the hand region within the ROI
            hand_roi = inverted_bw_image[roi_y:roi_y + roi_height, roi_x:roi_x + roi_width]

            # Convert hand_roi to a 3-channel image so it can be pasted into
            # an image shaped like the frame
            hand_roi_colored = cv2.cvtColor(hand_roi, cv2.COLOR_GRAY2BGR)

            # Create an empty black image for the bounding box part
            bounding_box_image = np.zeros_like(frame)

            # Overlay the hand_roi on the bounding_box_image
            bounding_box_image[roi_y:roi_y + roi_height, roi_x:roi_x + roi_width] = hand_roi_colored

            # Display the bounding box part as binary black and white
            cv2.imshow('Bounding Box Binary Image', bounding_box_image)

            # Display the hand region in a separate window
            cv2.imshow('Hand Region', hand_roi_colored)

            # Preprocess the hand_roi image for prediction
            resized_hand_roi = cv2.resize(hand_roi, (32,32))  # Resize to (32, 32)
            rgb_hand_roi = cv2.cvtColor(resized_hand_roi, cv2.COLOR_GRAY2RGB)  # Convert to RGB

            # Normalize the RGB values to the range [0, 1]
            normalized_hand_roi = rgb_hand_roi / 255.0

            # Perform gesture prediction on a batch of one image.
            # verbose=0 keeps Keras from printing progress output per frame.
            predictions = model.predict(np.expand_dims(normalized_hand_roi, axis=0), verbose=0)
            predicted_gesture_index = np.argmax(predictions)

            # Get the predicted gesture label
            predicted_gesture_label = gestures[predicted_gesture_index]

            # Only report when the label differs from the previous frame's
            if predicted_gesture_label != prev_gesture_label:
                # Convert predicted gesture label to speech
                tts = gTTS(text=predicted_gesture_label, lang='en')
                tts.save('predicted_audio.mp3')

                # Play the saved audio file using pygame
                pygame.mixer.music.load('predicted_audio.mp3')
                pygame.mixer.music.play()

                # Write the predicted gesture to the text file
                text_file.write(predicted_gesture_label + '\n')
                print(f"Recognized Gesture: {predicted_gesture_label}")

            # Update the previous gesture label
            prev_gesture_label = predicted_gesture_label

            # Draw the ROI box on the frame
            cv2.rectangle(frame, (roi_x, roi_y), (roi_x + roi_width, roi_y + roi_height), (0, 255, 0), 2)

            # Display the predicted gesture on the frame
            cv2.putText(frame, predicted_gesture_label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Display the frame with ROI and predicted gesture
            cv2.imshow('Gesture Prediction', frame)

            # Press 'q' to quit the capture loop
            key = cv2.waitKey(1)
            if key == ord('q'):
                break
finally:
    # Release the webcam and close the windows
    cap.release()
    cv2.destroyAllWindows()


pygame 2.5.0 (SDL 2.28.0, Python 3.11.1)
Hello from the pygame community. https://www.pygame.org/contribute.html




Recognized Gesture: L




Recognized Gesture: V
Recognized Gesture: L
Recognized Gesture: 1
Recognized Gesture: V
Recognized Gesture: L
Recognized Gesture: V
Recognized Gesture: 1
Recognized Gesture: L
Recognized Gesture: V
Recognized Gesture: 1
Recognized Gesture: L
Recognized Gesture: V
Recognized Gesture: 2
Recognized Gesture: 3
Recognized Gesture: V
Recognized Gesture: 3
Recognized Gesture: 8
Recognized Gesture: V
Recognized Gesture: 2
Recognized Gesture: 3
Recognized Gesture: V
Recognized Gesture: 2
Recognized Gesture: V
Recognized Gesture: 3
Recognized Gesture: V
Recognized Gesture: 3
Recognized Gesture: 2
Recognized Gesture: 3
Recognized Gesture: 2
Recognized Gesture: 3
Recognized Gesture: V
Recognized Gesture: 3
Recognized Gesture: 5
Recognized Gesture: 8
Recognized Gesture: 3
Recognized Gesture: 8
Recognized Gesture: 2
Recognized Gesture: V
Recognized Gesture: 2
Recognized Gesture: V
Recognized Gesture: 2
Recognized Gesture: V
Recognized Gesture: 2
Recognized Gesture: V
Recognized Gesture: 2
Recognized

KeyboardInterrupt: 

: 