In [5]:
pip install gtts playsound


Note: you may need to restart the kernel to use updated packages.


In [6]:
import cv2
from keras.models import load_model
import numpy as np
from gtts import gTTS 
from playsound import playsound
import os

# Load the trained model
model = load_model("STS.h5")

# Define the labels dictionary
labels_dict = {i: chr(65+i) for i in range(26)}
labels_dict[0] = ' '

# Define color for the rectangle around the sign
color_dict = (0, 255, 0)

# Define image size and threshold value
img_size = 128
minValue = 70

# Initialize video capture
source = cv2.VideoCapture(0)

# Initialize variables for sign detection
count = 0
word = ""
prev = " "
debounce_threshold = 5
debounce_counter = 0
current_label = None

# Function to preprocess the image
def preprocess_image(crop_img):
    blur = cv2.GaussianBlur(crop_img, (5, 5), 2)
    th3 = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)
    ret, res = cv2.threshold(th3, minValue, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    resized = cv2.resize(res, (img_size, img_size))
    normalized = resized / 255.0
    reshaped = np.reshape(normalized, (1, img_size, img_size, 1))
    return reshaped

# Main loop for sign detection
while(True):
    ret, img = source.read()
    if not ret:
        break

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    cv2.rectangle(img, (24, 24), (250, 250), color_dict, 2)
    crop_img = gray[24:250, 24:250]

    # Preprocess the image
    reshaped = preprocess_image(crop_img)

    # Predict the sign every 20 frames to reduce computational load
    if count % 20 == 0:
        result = model.predict(reshaped)
        label = np.argmax(result, axis=1)[0]

        # Debounce mechanism to avoid flickering between signs
        if label == current_label:
            debounce_counter += 1
            if debounce_counter >= debounce_threshold:
                prev = labels_dict[label]
                if label != 0:  # Avoid adding space repeatedly
                    word += prev
        else:
            debounce_counter = 0
        current_label = label

    # Display the current sign and the accumulated word
    cv2.putText(img, prev, (24, 14), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2) 
    cv2.putText(img, word, (275, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (200, 200, 200), 2)
    cv2.imshow('LIVE', img)
    cv2.imshow("Gray", crop_img)  # Display the grayscale image
    key = cv2.waitKey(1)

    if key == 27:  # Press Esc to exit
        break

    count += 1

# Speech synthesis after finishing showing signs
language = 'en'
myobj = gTTS(text=word, lang=language, slow=False)
speech_file = "output_speech.mp3"
myobj.save(speech_file)
playsound(speech_file)

# Cleanup
cv2.destroyAllWindows()
source.release()
os.remove(speech_file)  # Remove the speech file after playing

# Print the final word
print(f"The final word is: {word}")


  trackable.load_own_variables(weights_store.get(inner_path))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3