In [4]:
import cv2
import numpy as np
import threading
from gtts import gTTS
import pygame
import os
import time
import uuid
from tensorflow.keras.models import load_model
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
import tkinter as tk
from PIL import Image, ImageTk

In [5]:
# Label tables used to translate a model's argmax index into a readable name.
# NOTE(review): the order presumably has to match the class order each model
# was trained with -- confirm against the training notebooks.
GESTURE_CLASSES = "Dislike Fist Hello Like Peace Stop".split()

# The 26 letters followed by the three control classes.
ASL_CLASSES = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + ["del", "nothing", "space"]

In [6]:
# Saved Keras models, resolved relative to the notebook's working directory.
_GESTURE_MODEL_PATH = "mobilenetv2_gesture_model.keras"
_ASL_MODEL_PATH = "asl_mobilenet_model.keras"

# Both models are loaded with compile=False (they are only used for
# prediction in this notebook). The gesture model is loaded first.
gesture_model, asl_model = (
    load_model(model_path, compile=False)
    for model_path in (_GESTURE_MODEL_PATH, _ASL_MODEL_PATH)
)

In [7]:
# --- Speech throttling state (updated by show_frame) ---
# Text of the most recent announcement and the time.time() stamp it was made.
last_spoken, last_spoken_time = "", 0
# NOTE(review): speak_cooldown is not read by any code in the visible cells;
# show_frame uses its own repeat delay -- confirm which value is intended.
speak_cooldown = 2

# --- Prediction smoothing ---
# Number of most-recent frames kept in prediction_buffer.
buffer_size = 7
# Rolling list of (asl_label, gesture_label, asl_conf, gesture_conf) tuples.
prediction_buffer = list()



In [8]:
# Serialises access to pygame.mixer.music, which is a single shared playback
# channel: without it, overlapping speak_now() calls from different threads
# would load over each other's audio.
_speech_lock = threading.Lock()


def speak_now(text):
    """Synthesise ``text`` with gTTS and play it, blocking until playback ends.

    The audio is written to a uniquely named temporary MP3 in the current
    working directory, played through ``pygame.mixer.music`` and then
    deleted. The file is removed even when synthesis or playback fails.

    Args:
        text: The phrase to speak (English).

    Errors are reported on stdout with a ``[TTS]`` prefix and never raised,
    so a failed announcement cannot take down the calling thread.
    """
    filename = f"speech_{uuid.uuid4().hex}.mp3"
    try:
        tts = gTTS(text=text, lang='en')
        tts.save(filename)

        # Only one announcement may use the mixer at a time.
        with _speech_lock:
            pygame.mixer.init()
            try:
                pygame.mixer.music.load(filename)
                pygame.mixer.music.play()

                # Poll until the clip has finished playing.
                while pygame.mixer.music.get_busy():
                    time.sleep(0.1)
            finally:
                # Release the file handle so the MP3 can be deleted below.
                pygame.mixer.music.unload()

    except Exception as e:
        print("[TTS] Error:", e)

    finally:
        try:
            os.remove(filename)
        except FileNotFoundError:
            pass  # synthesis failed before the file was written
        except OSError as e:
            print("[TTS] Error:", e)


In [9]:
def start_camera():
    """Open the default camera (index 0) and start the frame loop.

    If a capture session is already running the call is ignored, so pressing
    the start button twice does not open a second capture or start a second
    ``show_frame`` loop; call ``stop_camera`` first to restart. Any capture
    object left over from an earlier session is released before a new one is
    opened. If the camera cannot be opened, an error is printed and the frame
    loop is not started.
    """
    global cap, running

    # globals().get() is used because neither name exists before the first start.
    if globals().get("running"):
        return

    previous = globals().get("cap")
    if previous is not None:
        previous.release()

    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("[Camera] Error: could not open camera 0")
        cap.release()
        running = False
        return

    running = True
    show_frame()


In [10]:
def stop_camera():
    """Stop the frame loop and release the camera, if one was opened.

    Safe to call at any time, including before ``start_camera`` has ever run
    (for example when the window is closed without starting the camera) and
    repeatedly. After the call ``running`` is False and ``cap`` is None.
    """
    global cap, running
    running = False

    # `cap` is only created by start_camera, so it may not exist yet.
    capture = globals().get("cap")
    if capture is not None:
        capture.release()
    cap = None


In [11]:
# Corners (x, y) of the square region of interest that is fed to the models.
_ROI_TOP_LEFT = (100, 100)
_ROI_BOTTOM_RIGHT = (400, 400)
# Side length the ROI is resized to before prediction.
_MODEL_INPUT_SIZE = (224, 224)
# A single-frame prediction must exceed this confidence to be shown as a label.
_FRAME_CONF_MIN = 0.7
# The buffer-averaged confidence must exceed this before a label is spoken.
_SPEAK_CONF_MIN = 0.75
# Seconds after which the same phrase may be spoken again.
# NOTE(review): the module-level `speak_cooldown` (2) is never used; the
# original code hard-coded 5 here, which is kept -- confirm which is intended.
_REPEAT_AFTER_SECONDS = 5
# Gesture -> ASL letter pairs that are announced together when both models agree.
_SIMILAR_PAIRS = {
    "Peace": "V",
    "Fist": "S"
}
# Delay in milliseconds before the next frame is processed.
_FRAME_DELAY_MS = 10


def _select_announcement(history, window):
    """Pick the phrase to speak from the buffered predictions, or None.

    Args:
        history: Non-empty list of (asl_label, gesture_label, asl_conf,
            gesture_conf) tuples, oldest first.
        window: Number of entries that must all carry the latest label for
            that label to count as stable.

    Returns:
        ``"<gesture> and letter <asl>"`` when both labels are stable, confident
        and form a known similar pair; otherwise the stable, confident gesture;
        otherwise the stable, confident ASL label; otherwise None.
    """
    asl, gesture = history[-1][0], history[-1][1]
    asl_labels = [entry[0] for entry in history]
    gesture_labels = [entry[1] for entry in history]

    # A label is stable when it fills the whole window and is a real class.
    stable_asl = asl_labels.count(asl) == window and asl not in ('nothing', 'None')
    stable_gesture = gesture_labels.count(gesture) == window and gesture != "None"

    avg_asl_conf = sum(entry[2] for entry in history) / len(history)
    avg_gesture_conf = sum(entry[3] for entry in history) / len(history)

    asl_ok = stable_asl and avg_asl_conf > _SPEAK_CONF_MIN
    gesture_ok = stable_gesture and avg_gesture_conf > _SPEAK_CONF_MIN

    if asl_ok and gesture_ok and _SIMILAR_PAIRS.get(gesture) == asl:
        return f"{gesture} and letter {asl}"
    if gesture_ok:
        return gesture
    if asl_ok:
        return asl
    return None


def show_frame():
    """Process one camera frame and schedule the next call.

    Reads a frame from the global ``cap``, classifies the region of interest
    with both models, updates the on-screen labels, appends the result to
    ``prediction_buffer`` (trimmed to ``buffer_size``), speaks a stable
    prediction on a daemon thread, draws the frame into ``video_label`` and
    re-schedules itself through ``video_label.after``.

    Does nothing when ``running`` is False. If the camera fails to deliver a
    frame, the capture is released and ``running`` is cleared so the camera
    can be started again, instead of leaving a dead loop with the device open.
    """
    global last_spoken, last_spoken_time, running

    if not running:
        return

    ret, frame = cap.read()
    if not ret:
        print("[Camera] Error: failed to read a frame; stopping capture")
        running = False
        cap.release()
        return

    # Mirror the image so it behaves like a mirror for the user.
    frame = cv2.flip(frame, 1)
    (x0, y0), (x1, y1) = _ROI_TOP_LEFT, _ROI_BOTTOM_RIGHT
    roi = frame[y0:y1, x0:x1]
    # NOTE(review): `roi` is a slice (view) of `frame` and the rectangle is
    # drawn on `frame` before the ROI is resized, so rectangle edge pixels may
    # end up in the model input. Kept as-is; confirm against the training data.
    cv2.rectangle(frame, _ROI_TOP_LEFT, _ROI_BOTTOM_RIGHT, (255, 0, 0), 2)

    roi_resized = cv2.resize(roi, _MODEL_INPUT_SIZE)
    roi_input = preprocess_input(roi_resized.astype('float32'))
    roi_input = np.expand_dims(roi_input, axis=0)  # add the batch axis

    gesture_pred = gesture_model.predict(roi_input, verbose=0)[0]
    asl_pred = asl_model.predict(roi_input, verbose=0)[0]

    gesture_idx = int(np.argmax(gesture_pred))
    asl_idx = int(np.argmax(asl_pred))
    gesture_conf = gesture_pred[gesture_idx]
    asl_conf = asl_pred[asl_idx]

    # Low-confidence frames are reported as the string "None".
    final_gesture = GESTURE_CLASSES[gesture_idx] if gesture_conf > _FRAME_CONF_MIN else "None"
    final_asl = ASL_CLASSES[asl_idx] if asl_conf > _FRAME_CONF_MIN else "None"

    gesture_var.set(f"Gesture: {final_gesture}")
    asl_var.set(f"ASL: {final_asl}")

    # Keep only the most recent `buffer_size` predictions.
    prediction_buffer.append((final_asl, final_gesture, asl_conf, gesture_conf))
    if len(prediction_buffer) > buffer_size:
        prediction_buffer.pop(0)

    current_time = time.time()
    predicted_text = _select_announcement(prediction_buffer, buffer_size)

    # Speak a new phrase immediately; repeat the same phrase only after a delay.
    if predicted_text and (
        predicted_text != last_spoken
        or (current_time - last_spoken_time) > _REPEAT_AFTER_SECONDS
    ):
        last_spoken = predicted_text
        last_spoken_time = current_time
        threading.Thread(target=speak_now, args=(predicted_text,), daemon=True).start()

    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    imgtk = ImageTk.PhotoImage(image=img)
    # Keep a reference on the widget so the image is not garbage-collected.
    video_label.imgtk = imgtk
    video_label.configure(image=imgtk)

    video_label.after(_FRAME_DELAY_MS, show_frame)
    

        


In [12]:

# Main application window.
root = tk.Tk()
root.title("ASL + Gesture Recognition")

# Label that displays the live camera image (filled in by show_frame).
video_label = tk.Label(root)
video_label.pack()

# Text variables behind the three status read-outs.
gesture_var = tk.StringVar()
asl_var = tk.StringVar()
confidence_var = tk.StringVar()

# Stack the read-outs under the video: gesture, ASL letter, then the smaller
# averaged-confidence line.
for _text_var, _point_size in ((gesture_var, 14), (asl_var, 14), (confidence_var, 10)):
    tk.Label(root, textvariable=_text_var, font=("Helvetica", _point_size)).pack()





In [13]:
def update_confidence():
    """Refresh the averaged-confidence read-out and re-arm the 500 ms timer.

    When ``prediction_buffer`` holds entries, the mean ASL and gesture
    confidences over the buffer are written to ``confidence_var``. The
    function always re-schedules itself through ``root.after``.
    """
    if prediction_buffer:
        sample_count = len(prediction_buffer)
        # Tuple slots 2 and 3 hold the ASL and gesture confidences.
        mean_asl = sum(entry[2] for entry in prediction_buffer) / sample_count
        mean_gesture = sum(entry[3] for entry in prediction_buffer) / sample_count
        confidence_var.set(f"ASL Conf: {mean_asl:.2f} | Gesture Conf: {mean_gesture:.2f}")
    root.after(500, update_confidence)

# Kick off the periodic confidence read-out; it keeps re-scheduling itself.
update_confidence()


# Row of control buttons under the status labels.
button_frame = tk.Frame(root)
button_frame.pack(pady=10)

tk.Button(button_frame, text="Start Camera", command=start_camera).grid(row=0, column=0, padx=5)
tk.Button(button_frame, text="Stop Camera", command=stop_camera).grid(row=0, column=1, padx=5)


def _close_window():
    """Stop the camera, then destroy the main window.

    The window is destroyed even if stop_camera() raises; with the previous
    lambda an exception from stop_camera() skipped root.destroy() and left a
    window that could not be closed.
    """
    try:
        stop_camera()
    finally:
        root.destroy()


root.protocol("WM_DELETE_WINDOW", _close_window)
# Blocks until the window is closed.
root.mainloop()