In [None]:
import cv2
import numpy as np
import mediapipe as mp
from tensorflow.keras.models import load_model
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pyttsx3
import os
from playsound import playsound
import time
import uuid

# Load the trained Keras sign classifier and the label table used to decode it.
model = load_model('fullset.h5')
# class_labels[i] is the label reported when the model's argmax is i (see the
# prediction step in the main loop), so the ORDER of this list is significant:
# inserting, removing or reordering an entry shifts every later label.
# NOTE(review): 'I' appears twice in a row below - presumably one entry is the
# letter and the other the pronoun; confirm against the label order used at
# training time before changing it.
class_labels = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    'A', 'Alright', 'Animal', 'B', 'Beautiful', 'Bed', 'Bedroom', 'Bird', 'Black', 'Blind',
    'C', 'Cat', 'Chair', 'Colour', 'Cow', 'D', 'Daughter', 'Deaf', 'Dog', 'Door', 'Dream',
    'E', 'F', 'Father', 'Fish', 'Friday', 'G', 'Good Morning', 'Good night', 'Grey',
    'H', 'Happy', 'He', 'Hello', 'Horse', 'How are you', 'I','I', 'It',
    'J', 'K', 'L', 'Loud', 'M', 'Monday', 'Mother', 'Mouse',
    'N', 'O', 'Orange', 'P', 'Parent', 'Pink', 'Pleased',
    'Q', 'Quiet', 'R', 'S', 'Sad', 'Saturday', 'She', 'Son', 'Sunday',
    'T', 'Table', 'Thank you', 'Thursday', 'Today', 'Tuesday',
    'U', 'Ugly', 'V', 'W', 'Wednesday', 'White', 'Window',
    'X', 'Y', 'You', 'Z'
]

# Sentence-generation model: tokenizer and T5 weights are both read from the
# same local directory.
tokenizer = T5Tokenizer.from_pretrained('./flan-t5-customm')
sentence_model = T5ForConditionalGeneration.from_pretrained('./flan-t5-customm')

# Hand tracker shared by every frame-processing helper below. It runs in
# video mode (static_image_mode=False) and reports at most two hands.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7,
)

# Number of consecutive frames collected for one sign prediction.
SEQUENCE_LENGTH = 30


def generate_sentence(word_list):
    """Ask the T5 model to build one sentence from the recognised words.

    Args:
        word_list: Iterable of label strings; they are joined with ", " and
            appended to a fixed instruction prompt.

    Returns:
        The first generated sequence, decoded to text with special tokens
        stripped.
    """
    instruction = (
        "form a valid and grammatically correct sentence using the following words only once with proper structure and verb form: "
    )
    prompt = instruction + ", ".join(word_list)
    encoded_prompt = tokenizer.encode(prompt, return_tensors="pt")

    # Generation settings: 5 beams, at most 30 tokens, no repeated bigrams.
    generation_options = {
        "max_length": 30,
        "num_beams": 5,
        "no_repeat_ngram_size": 2,
        "early_stopping": True,
    }
    candidates = sentence_model.generate(input_ids=encoded_prompt, **generation_options)

    best_candidate = candidates[0]
    return tokenizer.decode(best_candidate, skip_special_tokens=True)


def speak(sentence):
    """Speak *sentence* aloud through a temporary WAV file.

    The text is synthesised with pyttsx3 into a uniquely named file under
    ``tts_output/``, played back synchronously with playsound, and the file
    is removed afterwards so the directory does not grow with every call.

    Args:
        sentence: Text to speak. Empty or whitespace-only text is ignored.

    Returns:
        None.
    """
    # Nothing to say: skip synthesis entirely rather than asking the player
    # to open a file that may be empty or never written.
    if not sentence or not sentence.strip():
        return

    os.makedirs("tts_output", exist_ok=True)
    # A fresh name per call avoids clashing with a file from an earlier call.
    filename = os.path.join("tts_output", f"{uuid.uuid4().hex}.wav")
    try:
        engine = pyttsx3.init()
        engine.setProperty('rate', 150)
        engine.save_to_file(sentence, filename)
        engine.runAndWait()
        playsound(filename, block=True)
    finally:
        # Best-effort cleanup: the clip is single-use, and a failure to delete
        # it (e.g. the player still holding the file) must not mask the
        # outcome of the speech itself.
        try:
            os.remove(filename)
        except OSError:
            pass

    


def extract_keypoints(frame):
    """Run the hand tracker on a BGR frame and flatten its landmarks.

    Args:
        frame: BGR image (it is converted to RGB before being handed to the
            module-level ``hands`` tracker).

    Returns:
        A ``(keypoints, results)`` tuple. ``keypoints`` is a flat list of
        x, y, z values for up to two hands; when only one hand is found it is
        followed by 63 zeros, and when no hand is found the list is 126
        zeros. ``results`` is the raw tracker output for the frame.
    """
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb_frame)
    detected = results.multi_hand_landmarks

    # No hands at all: return an all-zero feature vector.
    if not detected:
        return [0] * 126, results

    keypoints = []
    for hand in detected[:2]:
        keypoints += [
            coord
            for point in hand.landmark
            for coord in (point.x, point.y, point.z)
        ]

    # Pad the slot of the missing second hand so the length stays fixed.
    if len(detected) == 1:
        keypoints += [0] * 63
    return keypoints, results


def is_fist(results):
    """Report whether any detected hand has all four fingers curled.

    A hand counts as a fist when, for each (tip, middle-joint) landmark pair
    listed below, the tip's ``y`` is greater than the middle joint's ``y``.
    The thumb is not inspected.

    Args:
        results: Tracker output exposing ``multi_hand_landmarks`` (may be
            ``None`` or empty when no hand was found).

    Returns:
        True if at least one hand satisfies the check, otherwise False.
    """
    # (fingertip index, middle-joint index) for the four non-thumb fingers.
    finger_joints = ((8, 6), (12, 10), (16, 14), (20, 18))
    detected_hands = results.multi_hand_landmarks or ()

    return any(
        all(
            hand.landmark[tip].y > hand.landmark[mid].y
            for tip, mid in finger_joints
        )
        for hand in detected_hands
    )


# === Interactive driver ===
# Each pass of the outer loop is one "session":
#   1. wait for the 's' key,
#   2. repeatedly: count down, record SEQUENCE_LENGTH frames, classify them,
#      then watch up to 30 frames for a fist that ends the session,
#   3. turn the collected labels into a sentence and speak it.
# Pressing 'q' at any prompt, or a failed camera read, sets exit_flag and
# leaves the program without generating a sentence.
#
# The prompts say "press 's'" because the key is the only start trigger that
# is actually handled; the fist gesture is only used to stop.
print("Press 's' to START, then show a FIST to STOP (press 'q' to quit).")
exit_flag = False

while not exit_flag:
    sentence = []
    cap = cv2.VideoCapture(0)

    # try/finally guarantees the camera and the preview window are released
    # even if prediction or frame handling raises.
    try:
        if not cap.isOpened():
            print("Could not open camera 0.")
            exit_flag = True
            break

        # === WAIT FOR START KEY ===
        while True:
            ret, frame = cap.read()
            if not ret:
                exit_flag = True
                break

            cv2.putText(frame, "Press 's' to START", (10, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 0), 2)
            cv2.imshow('Sign Prediction', frame)

            key = cv2.waitKey(10) & 0xFF
            if key == ord('s'):
                print("Start detected via 's' key!")
                time.sleep(1)
                break
            elif key == ord('q'):
                exit_flag = True
                break

        if exit_flag:
            break

        # === CAPTURE LOOP ===
        while not exit_flag:
            # Countdown: each frame stays on screen for one second.
            for count in [3, 2, 1]:
                ret, frame = cap.read()
                if not ret:
                    exit_flag = True
                    break
                cv2.putText(frame, f'Starting in {count}', (100, 100),
                            cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 4)
                cv2.imshow('Sign Prediction', frame)
                if cv2.waitKey(1000) & 0xFF == ord('q'):
                    exit_flag = True
                    break

            if exit_flag:
                break

            # Sequence capture
            sequence = []
            while len(sequence) < SEQUENCE_LENGTH and not exit_flag:
                ret, frame = cap.read()
                if not ret:
                    exit_flag = True
                    break
                keypoints, _ = extract_keypoints(frame)
                sequence.append(keypoints)
                cv2.putText(frame, f'Capturing {len(sequence)}/{SEQUENCE_LENGTH}', (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 0), 2)
                cv2.imshow('Sign Prediction', frame)
                if cv2.waitKey(50) & 0xFF == ord('q'):
                    exit_flag = True
                    break

            if exit_flag:
                break

            # Prediction
            if len(sequence) == SEQUENCE_LENGTH:
                # Append a normalised time index (0..1) as one extra feature
                # column per frame before feeding the model.
                time_indices = np.linspace(0, 1, SEQUENCE_LENGTH).reshape(SEQUENCE_LENGTH, 1)
                sequence_with_time = np.concatenate([sequence, time_indices], axis=1)
                input_seq = np.expand_dims(sequence_with_time, axis=0)
                prediction = model.predict(input_seq)[0]
                predicted_class = class_labels[np.argmax(prediction)]
                sentence.append(predicted_class)

                # Show the label on the last captured frame for two seconds.
                cv2.putText(frame, f'{predicted_class}', (10, 60),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.1, (0, 255, 0), 2)
                cv2.imshow('Sign Prediction', frame)
                cv2.waitKey(2000)
            else:
                print("Sequence interrupted or incomplete. Skipping prediction.")

            # Check STOP FIST: watch up to 30 frames before the next sign.
            stop = False
            for _ in range(30):
                ret, frame = cap.read()
                if not ret:
                    exit_flag = True
                    break
                _, results = extract_keypoints(frame)
                if is_fist(results):
                    print("Stop detected!")
                    stop = True
                    break
                cv2.putText(frame, "Show FIST to STOP", (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 255), 2)
                cv2.imshow('Sign Prediction', frame)
                if cv2.waitKey(30) & 0xFF == ord('q'):
                    exit_flag = True
                    break

            if stop or exit_flag:
                break
    finally:
        # === AFTER STOP (or on any error) ===
        cap.release()
        cv2.destroyAllWindows()

    if exit_flag:
        break

    if sentence:
        full_sentence = generate_sentence(sentence)
        print("\nGenerated Sentence:", full_sentence)
        speak(full_sentence)
    else:
        print("\nNo signs were detected.")

# === Final Cleanup ===
cv2.destroyAllWindows()
print("Exited cleanly.")


Show LEFT FIST to START, then again to STOP.
Start detected via 's' key!
Exited cleanly.
