#CNN-based Sign to text/speech conversion

In [None]:
# Step 1: Install Necessary Libraries
!pip install mediapipe tensorflow gtts opencv-python matplotlib numpy

# Step 2: Import Libraries
import os
import cv2
import mediapipe as mp
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from gtts import gTTS
from IPython.display import Audio, display
import matplotlib.pyplot as plt
from google.colab import files

# Step 3: Initialize MediaPipe
# Shared MediaPipe handles: the drawing helpers, the hands solution module,
# and one detector instance that the functions below reuse.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=True,
    max_num_hands=1,
    min_detection_confidence=0.5,
)

# Step 4: Define Functions for Keypoint Extraction
def extract_keypoints(image):
    """Return flattened MediaPipe hand-landmark coordinates for a BGR image.

    Args:
        image: BGR image array such as the result of ``cv2.imread``, or
            ``None`` (what ``cv2.imread`` yields for an unreadable file).

    Returns:
        A flat list ``[x0, y0, z0, x1, y1, z1, ...]`` holding three values
        per landmark of every detected hand, or ``None`` when ``image`` is
        ``None`` or no hand is detected.
    """
    # cv2.imread reports failure by returning None; cvtColor would raise on it.
    if image is None:
        return None

    # The detector is fed RGB, while OpenCV images are BGR.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    result = hands.process(image_rgb)
    if not result.multi_hand_landmarks:
        return None

    keypoints = []
    for hand_landmarks in result.multi_hand_landmarks:
        for lm in hand_landmarks.landmark:
            keypoints.extend((lm.x, lm.y, lm.z))
    return keypoints

# Step 5: Load Dataset and Extract Features
# Expected layout: DATASET_PATH/<label>/<image files>; each sub-directory name
# is used as the class label for the images inside it.
DATASET_PATH = "/content/sign_language_dataset1"  # Update as needed
data = []
labels = []

# Sort the directory listings so the sample order (and therefore the seeded
# train/test split further down) is reproducible across runs and file systems.
for label in sorted(os.listdir(DATASET_PATH)):
    label_path = os.path.join(DATASET_PATH, label)
    if not os.path.isdir(label_path):
        continue
    for image_file in sorted(os.listdir(label_path)):
        image_path = os.path.join(label_path, image_file)
        image = cv2.imread(image_path)
        if image is None:
            # Not a readable image (e.g. a stray non-image file). cv2.imread
            # returns None instead of raising, so skip the entry here rather
            # than crash inside the colour conversion.
            continue
        keypoints = extract_keypoints(image)
        # Images in which no hand was detected contribute no sample.
        if keypoints:
            data.append(keypoints)
            labels.append(label)

# Fail early with a clear message instead of an obscure shape/split error later.
if not data:
    raise ValueError(
        f"No hand keypoints could be extracted from any image under {DATASET_PATH!r}"
    )

data = np.array(data)
labels = np.array(labels)

# Encode labels
# Map every string class label to an integer id; `encoder` is kept around so
# predicted ids can be turned back into label names later.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(labels)
encoded_labels = encoder.transform(labels)

# Step 6: Train-Test Split
# Hold out 20% of the samples for testing; the fixed seed pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

# Step 7: Build and Train Deep Learning Model
# Fully connected classifier over the flattened keypoint vector: two ReLU
# hidden layers with dropout, then a softmax over the encoded classes.
num_features = data.shape[1]
num_classes = len(np.unique(encoded_labels))

model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(num_features,)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation='softmax'))

# Targets are integer class ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop once validation loss has not improved for 5 epochs and restore the
# best weights; 20% of the training split is held out for validation.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=1000,
    callbacks=[early_stop],
    batch_size=32,
)

# Plot Training History
# One curve per recorded metric: training accuracy, then validation accuracy.
accuracy_curves = (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
)
for metric_key, curve_label in accuracy_curves:
    plt.plot(history.history[metric_key], label=curve_label)
plt.legend()
plt.show()

# Step 8: Evaluate the Model
# evaluate() on the held-out test split yields the loss and the accuracy.
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Step 9: Save the Model and Encoder
# The class names are stored next to the model so predicted ids can be
# decoded again without refitting the encoder.
model.save('sign_language_model.h5')
np.save(file='label_encoder_classes.npy', arr=encoder.classes_)

# Step 10: Functions for Prediction and TTS
def predict_sign_language(keypoints):
    """Classify a flat keypoint vector and return the decoded sign label.

    Args:
        keypoints: Flat sequence of landmark coordinates, or ``None``.

    Returns:
        The label of the highest-scoring class as decoded by ``encoder``,
        or ``None`` when ``keypoints`` is ``None``.
    """
    if keypoints is None:
        return None

    # The model consumes a batch, so wrap the single sample as shape (1, n).
    features = np.array(keypoints).reshape(1, -1)
    class_scores = model.predict(features)
    best_class = np.argmax(class_scores)
    return encoder.inverse_transform([best_class])[0]

def text_to_speech(text):
    """Synthesize ``text`` with gTTS, save it as an MP3 and autoplay it inline."""
    output_path = "output.mp3"
    gTTS(text).save(output_path)
    player = Audio(output_path, autoplay=True)
    display(player)
# Function to visualize MediaPipe processed image with landmarks
def visualize_hand(image, landmarks):
    """Return a copy of ``image`` with every hand in ``landmarks`` drawn on it.

    The input image itself is not modified; each hand's landmarks and their
    ``HAND_CONNECTIONS`` links are rendered onto the copy.
    """
    canvas = image.copy()
    for single_hand in landmarks:
        mp_drawing.draw_landmarks(canvas, single_hand, mp_hands.HAND_CONNECTIONS)
    return canvas

# Update `test_uploaded_image` function to include visualization
def test_uploaded_image():
    """Upload image(s) in Colab, then detect, display and speak the sign in each.

    For every uploaded file: run MediaPipe hand detection, show the image with
    the landmarks drawn, classify the keypoints and read the predicted sign
    aloud. Files that cannot be decoded as images are reported and skipped
    instead of aborting the whole batch.
    """
    uploaded = files.upload()
    for filename in uploaded:
        # The uploaded file is read back from disk by its name.
        image = cv2.imread(filename)
        if image is None:
            # cv2.imread returns None (no exception) for unreadable files;
            # cvtColor below would otherwise raise on it.
            print(f"Could not read image: {filename}")
            continue

        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        result = hands.process(image_rgb)
        if not result.multi_hand_landmarks:
            print("No hand detected!")
            continue

        # Display MediaPipe hand visualization (matplotlib expects RGB).
        processed_image = visualize_hand(image, result.multi_hand_landmarks)
        plt.imshow(cv2.cvtColor(processed_image, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.title("MediaPipe Processed Image")
        plt.show()

        # Flatten the landmarks into [x, y, z, ...], the layout the model
        # was trained on.
        keypoints = []
        for hand_landmarks in result.multi_hand_landmarks:
            for lm in hand_landmarks.landmark:
                keypoints.extend((lm.x, lm.y, lm.z))

        # Predict the gesture
        prediction = predict_sign_language(keypoints)
        if prediction:
            print(f"Detected Sign: {prediction}")
            text_to_speech(prediction)
        else:
            print("No valid sign detected!")

# Update `real_time_detection` function to include visualization
def real_time_detection():
    """Run live sign detection on the default webcam until 'q' is pressed.

    Each frame is passed through MediaPipe. When a hand is found, the
    keypoints are classified, the landmarks are shown in a separate window and
    the predicted sign is drawn on the main frame. The sign is spoken only
    when it differs from the previously spoken one, so the text-to-speech
    call (which synthesizes and saves an MP3) is not repeated on every frame
    while a sign is being held. The capture device and all windows are
    released even if processing raises.
    """
    cap = cv2.VideoCapture(0)
    last_spoken = None
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            result = hands.process(image_rgb)

            if result.multi_hand_landmarks:
                # Extract keypoints as a flat [x, y, z, ...] list.
                keypoints = []
                for hand_landmarks in result.multi_hand_landmarks:
                    for lm in hand_landmarks.landmark:
                        keypoints.extend((lm.x, lm.y, lm.z))

                # Predict the gesture
                prediction = predict_sign_language(keypoints)

                # Display MediaPipe hand visualization in a separate window
                processed_image = visualize_hand(frame, result.multi_hand_landmarks)
                cv2.imshow("MediaPipe Processed Image", processed_image)

                if prediction:
                    # Display prediction on the original frame
                    cv2.putText(
                        frame,
                        f"Detected: {prediction}",
                        (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1,
                        (255, 0, 0),
                        2,
                    )

                    # Speak only on a change of sign to keep the loop responsive.
                    if prediction != last_spoken:
                        text_to_speech(prediction)
                        last_spoken = prediction
            else:
                # Hand left the frame: allow the same sign to be spoken again.
                last_spoken = None

            cv2.imshow('Sign Language Detection', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

# Test Functions
# test_uploaded_image()
# real_time_detection()

In [None]:
# Test functions: upload image(s) and run sign detection on each of them.
test_uploaded_image()
