In [None]:
# Install necessary libraries
!pip install SpeechRecognition moviepy librosa tensorflow keras

import os
import re
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from moviepy.editor import VideoFileClip
from IPython.display import Video, display
import speech_recognition as sr
from IPython.display import HTML


# Set dataset path
DATASET_PATH = "/content/sign_language_dataset2"

# Step 1: Load dataset and prepare labels
# Every .mp4 file in DATASET_PATH is one gesture; its lower-cased file stem is the label.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to keep the
# label/feature order identical from run to run. The extension test is case-insensitive
# so that files such as "HELLO.MP4" are not silently skipped.
gesture_videos = {os.path.splitext(f)[0].lower(): os.path.join(DATASET_PATH, f)
                  for f in sorted(os.listdir(DATASET_PATH)) if f.lower().endswith('.mp4')}
if not gesture_videos:
    # Fail here with a clear message instead of deep inside model construction/training.
    raise FileNotFoundError(f"No .mp4 gesture videos found in {DATASET_PATH!r}")
labels = list(gesture_videos.keys())

# Label encoding for gestures
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Step 2: Feature extraction for video (using placeholder features)
def extract_video_features(video_path):
    """Return placeholder features for one gesture video.

    Only the clip's duration is used for now; this is the hook to extend with
    frame-based features later.

    Args:
        video_path: Path of the video file, passed straight to ``VideoFileClip``.

    Returns:
        A one-element NumPy array containing ``clip.duration``.
    """
    clip = VideoFileClip(video_path)
    try:
        duration = clip.duration
    finally:
        # Always release the clip, even if reading the duration fails.
        clip.close()
    return np.array([duration])

# Build dataset features and labels
# The duration features are still extracted (placeholder for future frame-based
# features), but they are not used as model input; see `model_inputs` below.
video_features = np.array([extract_video_features(path) for path in gesture_videos.values()])
video_labels = np.array(encoded_labels)

# Step 3: Train ML/DL model for text/speech to video mapping
NUM_CLASSES = len(labels)
EPOCHS = 1000
BATCH_SIZE = 4

# Convert labels to one-hot encoding
one_hot_labels = tf.keras.utils.to_categorical(video_labels, num_classes=NUM_CLASSES)

# The Embedding layer consumes integer indices in [0, NUM_CLASSES), and predict_video()
# queries the model with an encoded label index. Train on those same indices, shaped
# (n_samples, 1) to match input_length=1. Training on the float durations instead would
# fit the model to values it never receives at prediction time, and durations can fall
# outside the embedding's index range.
model_inputs = video_labels.reshape(-1, 1)

# Define a simple LSTM model
model = Sequential([
    Embedding(input_dim=NUM_CLASSES, output_dim=64, input_length=1),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dense(NUM_CLASSES, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Training the model
model.fit(model_inputs, one_hot_labels, epochs=EPOCHS, batch_size=BATCH_SIZE)

# Step 4: Map input speech/text to video
def preprocess_text(input_text):
    """Normalise free text before it is matched against gesture labels.

    Every character other than an ASCII letter, an ASCII digit or a space is
    removed; the result is then stripped of surrounding whitespace and lower-cased.

    Args:
        input_text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    letters_digits_spaces = re.sub(r'[^a-zA-Z0-9 ]', '', input_text)
    trimmed = letters_digits_spaces.strip()
    return trimmed.lower()

def predict_video(input_text):
    """Map free text to the path of the gesture video the model predicts for it.

    The text is normalised with ``preprocess_text``. If the result is not one of
    the known ``labels``, no prediction is attempted.

    Args:
        input_text: Raw text naming a gesture.

    Returns:
        The path stored in ``gesture_videos`` for the predicted label, or ``None``
        when the normalised text is not a known label (or the predicted label has
        no entry in ``gesture_videos``).
    """
    label = preprocess_text(input_text)
    if label not in labels:
        return None

    # Shape (1, 1): a batch of one sample holding a single label index. This is the
    # same rank the model is trained with, so nothing relies on implicit rank expansion.
    encoded_input = np.array([[label_encoder.transform([label])[0]]])
    prediction = model.predict(encoded_input)
    predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])
    return gesture_videos.get(predicted_label[0])

def play_video(video_path):
    """Show a gesture video inline, or report that there is none.

    Args:
        video_path: Path of the video to embed. Any falsy value (``None``, ``""``)
            means no gesture was found.
    """
    if not video_path:
        print("No corresponding gesture found.")
        return
    player = Video(video_path, embed=True)
    display(player)


# Step 5: Speech-to-text conversion
def speech_to_text(audio_path=None):
    """Transcribe speech from an audio file or, without one, from the microphone.

    Args:
        audio_path: Path of an audio file to read. When falsy, audio is captured
            from ``sr.Microphone()`` after printing a prompt.

    Returns:
        The text returned by ``recognize_google``, or ``None`` if the audio could
        not be understood or the recognition request failed (a message is printed
        in both cases).
    """
    recognizer = sr.Recognizer()

    # Acquire the audio: live capture when no file is given, otherwise read the file.
    if not audio_path:
        with sr.Microphone() as mic:
            print("Speak now...")
            captured = recognizer.listen(mic)
    else:
        with sr.AudioFile(audio_path) as audio_file:
            captured = recognizer.record(audio_file)

    try:
        transcript = recognizer.recognize_google(captured)
    except sr.UnknownValueError:
        print("Could not understand audio.")
    except sr.RequestError:
        print("Service unavailable.")
    else:
        return transcript
    return None

# Step 6: Full pipeline
def convert_to_sign_language(input_text=None, audio_path=None):
    """Run the full pipeline: take text or audio, find the gesture video, play it.

    Args:
        input_text: Text naming the gesture. Ignored when ``audio_path`` is given.
        audio_path: Path of an audio file to transcribe with ``speech_to_text``;
            when provided, its transcription replaces ``input_text``.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    if not (input_text or audio_path):
        print("Please provide either text or an audio input.")
        return

    # Audio takes precedence over any text that was passed alongside it.
    text = speech_to_text(audio_path) if audio_path else input_text

    if not text:
        print("Unable to process input.")
        return

    print(f"Input: {text}")
    play_video(predict_video(text))

# Example Usage
# Text Input
#convert_to_sign_language(input_text="hello")

# Speech Input (Use an audio file path or microphone)
# convert_to_sign_language(audio_path="/path/to/audio.wav")


In [None]:
#convert_to_sign_language(input_text="alone")

In [None]:
def text_to_sign_language(text):
    """Play the gesture video for each word of ``text``, in order.

    The text is normalised with ``preprocess_text`` and split on whitespace. Each
    word is looked up in ``gesture_videos`` and shown with ``play_video``; words
    without a video are reported and skipped.

    This relies only on names defined in this notebook. The previous version used
    ``label_map``, ``train_videos`` and ``cv2``, none of which is defined or
    imported here, and rendered through an OpenCV window rather than inline.

    Args:
        text: Free text containing one or more gesture words.
    """
    processed_text = preprocess_text(text)
    words = processed_text.split()

    print(f"Processing text: {processed_text}")
    for word in words:
        # preprocess_text() already lower-cases, matching the keys of gesture_videos.
        video_path = gesture_videos.get(word)
        if video_path is not None:
            play_video(video_path)
        else:
            print(f"No gesture found for: {word}")


In [None]:
def real_time_system():
    """Interactive entry point: ask for an input mode and run the pipeline.

    Mode "1" (speech) is not implemented yet and only prints a notice. Mode "2"
    prompts for text and, if any was entered, passes it to
    ``convert_to_sign_language``. Any other answer is rejected.
    """
    # Tolerate stray whitespace around the typed choice.
    mode = input("Choose mode (1 for Speech, 2 for Text): ").strip()

    if mode == "1":
        # TODO: enable speech capture here (text = speech_to_text()) once it is ready.
        print("still in processing phase")
        # Return now: no text exists in this branch, so there is nothing to convert.
        return

    if mode != "2":
        print("Invalid mode!")
        return

    text = input("Enter text: ")
    if text:
        convert_to_sign_language(input_text=text)


In [None]:
# Start the interactive prompt; this cell waits on input() for the mode choice.
real_time_system()
