In [None]:
# @title
!pip install gradio google-generativeai mediapipe opencv-python gtts googletrans==4.0.0-rc1
!pip install gradio mediapipe opencv-python pillow numpy pandas tensorflow joblib gtts googletrans==4.0.0-rc1 edge-tts google-generativeai

In [1]:
import cv2
import os
import numpy as np
import pandas as pd
import mediapipe as mp
from tensorflow.keras.models import load_model
import joblib
from PIL import Image
import google.generativeai as genai
import re
import asyncio
from gtts import gTTS
from googletrans import Translator
import edge_tts

def load_models_and_setup():
    """Load the sign language model and setup MediaPipe"""
    print("🔄 Loading models and setting up MediaPipe...")

    # Load model and label encoder
    model = load_model("/content/Model (3).h5")
    label_encoder = joblib.load("/content/Model (3).pkl")

    # Setup MediaPipe
    mp_hands = mp.solutions.hands
    hands = mp_hands.Hands(
        static_image_mode=True,
        max_num_hands=2,
        min_detection_confidence=0.5
    )

    print("✅ Models loaded successfully!")
    return model, label_encoder, hands

def process_images(model, label_encoder, hands, image_folder, output_csv_path):
    """Process all images in the folder and predict sign language"""
    print("🔄 Processing sign language images...")

    results = []

    # Loop through all images
    for filename in os.listdir(image_folder):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp')):
            image_path = os.path.join(image_folder, filename)

            # Read image and convert to RGB
            image = cv2.imread(image_path)
            if image is None:
                print(f"❌ Could not read image: {filename}")
                continue

            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            result = hands.process(image_rgb)

            # Check if hands were detected
            if result.multi_hand_landmarks:
                keypoints = []

                for hand_landmarks in result.multi_hand_landmarks:
                    for lm in hand_landmarks.landmark:
                        keypoints.extend([lm.x, lm.y, lm.z])

                # If only one hand, pad to 126
                if len(result.multi_hand_landmarks) == 1:
                    keypoints.extend([-1.0] * 63)

                if len(keypoints) == 126:
                    X_input = np.array(keypoints).reshape(1, -1)
                    prediction = model.predict(X_input)
                    predicted_index = np.argmax(prediction)
                    predicted_class = label_encoder.inverse_transform([predicted_index])[0]
                    confidence = float(np.max(prediction))

                    results.append({
                        "filename": filename,
                        "predicted_class": predicted_class,
                        "confidence": confidence
                    })
                else:
                    print(f"Skipped {filename}: Invalid keypoints length = {len(keypoints)}")
            else:
                print(f"Skipped {filename}: No hands detected")

    # Save results to CSV
    df = pd.DataFrame(results)
    df.to_csv(output_csv_path, index=False)

    print(f"✅ Predictions saved to {output_csv_path}")
    return df

def generate_sentence_from_predictions(df, api_key):
    """Generate a natural sentence from predicted sign language words"""
    print("🔄 Generating sentence from predictions...")

    # Configure Gemini API
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("models/gemini-1.5-flash-latest")

    # Collect all predicted classes
    predicted_words = df["predicted_class"].tolist()
    words = ", ".join(predicted_words)

    prompt = (
        f"You are an emotionally intelligent sentence generator.\n"
        f"Use the following 3–5 words as inspiration: {words}.\n"
        f"Generate one short, natural-sounding sentence (8–12 words max).\n"
        f"Generate ONE and ONLY ONE grammatically correct sentence (8–12 words max).\n"
        f"Do not include explanations, multiple options, or repeated outputs.\n"
        f"End with appropriate punctuation (., ?, !). Return only the sentence.\n"
        f"Understand the intent behind the words, not just their surface form.\n"
        f"Choose the correct sentence style: request, statement, or question.\n"
        f"Be grammatically correct and use appropriate punctuation (!, ?, or .)\n"
        f"Examples:\n"
        f"  Words: Love Family → Sentence: I really love my family.\n"
        f"  Words: Eat Food Now → Sentence: Can we eat some food now?\n"
        f"  Words: Please Water → Sentence: Please give me some water.\n"
        f"  Words: You Where → Sentence: Where are you?\n"
        f"  Words: Smile Beautiful → Sentence: Your smile is beautiful!\n"
        f"Now generate the sentence:"
    )

    try:
        response = model.generate_content(prompt)
        text = response.text.strip()

        # Extract the first valid sentence
        sentences = re.findall(r'[^.!?]*[.!?]', text)
        sentence = sentences[0].strip() if sentences else text

        # Save sentence to file
        with open("/content/generated_sentences.txt", "w") as f:
            f.write(sentence + "\n")

        print(f"✅ Sentence generated: {sentence}")
        return sentence

    except Exception as e:
        print(f"❌ Error generating sentence: {e}")
        return "ERROR"

async def generate_audio_files(sentence):
    """Generate audio files in both English and Chinese"""
    print("🔄 Generating audio files...")

    # Step 1: Translate to Chinese
    translator = Translator()
    translation = translator.translate(sentence, src="en", dest="zh-cn")
    chinese_text = translation.text

    print(f"🌐 Translation: {chinese_text}")

    # Step 2: Create output folder
    output_folder = "output_audio"
    os.makedirs(output_folder, exist_ok=True)

    # Step 3: Generate English audio using gTTS
    english_audio_path = os.path.join(output_folder, "sentence_en.mp3")
    gTTS(text=sentence, lang="en").save(english_audio_path)
    print(f"✅ English audio saved to: {english_audio_path}")

    # Step 4: Generate Chinese audio using edge-tts
    chinese_audio_path = os.path.join(output_folder, "sentence_zh.mp3")
    communicate = edge_tts.Communicate(text=chinese_text, voice="zh-CN-XiaoxiaoNeural")
    await communicate.save(chinese_audio_path)
    print(f"✅ Chinese audio saved to: {chinese_audio_path}")

    return english_audio_path, chinese_audio_path

async def main():
    """Main function to run the complete pipeline"""
    print("🚀 Starting Sign Language Processing Pipeline\n")

    # Configuration
    IMAGE_FOLDER = "/content/testimages"
    OUTPUT_CSV_PATH = "/content/prediction.csv"
    GEMINI_API_KEY = "AIzaSyC4h_QLZOZMUzQRzemTcwPSfjdBQO1I2Ac"

    try:
        # Step 1: Load models and setup MediaPipe
        model, label_encoder, hands = load_models_and_setup()

        # Step 2: Process images and predict sign language
        df = process_images(model, label_encoder, hands, IMAGE_FOLDER, OUTPUT_CSV_PATH)

        if df.empty:
            print("❌ No valid predictions found. Exiting...")
            return

        print(f"\n📊 Predictions Summary:")
        print(df.head())

        # Step 3: Generate sentence from predictions
        sentence = generate_sentence_from_predictions(df, GEMINI_API_KEY)

        if sentence == "ERROR":
            print("❌ Failed to generate sentence. Exiting...")
            return

        # Step 4: Generate audio files
        english_path, chinese_path = await generate_audio_files(sentence)

        print(f"\n🎉 Pipeline completed successfully!")
        print(f"📝 Generated sentence: {sentence}")
        print(f"🔊 English audio: {english_path}")
        print(f"🔊 Chinese audio: {chinese_path}")

    except Exception as e:
        print(f"❌ Pipeline error: {e}")

# Run the pipeline
if __name__ == "__main__":
    # Handle both Jupyter and regular Python environments
    try:
        # Check if we're in a Jupyter notebook
        if asyncio.get_event_loop().is_running():
            # In Jupyter, create a new event loop in a thread
            import nest_asyncio
            nest_asyncio.apply()
            asyncio.run(main())
        else:
            # In regular Python environment
            asyncio.run(main())
    except RuntimeError:
        # Fallback for different environments
        asyncio.run(main())



🚀 Starting Sign Language Processing Pipeline

🔄 Loading models and setting up MediaPipe...
✅ Models loaded successfully!
🔄 Processing sign language images...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
✅ Predictions saved to /content/prediction.csv

📊 Predictions Summary:
             filename predicted_class  confidence
0  Water_0616 (1).jpg           Water    0.999494
1         Me_1145.jpg              Me    0.999987
2     Please_0344.jpg          Please    0.999980
🔄 Generating sentence from predictions...
✅ Sentence generated: Please, give me some water.
🔄 Generating audio files...
🌐 Translation: 请给我一些水。
✅ English audio saved to: output_audio/sentence_en.mp3
✅ Chinese audio saved to: output_audio/sentence_zh.mp3

🎉 Pipeline completed successfully!
📝 Generated sentence: Please, give me some water.
🔊 English audio