In [1]:
pip install tensorflow opencv-python pillow pyttsx3

Note: you may need to restart the kernel to use updated packages.


In [1]:
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Constants
IMG_HEIGHT, IMG_WIDTH = 224, 224
BATCH_SIZE = 32

def create_model(num_classes):
    # Load the pre-trained MobileNetV2 model
    base_model = MobileNetV2(weights='imagenet', include_top=False, 
                            input_shape=(IMG_HEIGHT, IMG_WIDTH, 3))
    
    # Freeze the pre-trained layers
    for layer in base_model.layers:
        layer.trainable = False
    
    # Add custom layers
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(1024, activation='relu')(x)
    predictions = Dense(num_classes, activation='softmax')(x)
    
    model = Model(inputs=base_model.input, outputs=predictions)
    
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model

def prepare_data(data_dir):
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        validation_split=0.2,
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True
    )
    
    train_generator = train_datagen.flow_from_directory(
        data_dir,
        target_size=(IMG_HEIGHT, IMG_WIDTH),
        batch_size=BATCH_SIZE,
        class_mode='categorical',
        subset='training'
    )
    
    validation_generator = train_datagen.flow_from_directory(
        data_dir,
        target_size=(IMG_HEIGHT, IMG_WIDTH),
        batch_size=BATCH_SIZE,
        class_mode='categorical',
        subset='validation'
    )
    
    return train_generator, validation_generator

def train_model(model, train_generator, validation_generator, epochs=10):
    history = model.fit(
        train_generator,
        validation_data=validation_generator,
        epochs=epochs
    )
    return history

# Training execution
DATA_DIR = r'C:\Users\Smruti Jagtap\Downloads\Data_Set'  # Your data path
MODEL_SAVE_PATH = 'sign_language_model_pretrained.h5'

# Prepare data and train
train_generator, validation_generator = prepare_data(DATA_DIR)
num_classes = len(train_generator.class_indices)
class_names = list(train_generator.class_indices.keys())

# Create and train model
model = create_model(num_classes)
history = train_model(model, train_generator, validation_generator)

# Save the model and class names
model.save(MODEL_SAVE_PATH)
print("Model saved to:", MODEL_SAVE_PATH)
print("Class names:", class_names)

Found 6916 images belonging to 26 classes.
Found 1716 images belonging to 26 classes.
Epoch 1/10


  self._warn_if_super_not_called()


[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m354s[0m 2s/step - accuracy: 0.5691 - loss: 1.6005 - val_accuracy: 0.7098 - val_loss: 0.9427
Epoch 2/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 1s/step - accuracy: 0.9359 - loss: 0.2187 - val_accuracy: 0.7762 - val_loss: 0.7805
Epoch 3/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 994ms/step - accuracy: 0.9579 - loss: 0.1462 - val_accuracy: 0.7955 - val_loss: 0.7202
Epoch 4/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 985ms/step - accuracy: 0.9713 - loss: 0.0992 - val_accuracy: 0.8007 - val_loss: 0.7597
Epoch 5/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 977ms/step - accuracy: 0.9733 - loss: 0.0884 - val_accuracy: 0.7756 - val_loss: 0.8191
Epoch 6/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 988ms/step - accuracy: 0.9660 - loss: 0.1084 - val_accuracy: 0.7815 - val_loss: 0.8670
Epoch 7/10
[1m217/21



Model saved to: sign_language_model_pretrained.h5
Class names: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


In [1]:
import tkinter as tk
from tkinter import ttk, filedialog
import cv2
from PIL import Image, ImageTk
import tensorflow as tf
import numpy as np
import pyttsx3
import threading
import os
import json

# Constants
IMG_HEIGHT, IMG_WIDTH = 224, 224

class SignLanguageGUI:
    def __init__(self, window, model_path, class_names):
        self.window = window
        self.window.title("Sign Language Recognition")
        self.window.geometry("1000x700")
        
        # Initialize detector
        self.detector = SignLanguageDetector(model_path, class_names)
        
        # Variables to store text
        self.recognized_text = tk.StringVar()
        self.recognized_text.set("")
        self.current_detection = tk.StringVar()
        self.current_detection.set("No gesture detected")
        
        # Create and set up the GUI
        self.setup_gui()
        
        # Initialize video capture
        self.cap = cv2.VideoCapture(0)
        
        # Start video stream
        self.update_video()

    def setup_gui(self):
        # Main frame
        main_frame = ttk.Frame(self.window, padding="10")
        main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
        
        # Left frame for video and text
        left_frame = ttk.Frame(main_frame)
        left_frame.grid(row=0, column=0, padx=(0, 10))
        
        # Video label
        self.video_label = ttk.Label(left_frame)
        self.video_label.grid(row=0, column=0, pady=(0, 10))
        
        # Current detection display
        current_detection_frame = ttk.Frame(left_frame)
        current_detection_frame.grid(row=1, column=0, sticky=(tk.W, tk.E), pady=(0, 10))
        ttk.Label(current_detection_frame, text="Current Detection: ").grid(row=0, column=0)
        ttk.Label(current_detection_frame, textvariable=self.current_detection).grid(row=0, column=1)
        
        # Text display
        text_frame = ttk.Frame(left_frame, relief=tk.SUNKEN, borderwidth=1)
        text_frame.grid(row=2, column=0, sticky=(tk.W, tk.E))
        text_label = ttk.Label(text_frame, textvariable=self.recognized_text, 
                              font=("Courier", 12), padding="10")
        text_label.grid(sticky=(tk.W, tk.E))
        
        # Right frame for buttons
        right_frame = ttk.Frame(main_frame)
        right_frame.grid(row=0, column=1, sticky=(tk.N))
        
        # Set button
        set_button = ttk.Button(right_frame, text="Set Letter", command=self.set_letter)
        set_button.grid(row=0, column=0, pady=(0, 10), ipadx=20, ipady=30)
        
        # Space button
        space_button = ttk.Button(right_frame, text="Add Space", command=self.add_space)
        space_button.grid(row=1, column=0, pady=(0, 10), ipadx=20, ipady=30)
        
        # Cancel last letter button
        cancel_button = ttk.Button(right_frame, text="Cancel Last", command=self.cancel_last)
        cancel_button.grid(row=2, column=0, pady=(0, 10), ipadx=20, ipady=30)
        
        # Speak button
        speak_button = ttk.Button(right_frame, text="Speak", command=self.speak_text)
        speak_button.grid(row=3, column=0, pady=(0, 10), ipadx=20, ipady=30)
        
        # Clear button
        clear_button = ttk.Button(right_frame, text="Clear", command=self.clear_text)
        clear_button.grid(row=4, column=0, pady=(0, 10), ipadx=20, ipady=30)
        
        # Save feedback button
        save_button = ttk.Button(right_frame, text="Save Feedback", command=self.save_feedback)
        save_button.grid(row=5, column=0, ipadx=20, ipady=30)

    def update_video(self):
        ret, frame = self.cap.read()
        if ret:
            frame = cv2.flip(frame, 1)
            
            processed_frame = self.detector.preprocess_frame(frame)
            predictions = self.detector.model.predict(processed_frame)
            predicted_class_index = np.argmax(predictions[0])
            predicted_class = self.detector.class_names[predicted_class_index]
            confidence = predictions[0][predicted_class_index]
            
            if confidence > 0.7:
                self.current_detection.set(f"{predicted_class} ({confidence:.2f})")
            else:
                self.current_detection.set("No gesture detected")
            
            cv2.putText(frame, f"{predicted_class}: {confidence:.2f}", (10, 30), 
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            
            cv2image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = Image.fromarray(cv2image)
            
            target_width = 640
            aspect_ratio = float(img.size[1]) / float(img.size[0])
            target_height = int(target_width * aspect_ratio)
            img = img.resize((target_width, target_height), Image.Resampling.LANCZOS)
            
            imgtk = ImageTk.PhotoImage(image=img)
            self.video_label.imgtk = imgtk
            self.video_label.configure(image=imgtk)
        
        self.window.after(10, self.update_video)

    def set_letter(self):
        current = self.current_detection.get()
        if current != "No gesture detected":
            letter = current.split()[0]
            current_text = self.recognized_text.get()
            self.recognized_text.set(current_text + letter)

    def add_space(self):
        current_text = self.recognized_text.get()
        self.recognized_text.set(current_text + " ")

    def cancel_last(self):
        current_text = self.recognized_text.get()
        if current_text:
            self.recognized_text.set(current_text[:-1])

    def speak_text(self):
        text = self.recognized_text.get()
        if text:
            threading.Thread(target=self.detector._speak, args=(text,), daemon=True).start()

    def clear_text(self):
        self.recognized_text.set("")

    def save_feedback(self):
        feedback = self.recognized_text.get()
        if feedback:
            file_path = filedialog.asksaveasfilename(defaultextension=".txt")
            if file_path:
                with open(file_path, "w") as file:
                    file.write(feedback)
                print(f"Feedback saved to {file_path}")

    def cleanup(self):
        if self.cap.isOpened():
            self.cap.release()

class SignLanguageDetector:
    def __init__(self, model_path, class_names):
        self.model = tf.keras.models.load_model(model_path)
        self.class_names = class_names
        self.engine = pyttsx3.init()
        
    def preprocess_frame(self, frame):
        resized = cv2.resize(frame, (IMG_HEIGHT, IMG_WIDTH))
        normalized = resized / 255.0
        return np.expand_dims(normalized, axis=0)
    
    def _speak(self, text):
        self.engine.say(text)
        self.engine.runAndWait()

def main():
    MODEL_PATH = 'sign_language_model_pretrained.h5'
    CLASS_NAMES = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
                   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    
    root = tk.Tk()
    app = SignLanguageGUI(root, MODEL_PATH, CLASS_NAMES)
    
    root.protocol("WM_DELETE_WINDOW", lambda: [app.cleanup(), root.destroy()])
    
    root.mainloop()

if __name__ == "__main__":
    main()



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms

In [2]:
import cv2
import numpy as np
import tensorflow as tf

class SignLanguageDetector:
    def __init__(self, model_path, class_names):
        self.model = tf.keras.models.load_model(model_path)
        self.class_names = class_names

    def preprocess_frame(self, frame):
        resized = cv2.resize(frame, (224, 224))
        normalized = resized / 255.0
        return np.expand_dims(normalized, axis=0)

    def predict(self, frame):
        processed_frame = self.preprocess_frame(frame)
        predictions = self.model.predict(processed_frame)
        predicted_class_index = np.argmax(predictions[0])
        predicted_class = self.class_names[predicted_class_index]
        confidence = predictions[0][predicted_class_index]
        return predicted_class, confidence

In [1]:
import streamlit as st
import cv2
import tensorflow as tf
import numpy as np
from PIL import Image
import pyttsx3
import io
import threading
import os

# Constants
IMG_HEIGHT, IMG_WIDTH = 224, 224

class SignLanguageDetector:
    def __init__(self, model_path, class_names):
        self.model = tf.keras.models.load_model(model_path)
        self.class_names = class_names
        self.engine = pyttsx3.init()
        
    def preprocess_frame(self, frame):
        resized = cv2.resize(frame, (IMG_HEIGHT, IMG_WIDTH))
        normalized = resized / 255.0
        return np.expand_dims(normalized, axis=0)
    
    def predict(self, frame):
        processed_frame = self.preprocess_frame(frame)
        predictions = self.model.predict(processed_frame)
        predicted_class_index = np.argmax(predictions[0])
        predicted_class = self.class_names[predicted_class_index]
        confidence = predictions[0][predicted_class_index]
        return predicted_class, confidence
    
    def speak(self, text):
        self.engine.say(text)
        self.engine.runAndWait()

def main():
    st.title("Sign Language Recognition")

    CLASS_NAMES = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
                   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

    # Model loading options
    st.sidebar.header("Model Loading")
    load_option = st.sidebar.radio("Choose how to load the model:", 
                                   ("Upload model file", "Use default path"))

    if load_option == "Upload model file":
        uploaded_file = st.sidebar.file_uploader("Upload your model file", type=['h5'])
        if uploaded_file is not None:
            with open("temp_model.h5", "wb") as f:
                f.write(uploaded_file.getbuffer())
            MODEL_PATH = "temp_model.h5"
        else:
            st.sidebar.warning("Please upload a model file to continue.")
            return
    else:
        MODEL_PATH = 'sign_language_model_pretrained.h5'
        if not os.path.exists(MODEL_PATH):
            st.sidebar.error(f"Model file not found at {MODEL_PATH}. Please upload a model file instead.")
            return

    try:
        detector = SignLanguageDetector(MODEL_PATH, CLASS_NAMES)
    except Exception as e:
        st.error(f"Error loading the model: {str(e)}")
        return

    # Sidebar controls
    st.sidebar.header("Controls")
    start_button = st.sidebar.button("Start Camera")
    stop_button = st.sidebar.button("Stop Camera")

    # Main content
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Camera Feed")
        image_placeholder = st.empty()

    with col2:
        st.subheader("Detected Gesture")
        gesture_placeholder = st.empty()

    recognized_text = st.empty()
    
    text_input = st.text_input("Recognized Text", value="", key="text_input")

    col3, col4, col5 = st.columns(3)
    with col3:
        if st.button("Add Space"):
            st.session_state.text_input += " "
    with col4:
        if st.button("Cancel Last"):
            st.session_state.text_input = st.session_state.text_input[:-1]
    with col5:
        if st.button("Clear"):
            st.session_state.text_input = ""

    if st.button("Speak"):
        threading.Thread(target=detector.speak, args=(st.session_state.text_input,), daemon=True).start()

    if st.button("Save Feedback"):
        feedback = st.session_state.text_input
        if feedback:
            buffer = io.StringIO()
            buffer.write(feedback)
            st.download_button(
                label="Download feedback",
                data=buffer.getvalue(),
                file_name="feedback.txt",
                mime="text/plain"
            )

    cap = cv2.VideoCapture(0)

    while start_button and not stop_button:
        ret, frame = cap.read()
        if ret:
            frame = cv2.flip(frame, 1)
            
            predicted_class, confidence = detector.predict(frame)
            
            if confidence > 0.7:
                gesture_placeholder.write(f"{predicted_class} ({confidence:.2f})")
                if st.button("Set Letter"):
                    st.session_state.text_input += predicted_class
            else:
                gesture_placeholder.write("No gesture detected")
            
            cv2.putText(frame, f"{predicted_class}: {confidence:.2f}", (10, 30), 
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            
            image_placeholder.image(frame, channels="BGR")
            
        recognized_text.text(st.session_state.text_input)

    cap.release()

    # Clean up temporary file if it was created
    if os.path.exists("temp_model.h5"):
        os.remove("temp_model.h5")

if __name__ == "__main__":
    main()

2024-10-15 21:41:17.666 
  command:

    streamlit run c:\Users\Smruti Jagtap\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-10-15 21:41:17.679 Session state does not function when running a script without `streamlit run`
