In [1]:
import numpy as np
import os
import cv2
import mediapipe as mp
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# MediaPipe handles shared by the rest of the notebook: the drawing helpers,
# the hands solution module, and one detector instance limited to a single hand.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7,
)


In [2]:
# Function to create dataset
def create_dataset(output_dir, labels):
    """Interactively capture labelled webcam frames into ``output_dir/<label>/``.

    A preview window is shown for each label in turn. Pressing SPACE saves the
    current mirrored frame as ``<label>_<n>.jpeg``; pressing 'q' ends
    collection for the current label and moves on to the next one.

    Args:
        output_dir: Root directory of the dataset; created if it does not exist.
        labels: Iterable of label names. One sub-directory is created per label.
    """
    os.makedirs(output_dir, exist_ok=True)
    cap = cv2.VideoCapture(0)
    try:
        for label in labels:
            label_dir = os.path.join(output_dir, label)
            os.makedirs(label_dir, exist_ok=True)
            print(f"Collecting images for {label}. Press 'q' to quit.")
            img_count = 0   # images saved during this session (shown in the overlay)
            next_index = 1  # next candidate file number for this label
            while True:
                ret, frame = cap.read()
                if not ret:
                    print("Failed to capture image")
                    break
                frame = cv2.flip(frame, 1)
                # Draw the status text on a copy: the saved image must not
                # contain the label name, otherwise the classifier can learn
                # to read the overlay instead of the gesture.
                preview = frame.copy()
                cv2.putText(preview, f"Label: {label} | Images: {img_count}", (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
                cv2.imshow('Dataset Creation', preview)
                # Mask to the low byte, matching the key handling in the
                # detection loop of this notebook.
                k = cv2.waitKey(1) & 0xFF
                if k == ord('q'):
                    break
                elif k == ord(' '):
                    # Skip file numbers that already exist so a repeated
                    # session extends the dataset instead of overwriting it.
                    img_path = os.path.join(label_dir, f"{label}_{next_index}.jpeg")
                    while os.path.exists(img_path):
                        next_index += 1
                        img_path = os.path.join(label_dir, f"{label}_{next_index}.jpeg")
                    cv2.imwrite(img_path, frame)
                    img_count += 1
                    next_index += 1
    finally:
        # Always free the camera and close the window, even on an exception
        # or a kernel interrupt.
        cap.release()
        cv2.destroyAllWindows()


In [4]:
# Function to load data
def load_data(data_dir, image_size=(64, 64), extensions=('.jpeg',)):
    """Load a directory-per-class image dataset into NumPy arrays.

    Every sub-directory of ``data_dir`` is treated as one class. Classes are
    numbered in sorted name order and files are read in sorted order, so the
    returned mapping and sample order are the same on every run.

    Args:
        data_dir: Root directory containing one sub-directory per label.
        image_size: ``(width, height)`` passed to ``cv2.resize``.
        extensions: File suffix (or iterable of suffixes) to accept; compared
            case-insensitively.

    Returns:
        A tuple ``(images, labels, label_dict)``: the stacked RGB images, the
        integer class index of each image, and the ``{label_name: index}``
        mapping. Both arrays are empty when no image could be read.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    images = []
    labels = []
    label_dict = {}

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # label -> index mapping stable between runs and machines.
    class_names = sorted(
        name for name in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, name))
    )

    for class_index, label in enumerate(class_names):
        label_dict[label] = class_index
        label_dir = os.path.join(data_dir, label)
        for image_file in sorted(os.listdir(label_dir)):
            if not image_file.lower().endswith(suffixes):
                continue
            image = cv2.imread(os.path.join(label_dir, image_file))
            if image is None:
                # Unreadable or corrupt file: skip it rather than abort the load.
                continue
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            images.append(cv2.resize(image, image_size))
            labels.append(class_index)

    return np.array(images), np.array(labels), label_dict


In [5]:
# Create dataset
output_dir = 'sign_language'  # Directory to save the dataset
# The gesture names get their own variable so they are not overwritten by the
# integer class-index array that load_data() returns as `labels` below.
gesture_names = ['hello', 'thank_you', 'yes', 'no']  # Add your labels here
create_dataset(output_dir, gesture_names)

# Load data
images, labels, label_dict = load_data(output_dir)

# Fail here with a readable message instead of deep inside the split/training
# cells when nothing was captured.
assert len(images) > 0, f"No images found under {output_dir!r}; capture some with SPACE first."
assert len(images) == len(labels), "images and labels are out of sync"


Collecting images for hello. Press 'q' to quit.
Collecting images for thank_you. Press 'q' to quit.
Collecting images for yes. Press 'q' to quit.
Collecting images for no. Press 'q' to quit.


In [6]:
# Split data. Stratify on the class index so every gesture keeps the same
# share in the train and test sets (this needs at least two images per class).
X_train, X_test, y_train, y_test = train_test_split(
    images, labels, test_size=0.2, random_state=42, stratify=labels)

# Normalize images to [0, 1]. Cast to float32 first: dividing the uint8 arrays
# by 255.0 directly would produce float64 and double the memory footprint.
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0

# Convert labels to categorical (one-hot) vectors
num_classes = len(label_dict)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

# Data augmentation (applied to training batches only, via datagen.flow)
# NOTE(review): horizontal_flip mirrors the hand; confirm none of the gestures
# depends on left/right orientation.
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# NOTE(review): no feature-wise normalisation or ZCA option is enabled above,
# so this fit() call looks unnecessary; confirm before removing.
datagen.fit(X_train)


In [31]:
# Transfer learning on a pre-trained model: the ImageNet MobileNetV2 backbone
# is frozen and only the small classification head below is trained.
base_model = tf.keras.applications.MobileNetV2(input_shape=(64, 64, 3),
                                               include_top=False,
                                               weights='imagenet')

base_model.trainable = False

model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(64, 64, 3)),
    # The notebook feeds images scaled to [0, 1], but the MobileNetV2 ImageNet
    # weights expect inputs in [-1, 1]. Rescaling inside the model keeps the
    # training and prediction code unchanged while giving the backbone the
    # range it was trained on.
    tf.keras.layers.Rescaling(scale=2.0, offset=-1.0),
    base_model,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(len(label_dict), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 10 epochs and keep the best
# weights, instead of always running all 150 epochs.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=10,
                                              restore_best_weights=True)

# Train the model.
# steps_per_epoch is deliberately not passed: len(X_train) // 32 is 0 when
# there are fewer than 32 training images and otherwise drops the final
# partial batch. The iterator returned by datagen.flow has a known length, so
# Keras runs one full pass over it per epoch.
history = model.fit(datagen.flow(X_train, y_train, batch_size=32),
                    epochs=150,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stop])

# Save the model
model.save('hand_gesture_model.h5')


Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [36]:
# Invert label dictionary for prediction (class index -> label name)
label_dict_inv = {v: k for k, v in label_dict.items()}

CONFIDENCE_THRESHOLD = 0.7  # predictions at or below this are shown as "UNKNOWN"
BOX_PADDING = 20            # pixels added around the landmark bounding box


def _hand_bounding_box(hand_landmarks, frame_w, frame_h, padding):
    """Return (x_min, y_min, x_max, y_max) of the padded landmark box, clamped to the frame."""
    xs = [int(lm.x * frame_w) for lm in hand_landmarks.landmark]
    ys = [int(lm.y * frame_h) for lm in hand_landmarks.landmark]
    # Landmark coordinates are scaled by the frame size and may fall outside
    # the frame, so the raw extremes are bounded before the padding is applied.
    x_min = max(0, min(frame_w, *xs) - padding)
    y_min = max(0, min(frame_h, *ys) - padding)
    x_max = min(frame_w, max(0, *xs) + padding)
    y_max = min(frame_h, max(0, *ys) + padding)
    return x_min, y_min, x_max, y_max


# Real-time detection: press 'q' in the preview window to stop.
# NOTE(review): load_data() resizes whole saved frames to 64x64, while this
# loop classifies a cropped hand region; confirm the training images match
# what the model sees here.
cap = cv2.VideoCapture(0)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Failed to capture image")
            break

        frame = cv2.flip(frame, 1)
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb_frame)

        if result.multi_hand_landmarks:
            for hand_landmarks in result.multi_hand_landmarks:
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                # Extract hand region
                h, w, _ = frame.shape
                x_min, y_min, x_max, y_max = _hand_bounding_box(
                    hand_landmarks, w, h, BOX_PADDING)

                # Crop from the RGB frame, which has no landmark drawing on it
                hand_img = rgb_frame[y_min:y_max, x_min:x_max]
                if hand_img.size != 0:
                    hand_img = cv2.resize(hand_img, (64, 64))

                    # Preprocess for model: add a batch axis and scale to [0, 1]
                    input_data = np.expand_dims(hand_img, axis=0) / 255.0

                    # Make prediction. verbose=0 stops Keras printing a
                    # progress bar for every frame.
                    prediction = model.predict(input_data, verbose=0)
                    # Plain int so the dict lookup does not depend on a NumPy scalar
                    predicted_label = label_dict_inv[int(np.argmax(prediction))]
                    confidence = np.max(prediction)

                    # Only display high confidence predictions
                    if confidence > CONFIDENCE_THRESHOLD:
                        gesture = predicted_label.upper()
                    else:
                        gesture = "UNKNOWN"

                    # Display the prediction and confidence
                    text = f"Gesture: {gesture} ({confidence:.2f})"
                    (text_width, text_height), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)
                    cv2.rectangle(frame, (10, 10), (20 + text_width, 50 + text_height), (0, 255, 0), -1)
                    cv2.putText(frame, text, (20, 50),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
                else:
                    # Empty crop: the clamped bounding box has no area
                    cv2.rectangle(frame, (10, 10), (300, 70), (0, 0, 255), -1)
                    cv2.putText(frame, "Hand too close to edge", (20, 50),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2, cv2.LINE_AA)
        else:
            # Display "No hand detected" when no hand is in the frame
            cv2.rectangle(frame, (10, 10), (300, 70), (0, 0, 255), -1)
            cv2.putText(frame, "No hand detected", (20, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        cv2.imshow('Hand Gesture Recognition', frame)
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the window even if the loop raised or the
    # kernel was interrupted; otherwise the webcam stays locked.
    cap.release()
    cv2.destroyAllWindows()


