Data Collection

Import Essential Libraries

In [None]:
import cv2
import os
import time
import numpy as np
import mediapipe as mp

Parameters

In [None]:
# Dataset-collection settings shared by the cells below.
DATASET_PATH = 'sign_language_dataset'  # Root directory for dataset
IMG_SIZE = 64  # Saved samples are IMG_SIZE x IMG_SIZE pixels
CAPTURE_DELAY = 0.5  # Time between captures
# One class label per uppercase letter, "A" through "Z"
LABELS = [chr(code) for code in range(ord("A"), ord("Z") + 1)]

Set Up Mediapipe

In [None]:
# Shortcuts to the MediaPipe drawing helpers and the hand-tracking solution.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# Hand tracker configured for a video stream (static_image_mode=False),
# reporting at most one hand with a 0.5 minimum detection confidence.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.5,
)

Dataset Collection

In [None]:
# Interactive dataset collection: shows the webcam feed, detects a hand inside
# a fixed region of interest (ROI), renders its landmarks on a white canvas and
# saves that canvas as a grayscale IMG_SIZE x IMG_SIZE PNG under
# DATASET_PATH/<letter>/ while capture is active for that letter.
MAX_IMAGES_PER_CLASS = 100  # Capture stops once a class folder holds this many files

# Make sure every class has an output folder before capturing starts
for label in LABELS:
    os.makedirs(os.path.join(DATASET_PATH, label), exist_ok=True)

# Initialize webcam
cap = cv2.VideoCapture(0)
print("Press a letter key (A-Z) to start capturing images for that class.")
print("Press ESC to exit.")

last_capture_time = time.time()
current_label = None
capturing = False
count = 0

# try/finally guarantees the camera and the preview window are released even
# if something inside the loop raises (otherwise the device stays locked).
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_width = frame.shape[1]

        # Define Region of Interest (ROI): a 200x200 box near the right edge
        x1, y1 = frame_width - 300, 100
        x2, y2 = frame_width - 100, 300
        roi = frame[y1:y2, x1:x2]
        # Converted copy is taken BEFORE the rectangle is drawn, so the
        # overlay never leaks into the image handed to the detector.
        rgb_roi = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)

        # Draw ROI on frame
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Detect hand landmarks
        results = hands.process(rgb_roi)
        white_bg = np.ones_like(roi, dtype=np.uint8) * 255

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    white_bg,
                    hand_landmarks,
                    mp_hands.HAND_CONNECTIONS
                )

            # Capture and save image if under the per-class limit
            if (
                capturing
                and (time.time() - last_capture_time > CAPTURE_DELAY)
                and count < MAX_IMAGES_PER_CLASS
            ):
                gray = cv2.cvtColor(white_bg, cv2.COLOR_BGR2GRAY)
                resized = cv2.resize(gray, (IMG_SIZE, IMG_SIZE))
                save_path = os.path.join(DATASET_PATH, current_label, f"{current_label}_{count}.png")
                cv2.imwrite(save_path, resized)
                print(f"[INFO] Saved: {save_path}")
                count += 1
                last_capture_time = time.time()

        # Stop capturing once the limit is reached. Guarded by `capturing` so
        # the message is printed once instead of on every subsequent frame.
        if capturing and count >= MAX_IMAGES_PER_CLASS:
            print(f"[INFO] Reached {MAX_IMAGES_PER_CLASS} images for '{current_label}'. Stopping capture.")
            capturing = False

        # Display current label
        if current_label:
            cv2.putText(frame, f"Label: {current_label}", (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        cv2.imshow("Dataset Collection", frame)

        key = cv2.waitKey(1) & 0xFF

        if key == 27:  # ESC key to exit
            break

        # Start/continue capturing when a letter is pressed. waitKey reports
        # the lowercase code unless Shift/Caps Lock is active, so both cases
        # are accepted and normalised to the uppercase class name.
        pressed = chr(key).upper()
        if pressed in LABELS:
            current_label = pressed
            capturing = True
            # Resume numbering after whatever is already in the class folder
            count = len(os.listdir(os.path.join(DATASET_PATH, current_label)))
            print(f"[INFO] Started capturing for '{current_label}'... Existing: {count}")
finally:
    cap.release()
    cv2.destroyAllWindows()

Model Training

Import Libraries

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical

Load Dataset

In [None]:
DATASET_DIR = 'sign_language_dataset'  # Path to your dataset
IMG_SIZE = 64
EPOCHS = 10

# Load images and labels: every sub-directory of DATASET_DIR is one class and
# every readable image inside it becomes one grayscale IMG_SIZE x IMG_SIZE sample.
images = []
labels = []

# os.listdir returns entries in arbitrary order; sorting keeps the sample
# order (and therefore the seeded train/test split) reproducible across runs.
for label in sorted(os.listdir(DATASET_DIR)):
    label_dir = os.path.join(DATASET_DIR, label)
    if not os.path.isdir(label_dir):
        continue
    for img_file in sorted(os.listdir(label_dir)):
        img_path = os.path.join(label_dir, img_file)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread returns None for files it cannot decode; skip them
            continue
        images.append(cv2.resize(img, (IMG_SIZE, IMG_SIZE)))
        labels.append(label)

# Fail early with a clear message instead of an obscure reshape/split error later
if not images:
    raise ValueError(f"No readable images found under '{DATASET_DIR}'. Run the data collection step first.")

images = np.array(images)
labels = np.array(labels)

Preprocessing and Train/Test Split

In [None]:
# Normalize and reshape
images = images / 255.0
images = images.reshape(-1, IMG_SIZE, IMG_SIZE, 1)

# Encode labels
lb = LabelBinarizer()
labels_encoded = lb.fit_transform(labels)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(images, labels_encoded, test_size=0.2, random_state=42)

Build, Train, and Save Model

In [None]:

import pickle

from tensorflow.keras.layers import Input

# Build CNN model: two conv/pool stages followed by a dense classifier with
# one softmax output per class known to the label binarizer.
# The input shape is declared with an explicit Input layer; passing
# `input_shape=` to the first layer of a Sequential model makes Keras emit a
# UserWarning.
model = Sequential([
    Input(shape=(IMG_SIZE, IMG_SIZE, 1)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(lb.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train. The held-out split is used as validation data, so the reported
# val_accuracy is the only generalization estimate this notebook produces.
model.fit(X_train, y_train, epochs=EPOCHS, validation_data=(X_test, y_test))

# Save model and label binarizer (file names are read back by the prediction cell)
model.save('sign_model.h5')
with open('label_binarizer.pkl', 'wb') as f:
    pickle.dump(lb, f)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 40ms/step - accuracy: 0.0661 - loss: 3.2905 - val_accuracy: 0.4596 - val_loss: 2.5887
Epoch 2/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.4433 - loss: 2.0905 - val_accuracy: 0.9385 - val_loss: 0.4391
Epoch 3/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.8210 - loss: 0.6741 - val_accuracy: 0.9769 - val_loss: 0.1345
Epoch 4/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.8996 - loss: 0.3554 - val_accuracy: 0.9846 - val_loss: 0.1004
Epoch 5/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.9345 - loss: 0.2478 - val_accuracy: 0.9885 - val_loss: 0.0564
Epoch 6/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.9461 - loss: 0.1840 - val_accuracy: 0.9904 - val_loss: 0.0581
Epoch 7/10
[1m65/65[0m [32m━━━━



Test Prediction

In [17]:
import cv2
import numpy as np
import mediapipe as mp
from tensorflow.keras.models import load_model
import pickle
import time

# Live prediction: a hand in the right box is rendered as landmarks on a white
# canvas and classified by the trained CNN; while collecting, a letter is
# appended to the current word at a fixed interval as long as a hand is also
# present in the left box.
IMG_SIZE = 64
LETTER_INTERVAL = 2  # Seconds between appended letters while collecting

# Load model and label binarizer
model = load_model('sign_model.h5')
# SECURITY: pickle.load can execute arbitrary code. Only load a
# label_binarizer.pkl that was written by this notebook's training step.
with open('label_binarizer.pkl', 'rb') as f:
    lb = pickle.load(f)

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
# With static_image_mode=False a Hands instance treats successive inputs as
# frames of one video stream and tracks the hand between them. Feeding the
# left and right crops alternately into a single instance breaks that
# assumption, so each box gets its own tracker.
hands_left = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
hands_right = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)

# Open webcam
cap = cv2.VideoCapture(0)

# Word collection state
collecting = False
collected_word = ""
stored_words = []
last_prediction = ""
last_time = time.time()

print("Press 's' to START collecting letters.")
print("Press 'e' to END and show the word.")
print("Press 'q' to QUIT.")

# try/finally releases the camera, window and trackers even if the loop raises
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_width = frame.shape[1]

        # Define two ROIs:
        # Left box (hand must be present here)
        left_x1, left_y1 = 50, 100
        left_x2, left_y2 = 250, 300
        roi_left = frame[left_y1:left_y2, left_x1:left_x2]
        rgb_left = cv2.cvtColor(roi_left, cv2.COLOR_BGR2RGB)

        # Right box (detect sign here)
        right_x1, right_y1 = frame_width - 300, 100
        right_x2, right_y2 = frame_width - 100, 300
        roi_right = frame[right_y1:right_y2, right_x1:right_x2]
        rgb_right = cv2.cvtColor(roi_right, cv2.COLOR_BGR2RGB)

        # Detect hands in left box
        results_left = hands_left.process(rgb_left)
        hand_in_left = bool(results_left.multi_hand_landmarks)

        # Detect hands in right box
        results_right = hands_right.process(rgb_right)
        hand_in_right = bool(results_right.multi_hand_landmarks)

        label = "No Hand"

        # Predict only if hand present in right box (but only collect if hands in both)
        if hand_in_right:
            # Draw landmarks on a white background for prediction. The call
            # uses the same arguments as the dataset-collection cell so the
            # model sees the same rendering it was trained on.
            white_bg = np.ones((roi_right.shape[0], roi_right.shape[1], 3), dtype=np.uint8) * 255
            for hand_landmarks in results_right.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    white_bg,
                    hand_landmarks,
                    mp_hands.HAND_CONNECTIONS
                )

            gray = cv2.cvtColor(white_bg, cv2.COLOR_BGR2GRAY)
            resized = cv2.resize(gray, (IMG_SIZE, IMG_SIZE))
            normalized = resized / 255.0
            reshaped = normalized.reshape(1, IMG_SIZE, IMG_SIZE, 1)

            pred = model.predict(reshaped, verbose=0)
            label = lb.classes_[np.argmax(pred)]
            last_prediction = label

        current_time = time.time()

        # Collect letter ONLY if collecting mode AND hands present in both boxes
        if collecting and hand_in_left and hand_in_right:
            # Blue boxes indicate active capture. The colour is chosen here
            # and drawn once below, so it is not painted over by the
            # green/red status rectangles.
            color_left = color_right = (255, 0, 0)
            cv2.putText(frame, f'Hand Visible: {label}', (left_x1, left_y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)

            # Append letter every LETTER_INTERVAL seconds
            if current_time - last_time > LETTER_INTERVAL:
                collected_word += last_prediction
                last_time = current_time
                print(f"[INFO] Added letter: {last_prediction}")
        else:
            # Green if hand detected, red otherwise
            color_left = (0, 255, 0) if hand_in_left else (0, 0, 255)
            color_right = (0, 255, 0) if hand_in_right else (0, 0, 255)

        # Draw ROIs on frame
        cv2.rectangle(frame, (left_x1, left_y1), (left_x2, left_y2), color_left, 2)
        cv2.rectangle(frame, (right_x1, right_y1), (right_x2, right_y2), color_right, 2)

        cv2.putText(frame, f'Prediction: {label}', (right_x1, right_y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Show collected words
        cv2.putText(frame, f'Collected: {collected_word}', (10, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
        cv2.putText(frame, f'Words: {" | ".join(stored_words)}', (10, 100),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

        cv2.imshow('Sign Language Recognition', frame)

        # Handle keys
        key = cv2.waitKey(1) & 0xFF
        if key == ord('s'):
            collected_word = ""
            collecting = True
            # Restart the interval so the first letter is not appended the
            # instant collection begins.
            last_time = time.time()
            print("[INFO] Started collecting letters...")
        elif key == ord('e'):
            # Store the word only when ending an active collection; pressing
            # 'e' again must not append the same word a second time.
            if collecting and collected_word:
                stored_words.append(collected_word)
            collecting = False
            print("[INFO] Collection stopped.")
            print(f"Formed Words: {' | '.join(stored_words)}")
        elif key == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()
    hands_left.close()
    hands_right.close()




Press 's' to START collecting letters.
Press 'e' to END and show the word.
Press 'q' to QUIT.
[INFO] Started collecting letters...
[INFO] Added letter: H
[INFO] Added letter: I
[INFO] Added letter: M
