Task-04: Develop a hand gesture recognition model that can accurately identify and classify different hand gestures from image or video data, enabling intuitive human-computer interaction and gesture-based control systems.


Dataset Preparation

In [1]:
import os
import shutil
import random

# Source dataset
source_dir = r"C:\Users\Shagufta Umme\Desktop\Hand Gesture\leapGestRecog"

# Target smaller dataset
target_dir = r"C:\Users\Shagufta Umme\Desktop\Hand Gesture\leapGestRecog_small"

# Limit per gesture
num_samples_per_class = 200  

# Correct mapping (from your dataset)
gesture_map = {
    "01_palm": "Palm",
    "02_l": "L",
    "03_fist": "Fist",
    "04_fist_moved": "FistMoved",
    "05_thumb": "Thumb",
    "06_index": "Index",
    "07_ok": "OK",
    "08_palm_moved": "PalmMoved",
    "09_c": "C",
    "10_down": "Down"
}

os.makedirs(target_dir, exist_ok=True)

# Dictionary to collect all images of each gesture
gesture_images = {g: [] for g in gesture_map.keys()}

# Step 1: Gather all image paths across users
for user_folder in os.listdir(source_dir):  # loop through 00–09 users
    user_path = os.path.join(source_dir, user_folder)
    if not os.path.isdir(user_path):
        continue
    
    for gesture_folder in os.listdir(user_path):  # loop gesture subfolders
        if gesture_folder not in gesture_map:
            continue
        
        gesture_path = os.path.join(user_path, gesture_folder)
        if not os.path.isdir(gesture_path):
            continue
        
        all_imgs = [os.path.join(gesture_path, f) for f in os.listdir(gesture_path) if f.endswith('.png')]
        gesture_images[gesture_folder].extend(all_imgs)

# Step 2: For each gesture, sample only 200 images (total, across all users)
for gesture_folder, img_list in gesture_images.items():
    gesture_name = gesture_map[gesture_folder]
    target_class_dir = os.path.join(target_dir, gesture_name)
    os.makedirs(target_class_dir, exist_ok=True)

    selected_files = random.sample(img_list, min(num_samples_per_class, len(img_list)))

    for idx, src in enumerate(selected_files):
        dst = os.path.join(target_class_dir, f"{gesture_folder}_{idx:04d}.png")
        shutil.copy(src, dst)

print("🎉 Smaller dataset created with 200 images per gesture at:", target_dir)


🎉 Smaller dataset created with 200 images per gesture at: C:\Users\Shagufta Umme\Desktop\Hand Gesture\leapGestRecog_small


Model Training & Real-Time Prediction

In [4]:
# 1. Import Libraries
import os
import cv2
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# 2. Dataset Path
data_dir = r"C:\Users\Shagufta Umme\Desktop\Hand Gesture\leapGestRecog_small"

# 3. Data Preprocessing
IMG_SIZE = (64, 64)
BATCH_SIZE = 32

datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

train_gen = datagen.flow_from_directory(
    data_dir,
    target_size=IMG_SIZE,
    color_mode="grayscale",
    batch_size=BATCH_SIZE,
    class_mode="categorical",
    subset="training",
    shuffle=True
)

val_gen = datagen.flow_from_directory(
    data_dir,
    target_size=IMG_SIZE,
    color_mode="grayscale",
    batch_size=BATCH_SIZE,
    class_mode="categorical",
    subset="validation",
    shuffle=False
)

labels = list(train_gen.class_indices.keys())
print("Labels:", labels)

# 4. CNN Model
model = Sequential([
    Conv2D(32, (3,3), activation='relu', input_shape=(64,64,1)),
    MaxPooling2D((2,2)),

    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D((2,2)),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(labels), activation='softmax')
])

model.compile(optimizer=Adam(0.001), loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

# 5. Training
history = model.fit(
    train_gen,
    epochs=10,
    validation_data=val_gen
)

# Save model
model.save("gesture_cnn_model.h5")

# 6. Prediction on Single Image
def predict_image(img_path):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, IMG_SIZE)
    img = img.astype("float32") / 255.0
    img = np.expand_dims(img, axis=-1)
    img = np.expand_dims(img, axis=0)

    prediction = model.predict(img)
    class_id = np.argmax(prediction)
    confidence = prediction[0][class_id]
    return labels[class_id], confidence

# Example usage
test_img = r"C:\Users\Shagufta Umme\Desktop\Hand Gesture\leapGestRecog_small\Fist\03_fist_0010.png"
gesture, conf = predict_image(test_img)
print(f"Predicted: {gesture} ({conf*100:.2f}% confidence)")

# 7. Real-Time Webcam Prediction (Hand Only)
def real_time_prediction_with_box(model_path="gesture_cnn_model.h5", box_size=200):
    model = load_model(model_path)
    cap = cv2.VideoCapture(0)

    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Error: Could not read frame.")
                break

            h, w, _ = frame.shape
            x1 = w//2 - box_size//2
            y1 = h//2 - box_size//2
            x2 = w//2 + box_size//2
            y2 = h//2 + box_size//2

            # Crop ROI (hand only)
            roi = frame[y1:y2, x1:x2]
            if roi.size == 0:
                continue

            # Preprocess for prediction
            gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
            img = cv2.resize(gray, IMG_SIZE)
            img = img.astype("float32") / 255.0
            img = np.expand_dims(img, axis=-1)
            img = np.expand_dims(img, axis=0)

            # Predict gesture
            prediction = model.predict(img, verbose=0)
            class_id = np.argmax(prediction)
            confidence = prediction[0][class_id]
            label = labels[class_id]

            # Draw bounding box on ROI
            cv2.rectangle(roi, (0, 0), (box_size-1, box_size-1), (0, 255, 0), 2)
            cv2.putText(roi, f"{label} ({confidence*100:.1f}%)", (5, 25),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

            # Display only the hand ROI (resized for better view)
            display_roi = cv2.resize(roi, (300, 300))
            cv2.imshow("Gesture Recognition (Hand Only)", display_roi)

            # Quit on 'q'
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    finally:
        cap.release()
        cv2.destroyAllWindows()
        print("Webcam released and windows closed.")

# To run real-time hand gesture recognition
real_time_prediction_with_box()


Found 1600 images belonging to 10 classes.
Found 400 images belonging to 10 classes.
Labels: ['C', 'Down', 'Fist', 'FistMoved', 'Index', 'L', 'OK', 'Palm', 'PalmMoved', 'Thumb']


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10


  self._warn_if_super_not_called()


[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 88ms/step - accuracy: 0.2640 - loss: 2.0888 - val_accuracy: 0.7900 - val_loss: 0.9073
Epoch 2/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 72ms/step - accuracy: 0.6945 - loss: 0.9227 - val_accuracy: 0.8950 - val_loss: 0.3983
Epoch 3/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 77ms/step - accuracy: 0.8871 - loss: 0.4083 - val_accuracy: 0.9575 - val_loss: 0.1973
Epoch 4/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 76ms/step - accuracy: 0.9273 - loss: 0.2502 - val_accuracy: 0.9775 - val_loss: 0.1239
Epoch 5/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 76ms/step - accuracy: 0.9504 - loss: 0.1634 - val_accuracy: 0.9800 - val_loss: 0.1151
Epoch 6/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 71ms/step - accuracy: 0.9763 - loss: 0.0903 - val_accuracy: 0.9825 - val_loss: 0.1004
Epoch 7/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step




Predicted: Fist (99.54% confidence)
Webcam released and windows closed.


Model Evaluation & Metrics

In [3]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import os

# 1. Validation Set Metrics
val_gen.reset()  # Make sure validation generator starts from the first batch
loss, val_accuracy = model.evaluate(val_gen)
print(f"Validation Accuracy: {val_accuracy*100:.2f}%\n")

# Predict on validation set
predictions = model.predict(val_gen, verbose=0)
y_pred = np.argmax(predictions, axis=1)
y_true = val_gen.classes

# Class labels
class_labels = list(val_gen.class_indices.keys())

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix (Validation Set):\n", cm)

# Classification report
report = classification_report(y_true, y_pred, target_names=class_labels)
print("\nClassification Report (Validation Set):\n", report)

# ------------------------------
# 2. Test Set Accuracy (Optional)
# ------------------------------
test_dir = r"C:\Users\Shagufta Umme\Desktop\Hand Gesture\TestSet"  # Update path if needed
if os.path.exists(test_dir):
    correct = 0
    total = 0
    for gesture in os.listdir(test_dir):
        gesture_dir = os.path.join(test_dir, gesture)
        for img_name in os.listdir(gesture_dir):
            img_path = os.path.join(gesture_dir, img_name)
            pred_label, _ = predict_image(img_path)
            if pred_label == gesture:
                correct += 1
            total += 1
    test_accuracy = correct / total if total > 0 else 0
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")
else:
    print("\nNo test set folder found. Skipping test accuracy calculation.")


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - accuracy: 0.9911 - loss: 0.0845
Validation Accuracy: 98.75%

Confusion Matrix (Validation Set):
 [[40  0  0  0  0  0  0  0  0  0]
 [ 0 40  0  0  0  0  0  0  0  0]
 [ 0  0 40  0  0  0  0  0  0  0]
 [ 1  0  0 39  0  0  0  0  0  0]
 [ 0  0  0  0 38  2  0  0  0  0]
 [ 0  0  0  0  0 39  0  1  0  0]
 [ 0  0  0  0  0  0 40  0  0  0]
 [ 0  0  0  0  0  0  0 40  0  0]
 [ 0  0  0  1  0  0  0  0 39  0]
 [ 0  0  0  0  0  0  0  0  0 40]]

Classification Report (Validation Set):
               precision    recall  f1-score   support

           C       0.98      1.00      0.99        40
        Down       1.00      1.00      1.00        40
        Fist       1.00      1.00      1.00        40
   FistMoved       0.97      0.97      0.97        40
       Index       1.00      0.95      0.97        40
           L       0.95      0.97      0.96        40
          OK       1.00      1.00      1.00        40
        Palm       0.