In [1]:
import sys
!{sys.executable} -m pip install opencv-python mediapipe tensorflow --quiet

import tensorflow as tf
import cv2
import mediapipe as mp
import numpy as np
import os

print(f"Python Version: {sys.version}")
print(f"TensorFlow Version: {tf.__version__}")

# Verify GPU availability
if tf.test.gpu_device_name():
    print(f"Using GPU: {tf.test.gpu_device_name()}")
else:
    print("GPU not found. Falling back to CPU.")


Python Version: 3.10.15 | packaged by conda-forge | (main, Oct 16 2024, 01:15:49) [MSC v.1941 64 bit (AMD64)]
TensorFlow Version: 2.18.0
GPU not found. Falling back to CPU.


In [2]:
# Path to the saved model
saved_model_path = 'asl_classification_model.keras'

# Load the saved model
try:
    if os.path.exists(saved_model_path):
        model = tf.keras.models.load_model(saved_model_path, compile=False)
        print(f"Model loaded successfully from: {saved_model_path}")
    else:
        raise FileNotFoundError(f"Saved model not found at {saved_model_path}")
except Exception as e:
    print(f"Error loading model: {e}")
    raise

# Initialize class labels
class_labels = [
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'nothing', 'space', 'del'
]
print("Class labels initialized successfully.")


Model loaded successfully from: asl_classification_model.keras
Class labels initialized successfully.


In [3]:
# Initialize Mediapipe Hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)

# Function to preprocess the hand region
def preprocess_hand_region(image, hand_bbox):
    """
    Crops and preprocesses the hand region for model prediction.
    Args:
        image (numpy array): Original image.
        hand_bbox (tuple): Bounding box around the hand (x_min, y_min, x_max, y_max).
    Returns:
        numpy array: Preprocessed image for model prediction.
    """
    x_min, y_min, x_max, y_max = hand_bbox

    # Handle edge cases for bounding box dimensions
    h, w, _ = image.shape
    x_min, y_min = max(0, x_min), max(0, y_min)
    x_max, y_max = min(w, x_max), min(h, y_max)

    # Crop the hand region
    hand_region = image[y_min:y_max, x_min:x_max]

    # Ensure the region is not empty
    if hand_region.size == 0:
        print("Empty hand region detected. Skipping preprocessing.")
        return None

    # Resize, normalize, and expand dimensions
    hand_region = cv2.resize(hand_region, (64, 64))  # Resize to match model input
    hand_region = hand_region / 255.0  # Normalize pixel values
    hand_region = np.expand_dims(hand_region, axis=0)  # Add batch dimension
    return hand_region

print("Mediapipe and preprocessing setup complete.")


Mediapipe and preprocessing setup complete.


In [4]:
def get_hand_bbox(hand_landmarks, image_width, image_height):
    """
    Computes the bounding box for the detected hand based on landmarks.
    Args:
        hand_landmarks: Mediapipe hand landmarks.
        image_width: Width of the original image.
        image_height: Height of the original image.
    Returns:
        tuple: Bounding box coordinates (x_min, y_min, x_max, y_max).
    """
    x_min = int(min([lm.x for lm in hand_landmarks.landmark]) * image_width)
    y_min = int(min([lm.y for lm in hand_landmarks.landmark]) * image_height)
    x_max = int(max([lm.x for lm in hand_landmarks.landmark]) * image_width)
    y_max = int(max([lm.y for lm in hand_landmarks.landmark]) * image_height)
    return max(0, x_min), max(0, y_min), min(image_width, x_max), min(image_height, y_max)

# Open webcam
cap = cv2.VideoCapture(0)  # 0 for the default webcam

if not cap.isOpened():
    raise RuntimeError("Error: Cannot access webcam.")

print("Using webcam for ASL gesture detection...")

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Failed to capture frame. Exiting...")
            break

        # Convert the frame to RGB
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image_height, image_width, _ = frame.shape

        # Process the frame with Mediapipe Hands
        results = hands.process(frame_rgb)

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                # Draw hand landmarks on the frame
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                # Get the bounding box around the hand
                hand_bbox = get_hand_bbox(hand_landmarks, image_width, image_height)

                # Preprocess the hand region for the model
                hand_image = preprocess_hand_region(frame, hand_bbox)

                if hand_image is not None:
                    # Predict the ASL gesture
                    prediction = model.predict(hand_image)
                    predicted_class = class_labels[np.argmax(prediction)]
                    confidence = np.max(prediction)

                    # Display the prediction on the frame
                    cv2.rectangle(frame, (hand_bbox[0], hand_bbox[1]), (hand_bbox[2], hand_bbox[3]), (255, 0, 0), 2)
                    cv2.putText(
                        frame,
                        f"{predicted_class} ({confidence:.2f})",
                        (hand_bbox[0], hand_bbox[1] - 10),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.8,
                        (0, 255, 0),
                        2
                    )

        # Display the processed frame
        cv2.imshow('ASL Gesture Detection', frame)

        # Break the loop on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            print("Exiting...")
            break

finally:
    cap.release()
    cv2.destroyAllWindows()
    print("Webcam processing complete.")


Using webcam for ASL gesture detection...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━