In [3]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import mediapipe as mp

In [4]:
# configuration
model_path = "asl_cnn_model_new.h5"  # path to the saved model
data_direct   = "images"
image_size   = (64, 64) # resize image         
dominant_hand = "Right" # can be changed (but need to train the new model)

In [5]:
# load the trained model
model = load_model(model_path)

# Get the label map from my data folders
datagen = ImageDataGenerator(rescale=1./255)
tmp_gen = datagen.flow_from_directory(
    data_direct,
    target_size=image_size,
    batch_size=1,
    class_mode="categorical",
    shuffle=False
)
idx_to_label = {v:k for k,v in tmp_gen.class_indices.items()}

# Initiaise a MediaPipe for hand detection like cropping the hand 
mp_hands = mp.solutions.hands
hands_detector = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7
)



Found 280 images belonging to 14 classes.


In [6]:
# Starting a webcam
cap = cv2.VideoCapture(0)

print("Press ESC to quit.")
while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame = cv2.flip(frame, 1)  
    h, w = frame.shape[:2]

    # Detect hand landmarks
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands_detector.process(rgb)

    if results.multi_hand_landmarks and results.multi_handedness:
        # let one hand = index 0
        hand_landmarks = results.multi_hand_landmarks[0]
        hand_label = results.multi_handedness[0].classification[0].label

        if hand_label == dominant_hand:
            # Compute around landmarks -- like a box
            xs = [int(p.x * w) for p in hand_landmarks.landmark]
            ys = [int(p.y * h) for p in hand_landmarks.landmark]
            M = 20  # pixel margin
            x1, x2 = max(min(xs) - M, 0), min(max(xs) + M, w)
            y1, y2 = max(min(ys) - M, 0), min(max(ys) + M, h)

            # Crop & preprocess for model
            hand_crop = frame[y1:y2, x1:x2]
            hand_resized = cv2.resize(hand_crop, image_size)
            x = hand_resized.astype("float32") / 255.0
            x = np.expand_dims(x, axis=0)  # shape (1, H, W, 3)

            # Prediction
            probs = model.predict(x)[0]
            pred_idx = np.argmax(probs)
            pred_label = idx_to_label[pred_idx]
            confidence = probs[pred_idx]

            # Write the prediction
            cv2.rectangle(frame, (x1,y1), (x2,y2), (0,255,0), 2)
            text = f"{pred_label} ({confidence*100:.1f}%)"
            cv2.putText(frame, text, (x1, y1-10),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)

    # Show the webcam frame
    cv2.imshow("ASL Real-Time Recognition", frame)
    if cv2.waitKey(1) & 0xFF == 27:  # ESC to stop working
        break
cap.release()
cv2.destroyAllWindows()

Press ESC to quit.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3