# Real-time ASL gesture predictor (Webcam)

This notebook loads the top models (MIL + 1-2 deeper models) and runs a simple real-time predictor using your webcam. It uses MediaPipe Hands to extract landmarks, maintains a sliding buffer of frames to form a window, and computes predictions from the deeper model and MIL model, then averages their probabilities for an ensemble prediction.

Notes:
- This is a demo notebook for quick local testing inside a Python environment with OpenCV and MediaPipe installed.
- The MIL model requires a bag of windows; to keep latency low in this demo, the bag is built by repeating the most recent window until it reaches the required bag size (you can replace this with a sliding-bag buffer if you prefer).
- Adjust the `MIL_MODEL`, `DEEPER_MODEL`, and `DEEPER_MODEL_2` path variables (and the matching `*_LABELS` paths) in the first code cell to pick the exact top models you want to use.

In [2]:
# Imports and model loading
import cv2
import numpy as np
import time
import json
import tensorflow as tf
from collections import deque
import sys, os
sys.path.insert(0, os.getcwd())
# Choose top models (MIL + 1-2 deeper models). Update paths if needed.
# All paths are relative, so they resolve against the notebook's working directory.
MIL_MODEL = '../Models/gesture_wlasl_mil_finetuned_mil.keras'
DEEPER_MODEL = '../Models/gesture_wlasl_deeper_encinit_long.keras'
# Optional second deeper model (ensemble of 3) - set to None to skip
DEEPER_MODEL_2 = '../Models/gesture_wlasl_deeper_encinit_long.bilstm.keras'
MIL_LABELS = '../Models/gesture_wlasl_mil_finetuned_mil_labels.json'
DEEPER_LABELS = '../Models/gesture_wlasl_deeper_encinit_long_labels.json'
DEEPER2_LABELS = '../Models/gesture_wlasl_deeper_encinit_long_labels.json'


def _load_labels(path):
    """Parse the label-map JSON file at `path` and return the decoded object.

    The file is opened as UTF-8 explicitly so that the result does not depend
    on the platform's default text encoding.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)


# load labels (later cells iterate these with `.items()` as name -> index maps)
mil_labels = _load_labels(MIL_LABELS)
deeper_labels = _load_labels(DEEPER_LABELS)
# deeper2 uses same label file here; adjust if different
deeper2_labels = _load_labels(DEEPER2_LABELS)

# Best-effort loader: yields the full model, or None so a later cell can fall back to loading weights.
def load_model_try(path):
    """Try to load a complete Keras model from `path`.

    Returns the loaded model on success. If loading raises any exception, the
    error is printed (so it stays visible in the cell output) and None is
    returned instead.
    """
    model = None
    try:
        model = tf.keras.models.load_model(path)
    except Exception as err:
        print('Full model load failed for', path, '->', err)
    return model

# Load order is deeper -> (optional) deeper2 -> MIL; any failure messages are printed in that order.
deeper = load_model_try(DEEPER_MODEL)
# The second deeper model is optional: a falsy DEEPER_MODEL_2 skips the load entirely.
deeper2 = load_model_try(DEEPER_MODEL_2) if DEEPER_MODEL_2 else None
mil = load_model_try(MIL_MODEL)

# A failed MIL load leaves `mil` as None here; the setup cell below then rebuilds the
# MIL architecture (once the input shapes are known) and loads weights into it.
loaded_flags = (bool(deeper), bool(deeper2), bool(mil))
print('Model load status: deeper=', loaded_flags[0], 'deeper2=', loaded_flags[1], 'mil=', loaded_flags[2])

Full model load failed for ../Models/gesture_wlasl_mil_finetuned_mil.keras -> The `{arg_name}` of this `Lambda` layer is a Python lambda. Deserializing it is unsafe. If you trust the source of the config artifact, you can override this error by passing `safe_mode=False` to `from_config()`, or calling `keras.config.enable_unsafe_deserialization().
Model load status: deeper= True deeper2= True mil= False


In [3]:
# MediaPipe helper and preprocessing utilities
import mediapipe as mp

mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils

# Shared hand tracker used by `extract_landmarks`: non-static image mode, at most
# one hand, and 0.5 thresholds for both detection and tracking confidence.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

def extract_landmarks(img):
    """Return the first detected hand's landmarks as a flat float32 vector.

    `img` must be a BGR image as produced by cv2. It is converted to RGB and
    passed to the module-level MediaPipe `hands` tracker. The result holds the
    x, y, z values of every landmark, concatenated in landmark order, or None
    when the tracker reports no hand.
    """
    detection = hands.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    if not detection.multi_hand_landmarks:
        return None
    first_hand = detection.multi_hand_landmarks[0]
    coords = [
        value
        for point in first_hand.landmark
        for value in (point.x, point.y, point.z)
    ]
    return np.array(coords, dtype=np.float32)

def make_window_from_buffer(buf, window_len, feat_dim):
    """Stack the newest buffered frames into a (frames, feat_dim) float32 array.

    Parameters
    ----------
    buf : iterable of per-frame feature vectors (e.g. a deque), most recent
        at the end. All vectors are expected to have the same length.
    window_len : number of trailing frames to take. Values <= 0 select no
        frames.
    feat_dim : required number of feature columns. Narrower frames are
        right-padded with zeros; wider frames are truncated.

    Returns
    -------
    np.ndarray of dtype float32 and shape (k, feat_dim), where
    k = min(len(buf), window_len). When no frames are selected the result has
    shape (0, feat_dim) rather than raising IndexError.

    The time axis is never padded: callers that need exactly `window_len`
    rows must wait until the buffer holds that many frames.
    """
    # `seq[-0:]` is the whole sequence, so a non-positive length needs its own branch.
    frames = list(buf)[-window_len:] if window_len > 0 else []
    if not frames:
        return np.zeros((0, feat_dim), dtype=np.float32)

    arr = np.array(frames, dtype=np.float32)
    missing = feat_dim - arr.shape[1]
    if missing > 0:
        # pad the feature axis with zeros up to feat_dim
        pad = np.zeros((arr.shape[0], missing), dtype=np.float32)
        arr = np.concatenate([arr, pad], axis=1)
    elif missing < 0:
        # drop surplus trailing features
        arr = arr[:, :feat_dim]
    return arr

I0000 00:00:1757805593.049939 7184508 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M4 Pro


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1757805593.064632 7186325 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1757805593.069567 7186334 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [None]:
# Setup shapes & possibly rebuild MIL if needed
# Window length and feature dim come from the deeper model's input shape when it
# loaded; otherwise (or if the shape cannot be read as ints) default to 16x126.
default_window = 16
default_feat = 126
win, feat = default_window, default_feat
if deeper is not None:
    try:
        win, feat = int(deeper.input_shape[1]), int(deeper.input_shape[2])
    except Exception:
        win, feat = default_window, default_feat

# bag size we'll use for MIL inference (must match MIL training bag size for best behavior)
bag_size = 32
# embedding width produced by the encoder and consumed by the attention pooling below
embed_dim = 128

# If MIL model failed to load as full model, attempt to reconstruct and load weights using known shapes.
# NOTE(review): the load error printed by Keras says the saved Lambda layer could instead be
# deserialized with `safe_mode=False`; only do that for model files you trust.
if mil is None:
    print('Rebuilding MIL architecture and loading weights...')
    # `Code` is imported as a package. The model paths point at `../Models`, so the
    # package presumably lives in the parent of the working directory - TODO confirm.
    # Make both locations importable.
    for candidate_dir in (os.getcwd(), os.path.abspath(os.path.join(os.getcwd(), os.pardir))):
        if candidate_dir not in sys.path:
            sys.path.insert(0, candidate_dir)
    from Code.train_tf_mil import build_encoder

    encoder = build_encoder((win, feat), embed_dim=embed_dim)
    bag_input = tf.keras.layers.Input(shape=(bag_size, win, feat), dtype=tf.float32)
    # encode every window of the bag independently
    td = tf.keras.layers.TimeDistributed(encoder)(bag_input)
    # one attention score per window, normalised over the bag axis
    att_dense = tf.keras.layers.Dense(1)(td)
    att = tf.keras.layers.Softmax(axis=1)(att_dense)
    # attention-weighted sum of the window embeddings
    pooled = tf.keras.layers.Lambda(lambda x: tf.matmul(x[0], x[1], transpose_a=True), name='pool')([td, att])
    pooled = tf.keras.layers.Reshape((embed_dim,))(pooled)
    out = tf.keras.layers.Dense(128, activation='relu')(pooled)
    out = tf.keras.layers.Dense(len(mil_labels), activation='softmax')(out)
    mil = tf.keras.Model(bag_input, out)

    # try to load weights file (either .keras or .weights.h5)
    weights_source = None
    for candidate in (MIL_MODEL, MIL_MODEL + '.weights.h5'):
        if not os.path.exists(candidate):
            continue
        try:
            mil.load_weights(candidate)
        except Exception as err:
            print('Could not load MIL weights from', candidate, '->', err)
        else:
            weights_source = candidate
            break
    if weights_source is None:
        # Do not fail silently: without weights the model is randomly initialised.
        print('WARNING: no MIL weights could be loaded; the rebuilt MIL model is untrained '
              'and its predictions are meaningless.')
    else:
        print('Loaded MIL weights from', weights_source)
print('Using window_len=', win, 'feat=', feat, 'bag_size=', bag_size)

Rebuilding MIL architecture and loading weights...


ModuleNotFoundError: No module named 'Code'

In [None]:
# Real-time loop: capture webcam, extract landmarks, buffer windows, run deeper+MIL ensemble and display overlay
# Import the module (not the `time` function) so the name `time` keeps referring
# to the module imported in the first cell.
import time


def _label_index_map(src_labels, dst_labels):
    """Return (src_idx, dst_idx) index arrays for label names present in both maps.

    Both arguments are name -> index mappings; names missing from `dst_labels`
    are skipped.
    """
    pairs = [(int(si), int(dst_labels[name]))
             for name, si in src_labels.items() if name in dst_labels]
    src_idx = np.array([p[0] for p in pairs], dtype=np.intp)
    dst_idx = np.array([p[1] for p in pairs], dtype=np.intp)
    return src_idx, dst_idx


def _project_probs(probs, src_idx, dst_idx, n_out):
    """Scatter `probs[src_idx]` into a zero vector of length `n_out` at `dst_idx`.

    Source indices beyond the end of `probs` are ignored.
    """
    projected = np.zeros(n_out, dtype=float)
    valid = src_idx < len(probs)
    projected[dst_idx[valid]] = probs[src_idx[valid]]
    return projected


def _window_compatible(model, window_len, feat_dim):
    """Return False only if the model declares an input shape that contradicts (window_len, feat_dim)."""
    try:
        declared = tuple(model.input_shape)[1:]
    except Exception:
        return True  # shape unknown; let Keras validate at predict time
    if len(declared) != 2:
        return False
    return all(d is None or d == e for d, e in zip(declared, (window_len, feat_dim)))


# ---- loop-invariant setup -------------------------------------------------
n_classes = len(mil_labels)
# index -> name lookup in the MIL label space (first name wins on duplicate indices)
mil_index_to_name = {}
for name, idx in mil_labels.items():
    mil_index_to_name.setdefault(int(idx), name)

# sequence ("deeper") models taking a (1, win, feat) batch, with their label projections
sequence_models = []
if deeper is not None:
    sequence_models.append((deeper, _label_index_map(deeper_labels, mil_labels)))
if deeper2 is not None:
    if _window_compatible(deeper2, win, feat):
        sequence_models.append((deeper2, _label_index_map(deeper2_labels, mil_labels)))
    else:
        print('Skipping deeper2: its input shape does not match window', (win, feat))
if mil is None and not sequence_models:
    raise RuntimeError('No model is loaded; cannot run the real-time predictor')

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError('Cannot open webcam')

frame_buffer = deque(maxlen=win)  # store last `win` feature vectors
fps = 0.0
fps_count = 0
fps_t0 = time.time()
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # small mirror for usability
        frame = cv2.flip(frame, 1)
        feats = extract_landmarks(frame)
        if feats is None:
            # no detection in this frame: do not predict from stale buffered frames
            label_text = 'No hand'
        else:
            frame_buffer.append(feats)
            if len(frame_buffer) < win:
                label_text = f'Buffering {len(frame_buffer)}/{win}'
            else:
                window = make_window_from_buffer(frame_buffer, win, feat)
                batch = window[np.newaxis, ...]  # shape (1, win, feat)
                # one probability vector per available model, all in MIL label space
                votes = [
                    _project_probs(model.predict(batch, verbose=0)[0], src_idx, dst_idx, n_classes)
                    for model, (src_idx, dst_idx) in sequence_models
                ]
                if mil is not None:
                    # build a bag by repeating current window to bag_size for demo
                    bag = np.repeat(batch, bag_size, axis=0)
                    mil_input = bag[np.newaxis, ...]  # shape (1, bag_size, win, feat)
                    votes.append(mil.predict(mil_input, verbose=0)[0])
                avg_prob = sum(votes) / len(votes)
                pred_idx = int(np.argmax(avg_prob))
                label_text = mil_index_to_name.get(pred_idx, 'unknown')
        # overlay text
        cv2.putText(frame, f'Pred: {label_text}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 0), 2)
        # fps: refreshed roughly once per second and drawn under the prediction
        fps_count += 1
        elapsed = time.time() - fps_t0
        if elapsed >= 1.0:
            fps = fps_count / elapsed
            fps_t0 = time.time()
            fps_count = 0
        cv2.putText(frame, f'FPS: {fps:.1f}', (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
        cv2.imshow('ASL Realtime', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # always free the camera and close the preview window, even on error
    cap.release()
    cv2.destroyAllWindows()