In [16]:
import cv2 as cv
import matplotlib.pyplot as plt
import os
import numpy as np
import mediapipe as mp
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, TimeDistributed, Bidirectional, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from datetime import datetime
from tqdm import tqdm

In [17]:
# Short aliases for the MediaPipe solution modules referenced by the helper
# functions in this notebook (holistic model, drawing helpers, pose model).
mp_holistic, mp_drawing, mp_pose = (
    mp.solutions.holistic,
    mp.solutions.drawing_utils,
    mp.solutions.pose,
)

In [18]:
def mediapipe_detection(image, model):
    """Run ``model.process`` on a frame and return the frame with the results.

    Args:
        image: Frame to analyse. It is converted with ``cv.COLOR_BGR2RGB``
            before processing, so it is expected to be in BGR channel order.
        model: Object exposing a ``process(image)`` method (the caller in
            this notebook passes a ``mp_holistic.Holistic`` instance).

    Returns:
        A ``(image, results)`` tuple: the frame converted back with
        ``cv.COLOR_RGB2BGR``, and whatever ``model.process`` returned.
    """
    rgb_frame = cv.cvtColor(image, cv.COLOR_BGR2RGB)

    # The frame is marked read-only only for the duration of the model call
    # (presumably so MediaPipe can avoid copying it — TODO confirm).
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    bgr_frame = cv.cvtColor(rgb_frame, cv.COLOR_RGB2BGR)
    return bgr_frame, results

In [19]:
def draw_styled_landmarks(image, results):
    """Draw the face mesh and both hand skeletons onto ``image``.

    Each landmark group is handed to ``mp_drawing.draw_landmarks`` together
    with its connection set and a pair of drawing specs (first spec for the
    landmark points, second for the connections).

    Args:
        image: Image passed to ``mp_drawing.draw_landmarks`` as the drawing
            target; the caller displays this same object afterwards.
        results: Object exposing ``face_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks`` (the caller in this notebook passes
            the value returned by ``mediapipe_detection``).

    Returns:
        None.
    """
    spec = mp_drawing.DrawingSpec

    # One row per landmark group:
    # (landmarks, connections, landmark style, connection style).
    layers = (
        (
            results.face_landmarks,
            mp_holistic.FACEMESH_TESSELATION,
            spec(color=(80, 110, 10), thickness=1, circle_radius=1),
            # Colour channels must stay within the 8-bit range 0-255.
            spec(color=(80, 255, 121), thickness=1, circle_radius=1),
        ),
        (
            results.left_hand_landmarks,
            mp_holistic.HAND_CONNECTIONS,
            spec(color=(121, 22, 76), thickness=2, circle_radius=4),
            spec(color=(121, 44, 250), thickness=2, circle_radius=2),
        ),
        (
            results.right_hand_landmarks,
            mp_holistic.HAND_CONNECTIONS,
            spec(color=(245, 117, 66), thickness=2, circle_radius=4),
            spec(color=(245, 66, 230), thickness=2, circle_radius=2),
        ),
    )

    for landmarks, connections, landmark_spec, connection_spec in layers:
        mp_drawing.draw_landmarks(image, landmarks, connections,
                                  landmark_spec, connection_spec)

In [20]:
def extract_keypoints(results):
    """Flatten face and hand landmarks into a single 1-D feature vector.

    The vector is laid out as ``[face, left hand, right hand]``. Each landmark
    contributes its ``x``, ``y`` and ``z`` attributes in that order. A group
    whose landmark attribute is falsy (e.g. ``None``) is replaced by zeros:
    468 * 3 values for the face and 21 * 3 values for each hand.

    Args:
        results: Object exposing ``face_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks``; each is either falsy or has a
            ``landmark`` iterable of points with ``x``/``y``/``z``.

    Returns:
        1-D ``numpy.ndarray`` holding the concatenated coordinates.
    """
    def _flatten(landmark_group, expected_points):
        # Missing group -> zero block of the expected size, so the layout of
        # the concatenated vector does not depend on what was detected.
        if not landmark_group:
            return np.zeros(expected_points * 3)
        coords = [[point.x, point.y, point.z] for point in landmark_group.landmark]
        return np.array(coords).flatten()

    parts = [
        _flatten(results.face_landmarks, 468),
        _flatten(results.left_hand_landmarks, 21),
        _flatten(results.right_hand_landmarks, 21),
    ]
    return np.concatenate(parts)


In [21]:
# Root folder of the sign-language dataset. The ASL_DATA_PATH environment
# variable overrides the default so the notebook can run on another machine
# without editing this cell; unset, the original location is used.
DATA_PATH = os.environ.get('ASL_DATA_PATH', '/home/smayan/Desktop/ASL/dataset/SL')

# Number of frames per sequence fed to the model.
sequence_length = 30

# Minimum number of sequences per class.
# NOTE(review): not referenced in the visible cells — presumably used when the
# dataset was built; confirm.
min_sequences_per_class = 10

In [22]:
# Vocabulary of sign labels. The position of a label in this list is its
# integer class index (see `label_map` below), so reordering or inserting
# entries changes every index after the edit point.
actions = [
    'a', 'about', 'again', 'all', 'also', 'always', 'and', 'angry', 'animal', 'answer', 
    'apple', 'ask', 'baby', 'bad', 'bathroom', 'beautiful', 'because', 'bed', 'before', 
    'big', 'book', 'boy', 'brother', 'but', 'buy', 'bye', 'call', 'can', 'car', 'cat', 
    'city', 'class', 'clean', 'clothes', 'cold', 'college', 'color', 'come', 'computer', 
    'cook', 'dad', 'day', 'deaf', 'different', 'doctor', 'dog', 'done', "don't want", 
    'down', 'drink', 'eat', 'eight', 'enough', 'family', 'fast', 'father', 'feel', 
    'find', 'fine', 'finish', 'first', 'five', 'food', 'for', 'four', 'friend', 'from', 
    'get', 'girl', 'give', 'go', 'good', 'goodbye', 'happy', 'hard', 'have', 
    'head', 'hearing', 'hello', 'help', 'her', 'here', 'home', 'hospital', 'hot', 
    'house', 'how', 'hungry', 'i', 'if', 'in', 'know', 'language', 'last', 'later', 
    'learn', 'letter', 'like', 'little bit', 'live', 'look at', 'love', 'make', 'man', 
    'many', 'me', 'meet', 'milk', 'mom', 'money', 'month', 'more', 'morning', 'mother', 
    'movie', 'music', 'my', 'name', 'need', 'never', 'new', 'nice', 'night', 'nine', 
    'no', 'not', 'now', 'old', 'on', 'one', 'open', 'orange', 'our', 'out', 'people', 
    'phone', 'play', 'please', 'put', 'question', 'read', 'ready', 'red', 'right', 'sad', 
    'same', 'say', 'school', 'see', 'seven', 'she', 'shirt', 'shoes', 'show', 'sick', 
    'sign', 'sign language', 'sister', 'sit', 'six', 'sleep', 'slow', 'small', 'sorry', 
    'stand', 'start', 'stop', 'store', 'story', 'student', 'study', 'talk', 'teach', 
    'teacher', 'tell', 'ten', 'thank you', 'that']
# NOTE(review): the labels below are commented out and therefore NOT part of
# `actions`; the list above ends at 'that'. Presumably these are vocabulary
# held back for a later run — confirm before re-enabling, since appending them
# changes the number of classes.
# 'the', 'their', 'they', 'thing', 
#     'think', 'thirsty', 'this', 'three', 'time', 'tired', 'to', 'today', 'tomorrow', 
#     'two', 'understand', 'up', 'use', 'wait', 'walk', 'want', 'water', 'way', 
#     'we', 'wear', 'week', 'what', 'when', 'where', 'which', 'white', 'who', 'why', 
#     'will', 'with', 'woman', 'word', 'work', 'world', 'write', 'wrong', 'year', 'yellow', 
#     'yes', 'yesterday', 'you', 'your'
# ]
# Map each label to its position in `actions` (label -> integer class index).
label_map = {label: num for num, label in enumerate(actions)}

In [23]:
# Persist the label -> index mapping to the current working directory.
# np.save stores a dict as a pickled object array, so it has to be read back
# with np.load(..., allow_pickle=True).item().
np.save(file='label_map.npy', arr=label_map)

In [24]:
# Show the label -> index mapping as the cell output for a visual check.
label_map

{'a': 0,
 'about': 1,
 'again': 2,
 'all': 3,
 'also': 4,
 'always': 5,
 'and': 6,
 'angry': 7,
 'animal': 8,
 'answer': 9,
 'apple': 10,
 'ask': 11,
 'baby': 12,
 'bad': 13,
 'bathroom': 14,
 'beautiful': 15,
 'because': 16,
 'bed': 17,
 'before': 18,
 'big': 19,
 'book': 20,
 'boy': 21,
 'brother': 22,
 'but': 23,
 'buy': 24,
 'bye': 25,
 'call': 26,
 'can': 27,
 'car': 28,
 'cat': 29,
 'city': 30,
 'class': 31,
 'clean': 32,
 'clothes': 33,
 'cold': 34,
 'college': 35,
 'color': 36,
 'come': 37,
 'computer': 38,
 'cook': 39,
 'dad': 40,
 'day': 41,
 'deaf': 42,
 'different': 43,
 'doctor': 44,
 'dog': 45,
 'done': 46,
 "don't want": 47,
 'down': 48,
 'drink': 49,
 'eat': 50,
 'eight': 51,
 'enough': 52,
 'family': 53,
 'fast': 54,
 'father': 55,
 'feel': 56,
 'find': 57,
 'fine': 58,
 'finish': 59,
 'first': 60,
 'five': 61,
 'food': 62,
 'for': 63,
 'four': 64,
 'friend': 65,
 'from': 66,
 'get': 67,
 'girl': 68,
 'give': 69,
 'go': 70,
 'good': 71,
 'goodbye': 72,
 'happy': 73,
 '

In [25]:
# Number of sign classes; the same value is passed as `num_classes` to
# `to_categorical` further down.
len(actions)

178

In [26]:
# Empty accumulators for keypoint sequences and their labels.
# NOTE(review): nothing in the visible cells appends to these (X and y are
# loaded from disk instead) — confirm whether they are still needed.
sequences = []
labels = []

In [27]:
# Load the saved feature array (X) and label array (y) from the external
# drive. Presumably X holds the pre-extracted keypoint sequences and y the
# class indices — confirm against the cell that produced these files.
X, y = (
    np.load('/media/smayan/500GB SSD/X.npy'),
    np.load('/media/smayan/500GB SSD/y.npy'),
)

In [28]:
# Append a trailing singleton axis to X: the first two axes are kept as they
# are, followed by the feature axis and a new axis of length 1.
num_features = X.shape[2]
X = X.reshape(*X.shape[:2], num_features, 1)

# One-hot encode the labels with one column per entry in `actions`
# (assumes y holds integer class indices below len(actions) — TODO confirm).
y_categorical = to_categorical(y, num_classes=len(actions))

# Hold out 20% of the samples for testing. The split is stratified on the raw
# labels and seeded so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_categorical,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Drop the trailing singleton axis from both splits again. The original note
# records the resulting training shape as (6314, 30, 1530).
X_train = np.squeeze(X_train, axis=-1)
X_test = np.squeeze(X_test, axis=-1)

In [31]:
# Load trained model and label map
model = tf.keras.models.load_model('/home/smayan/Desktop/ASL/main_wsl_model_20250819-203608.h5')
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only
# load label-map files that were produced by this project.
label_map = np.load('/home/smayan/Desktop/ASL/label_map.npy', allow_pickle=True).item()
# Order labels by their stored class index so that actions[i] names model
# output i even if the dict's key order ever differs from the index order.
actions = sorted(label_map, key=label_map.get)

# Variables for prediction
sequence = []          # rolling window of the most recent keypoint vectors
sequence_length = 30   # frames per model input
threshold = 0.7        # minimum confidence before the top label is shown
top_k = 5              # number of most probable classes drawn as bars

# Start webcam / video
video_source = 'ssvid.net---How-to-sign-I-m-the-doctor_1080p.mp4'
cap = cv.VideoCapture(video_source)
if not cap.isOpened():
    # Fail loudly instead of silently skipping the whole loop.
    cap.release()
    raise IOError(f'Could not open video source: {video_source}')

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Resize for consistency
            frame = cv.resize(frame, (640, 480))

            # Detection
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks
            draw_styled_landmarks(image, results)

            # Extract keypoints (same as training)
            keypoints = extract_keypoints(results)

            # Append to sequence, keeping only the newest `sequence_length` frames
            sequence.append(keypoints)
            sequence = sequence[-sequence_length:]

            if len(sequence) == sequence_length:
                input_seq = np.expand_dims(sequence, axis=0)   # shape (1, 30, 1530)

                # Predict
                res = model.predict(input_seq, verbose=0)[0]
                best_index = int(np.argmax(res))
                predicted_action = actions[best_index]
                confidence = float(res[best_index])

                # Show prediction if above threshold
                if confidence > threshold:
                    cv.putText(image, f'{predicted_action}: {confidence:.2f}',
                               (10, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

                # Show probabilities for the top_k most likely classes only.
                # One 30 px row per class for every class would run far past
                # the bottom of the 480 px frame and hide the relevant ones.
                top_indices = np.argsort(res)[::-1][:top_k]
                for row, class_index in enumerate(top_indices):
                    prob = float(res[class_index])
                    y_pos = 100 + row * 30
                    cv.rectangle(image, (10, y_pos), (int(prob * 300) + 10, y_pos + 25), (0, 255, 0), -1)
                    cv.putText(image, f'{actions[class_index]}: {prob:.2f}', (15, y_pos + 18),
                               cv.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)

            # Show output
            cv.imshow('ASL Inference', image)

            # Quit
            if cv.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the capture handle and close the window, including when the
    # loop is interrupted or a frame raises.
    cap.release()
    cv.destroyAllWindows()


I0000 00:00:1755701043.487363   36174 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1755701043.531717   43994 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.172.08), renderer: NVIDIA GeForce RTX 4070 SUPER/PCIe/SSE2
W0000 00:00:1755701043.573657   43974 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755701043.592137   43970 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755701043.593034   43975 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755701043.593055   43979 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000