In [2]:
import cv2 as cv
import matplotlib.pyplot as plt

import numpy as np
import mediapipe as mp
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, TimeDistributed, Bidirectional, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from datetime import datetime
from tqdm import tqdm

In [3]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

In [4]:
def mediapipe_detection(image, model):
    image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv.cvtColor(image, cv.COLOR_RGB2BGR)
    return image, results

In [5]:
def draw_styled_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             )
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

In [6]:
def extract_keypoints(results):
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([ lh, rh])


In [7]:
DATA_PATH = '/home/smayan/Desktop/ASL/dataset/SL'
sequence_length = 30
min_sequences_per_class = 10

In [8]:
# actions = [
#     'a', 'about', 'again', 'all', 'also', 'always', 'and', 'angry', 'animal', 'answer', 
#     'apple', 'ask', 'baby', 'bad', 'bathroom', 'beautiful', 'because', 'bed', 'before', 
#     'big', 'book', 'boy', 'brother', 'but', 'buy', 'bye', 'call', 'can', 'car', 'cat', 
#     'city', 'class', 'clean', 'clothes', 'cold', 'college', 'color', 'come', 'computer', 
#     'cook', 'dad', 'day', 'deaf', 'different', 'doctor', 'dog', 'done', "don't want", 
#     'down', 'drink', 'eat', 'eight', 'enough', 'family', 'fast', 'father', 'feel', 
#     'find', 'fine', 'finish', 'first', 'five', 'food', 'for', 'four', 'friend', 'from', 
#     'get', 'girl', 'give', 'go', 'good', 'goodbye', 'happy', 'hard', 'have', 
#     'head', 'hearing', 'hello', 'help', 'her', 'here', 'home', 'hospital', 'hot', 
#     'house', 'how', 'hungry', 'i', 'if', 'in', 'know', 'language', 'last', 'later', 
#     'learn', 'letter', 'like', 'little bit', 'live', 'look at', 'love', 'make', 'man', 
#     'many', 'me', 'meet', 'milk', 'mom', 'money', 'month', 'more', 'morning', 'mother', 
#     'movie', 'music', 'my', 'name', 'need', 'never', 'new', 'nice', 'night', 'nine', 
#     'no', 'not', 'now', 'old', 'on', 'one', 'open', 'orange', 'our', 'out', 'people', 
#     'phone', 'play', 'please', 'put', 'question', 'read', 'ready', 'red', 'right', 'sad', 
#     'same', 'say', 'school', 'see', 'seven', 'she', 'shirt', 'shoes', 'show', 'sick', 
#     'sign', 'sign language', 'sister', 'sit', 'six', 'sleep', 'slow', 'small', 'sorry', 
#     'stand', 'start', 'stop', 'store', 'story', 'student', 'study', 'talk', 'teach', 
#     'teacher', 'tell', 'ten', 'thank you', 'that']
# # 'the', 'their', 'they', 'thing', 
# #     'think', 'thirsty', 'this', 'three', 'time', 'tired', 'to', 'today', 'tomorrow', 
# #     'two', 'understand', 'up', 'use', 'wait', 'walk', 'want', 'water', 'way', 
# #     'we', 'wear', 'week', 'what', 'when', 'where', 'which', 'white', 'who', 'why', 
# #     'will', 'with', 'woman', 'word', 'work', 'world', 'write', 'wrong', 'year', 'yellow', 
# #     'yes', 'yesterday', 'you', 'your'
# # ]
# label_map = {label: num for num, label in enumerate(actions)}

In [9]:
# actions = ['hello', 'student','i','bye','goodbye','college','wrong','how', 'work', 'your', 'want', 'nice', 'to', 'meet', 'doctor', 'time', 'age', 'breakfast', 'sorry', 'love']
actions = ["Hello","Help","How are you?", "See you later"]
label_map = {label: num for num, label in enumerate(actions)}


In [10]:
len(actions)

4

In [11]:
sequences, labels = [], []

In [12]:
# cap = cv.VideoCapture('/home/smayan/Desktop/ASL/dataset/SL/wood/63723.mp4')
# with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
#     while cap.isOpened():

#         ret, frame = cap.read()
#         if not ret:
#             print("Video ended or cannot read the frame.")
#             break

#         image, results = mediapipe_detection(frame, holistic)
            

#         draw_styled_landmarks(image, results)

#         cv.imshow('OpenCV feed', image)
#         if cv.waitKey(10) & 0xFF == ord('q'):
#             break

# cap.release()
# cv.destroyAllWindows()

In [13]:
X = np.load('/home/smayan/Desktop/ASL/dataset/custom/X.npy')
y = np.load('/home/smayan/Desktop/ASL/dataset/custom/y.npy')

In [14]:
X.shape

(120, 30, 126)

In [15]:
y.shape

(120,)

In [16]:
num_features = X.shape[2]
X = X.reshape(X.shape[0], X.shape[1], num_features, 1)

y_categorical = to_categorical(y, num_classes=len(actions))

X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y
)

In [17]:
np.random.seed(42)
perm = np.random.permutation(len(X_train))
X_train = X_train[perm]
y_train = y_train[perm]

In [18]:
X_train = X_train.squeeze(-1)
X_test  = X_test.squeeze(-1)

In [19]:
X.shape

(120, 30, 126, 1)

In [20]:
X_train.shape

(96, 30, 126)

In [21]:
y.shape

(120,)

In [22]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [23]:
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(enumerate(class_weights))
print(f"Class weights computed: {class_weight_dict}")

Class weights computed: {0: np.float64(1.0), 1: np.float64(1.0), 2: np.float64(1.0), 3: np.float64(1.0)}


In [45]:
model = Sequential()

# model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(sequence_length, X.shape[2])))
# model.add(BatchNormalization())
# model.add(MaxPooling1D(pool_size=2))
# model.add(Dropout(0.2))

model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(sequence_length, X.shape[2])))
model.add(Dropout(0.1))
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.2))


model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(actions), activation='softmax'))

2025-08-26 21:10:17.419093: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
  super().__init__(**kwargs)


In [46]:
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f'logs/wsl_model_{timestamp}'

In [47]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [48]:
model.summary()

In [49]:
X_train.shape

(96, 30, 126)

In [50]:
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

MONITOR_METRIC = 'val_accuracy'
MONITOR_MODE = 'max' 

log_dir = "/home/smayan/Desktop/ASL/logs"


model_checkpoint_callback = ModelCheckpoint(
    filepath='best_asl_model.keras',
    save_weights_only=False,        
    monitor=MONITOR_METRIC,
    mode=MONITOR_MODE,
    save_best_only=True,            
    verbose=1
)

early_stopping_callback = EarlyStopping(
    monitor=MONITOR_METRIC,
    patience=20,  
    verbose=1,
    mode=MONITOR_MODE,
    restore_best_weights=True 
)

reduce_lr_callback = ReduceLROnPlateau(
    monitor=MONITOR_METRIC,
    factor=0.2,
    patience=10,
    min_lr=1e-7,
    verbose=1,
    mode=MONITOR_MODE
)


callbacks = [
    TensorBoard(log_dir=log_dir, histogram_freq=1),
    model_checkpoint_callback, # <-- ADDED
    early_stopping_callback,   # <-- MODIFIED
    reduce_lr_callback       # <-- MODIFIED
]

In [51]:
history = model.fit(
    X_train, y_train,
    epochs=150,
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
    class_weight=class_weight_dict,
    verbose=1
)

Epoch 1/150
[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m4s[0m 2s/step - accuracy: 0.1875 - loss: 1.4558
Epoch 1: val_accuracy improved from -inf to 0.25000, saving model to best_asl_model.keras
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 291ms/step - accuracy: 0.3268 - loss: 1.3531 - val_accuracy: 0.2500 - val_loss: 1.2883 - learning_rate: 0.0010
Epoch 2/150
[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 28ms/step - accuracy: 0.5625 - loss: 0.9813
Epoch 2: val_accuracy improved from 0.25000 to 0.29167, saving model to best_asl_model.keras
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step - accuracy: 0.6367 - loss: 0.9068 - val_accuracy: 0.2917 - val_loss: 1.3008 - learning_rate: 0.0010
Epoch 3/150
[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 31ms/step - accuracy: 0.8125 - loss: 0.6394
Epoch 3: val_accuracy did not improve from 0.29167
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/st

In [52]:
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy*100:.2f}%")


Test Loss: 0.2752
Test Accuracy: 100.00%


In [53]:
model.save(f'augment_main_less_actions_main_wsl_model_{timestamp}.h5')
print(f"\nModel saved as wsl_model_{timestamp}.h5")




Model saved as wsl_model_20250826-211017.h5


In [24]:
# Load trained model and label map
model = tf.keras.models.load_model('/home/smayan/Desktop/ASL/no face/augment_main_less_actions_main_wsl_model_20250826-211017.h5')
# label_map = np.load('/home/smayan/Desktop/ASL/no face/label_map.npy', allow_pickle=True).item()
actions = list(label_map.keys())

# Variables for prediction
sequence = []
sequence_length = 30
threshold = 0.7

# Start webcam / video
cap = cv.VideoCapture('/home/smayan/Desktop/ASL/ssvid.net--breakfast-in-ASL_1080pFHR.mp4')

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Resize for consistency
        frame = cv.resize(frame, (640, 480))

        # Detection
        image, results = mediapipe_detection(frame, holistic)

        # Draw landmarks
        draw_styled_landmarks(image, results)

        # Extract keypoints (same as training)
        keypoints = extract_keypoints(results)

        # Append to sequence
        sequence.append(keypoints)
        sequence = sequence[-sequence_length:]

        if len(sequence) == sequence_length:
            input_seq = np.expand_dims(sequence, axis=0)   # shape (1, 30, 1530)

            # Predict
            res = model.predict(input_seq, verbose=0)[0]
            print("res shape:", res.shape)
            print("argmax:", np.argmax(res))

            predicted_action = actions[np.argmax(res)]
            confidence = np.max(res)

            # Show prediction if above threshold
            if confidence > threshold:
                cv.putText(image, f'{predicted_action}: {confidence:.2f}',
                           (10, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Show probabilities
            for i, (action, prob) in enumerate(zip(actions, res)):
                y_pos = 100 + i * 30
                cv.rectangle(image, (10, y_pos), (int(prob * 300) + 10, y_pos + 25), (0, 255, 0), -1)
                cv.putText(image, f'{action}: {prob:.2f}', (15, y_pos + 18),
                           cv.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)

        # Show output
        cv.imshow('ASL Inference', image)

        # Quit
        if cv.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv.destroyAllWindows()
    

2025-08-26 21:11:31.988770: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
I0000 00:00:1756222892.178231   31821 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1756222892.187844   32196 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.1), renderer: llvmpipe (LLVM 19.1.1, 256 bits)
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1756222892.231070   32112 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1756222892.256641   32136 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1756222892.258246   32120 inference_feedback_manager.cc:114] Feedback manager

res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1
res shape: (4,)
argmax: 1


In [None]:
print("Model output shape:", model.output_shape)  # e.g. (None, 10)
print("Number of actions:", len(actions))
print("Actions:", actions)


Model output shape: (None, 20)
Number of actions: 20
Actions: ['hello', 'student', 'i', 'bye', 'goodbye', 'college', 'wrong', 'how', 'work', 'your', 'want', 'nice', 'to', 'meet', 'doctor', 'time', 'age', 'breakfast', 'sorry', 'love']
