In [2]:
import cv2 as cv
import matplotlib.pyplot as plt
import os
import numpy as np
import mediapipe as mp
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, TimeDistributed, Bidirectional, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from datetime import datetime
from tqdm import tqdm

2025-08-23 11:19:37.229424: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-23 11:19:37.236261: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755928177.244292  159375 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755928177.246821  159375 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755928177.253076  159375 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [3]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

In [4]:
def mediapipe_detection(image, model):
    image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv.cvtColor(image, cv.COLOR_RGB2BGR)
    return image, results

In [5]:
def draw_styled_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             )
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

In [6]:
def extract_keypoints(results):
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([ lh, rh])


In [7]:
DATA_PATH = '/home/smayan/Desktop/ASL/dataset/SL'
sequence_length = 30
min_sequences_per_class = 10

In [8]:
# actions = [
#     'a', 'about', 'again', 'all', 'also', 'always', 'and', 'angry', 'animal', 'answer', 
#     'apple', 'ask', 'baby', 'bad', 'bathroom', 'beautiful', 'because', 'bed', 'before', 
#     'big', 'book', 'boy', 'brother', 'but', 'buy', 'bye', 'call', 'can', 'car', 'cat', 
#     'city', 'class', 'clean', 'clothes', 'cold', 'college', 'color', 'come', 'computer', 
#     'cook', 'dad', 'day', 'deaf', 'different', 'doctor', 'dog', 'done', "don't want", 
#     'down', 'drink', 'eat', 'eight', 'enough', 'family', 'fast', 'father', 'feel', 
#     'find', 'fine', 'finish', 'first', 'five', 'food', 'for', 'four', 'friend', 'from', 
#     'get', 'girl', 'give', 'go', 'good', 'goodbye', 'happy', 'hard', 'have', 
#     'head', 'hearing', 'hello', 'help', 'her', 'here', 'home', 'hospital', 'hot', 
#     'house', 'how', 'hungry', 'i', 'if', 'in', 'know', 'language', 'last', 'later', 
#     'learn', 'letter', 'like', 'little bit', 'live', 'look at', 'love', 'make', 'man', 
#     'many', 'me', 'meet', 'milk', 'mom', 'money', 'month', 'more', 'morning', 'mother', 
#     'movie', 'music', 'my', 'name', 'need', 'never', 'new', 'nice', 'night', 'nine', 
#     'no', 'not', 'now', 'old', 'on', 'one', 'open', 'orange', 'our', 'out', 'people', 
#     'phone', 'play', 'please', 'put', 'question', 'read', 'ready', 'red', 'right', 'sad', 
#     'same', 'say', 'school', 'see', 'seven', 'she', 'shirt', 'shoes', 'show', 'sick', 
#     'sign', 'sign language', 'sister', 'sit', 'six', 'sleep', 'slow', 'small', 'sorry', 
#     'stand', 'start', 'stop', 'store', 'story', 'student', 'study', 'talk', 'teach', 
#     'teacher', 'tell', 'ten', 'thank you', 'that']
# # 'the', 'their', 'they', 'thing', 
# #     'think', 'thirsty', 'this', 'three', 'time', 'tired', 'to', 'today', 'tomorrow', 
# #     'two', 'understand', 'up', 'use', 'wait', 'walk', 'want', 'water', 'way', 
# #     'we', 'wear', 'week', 'what', 'when', 'where', 'which', 'white', 'who', 'why', 
# #     'will', 'with', 'woman', 'word', 'work', 'world', 'write', 'wrong', 'year', 'yellow', 
# #     'yes', 'yesterday', 'you', 'your'
# # ]
# label_map = {label: num for num, label in enumerate(actions)}

In [9]:
actions = ['hello', 'student','i','bye','goodbye','college','wrong','how', 'work', 'your', 'want', 'nice', 'to', 'meet', 'doctor', 'time', 'age', 'breakfast', 'sorry', 'love']
label_map = {label: num for num, label in enumerate(actions)}


In [10]:
len(actions)

20

In [11]:
sequences, labels = [], []

In [12]:
# cap = cv.VideoCapture('/home/smayan/Desktop/ASL/dataset/SL/wood/63723.mp4')
# with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
#     while cap.isOpened():

#         ret, frame = cap.read()
#         if not ret:
#             print("Video ended or cannot read the frame.")
#             break

#         image, results = mediapipe_detection(frame, holistic)
            

#         draw_styled_landmarks(image, results)

#         cv.imshow('OpenCV feed', image)
#         if cv.waitKey(10) & 0xFF == ord('q'):
#             break

# cap.release()
# cv.destroyAllWindows()

In [13]:
X = np.load('/media/smayan/500GB SSD/X_augment_min.npy')
y = np.load('/media/smayan/500GB SSD/y_augment_min.npy')

In [14]:
X.shape

(1642, 30, 126)

In [15]:
y.shape

(1642,)

In [16]:
num_features = X.shape[2]
X = X.reshape(X.shape[0], X.shape[1], num_features, 1)

y_categorical = to_categorical(y, num_classes=len(actions))

X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y
)

In [17]:
np.random.seed(42)
perm = np.random.permutation(len(X_train))
X_train = X_train[perm]
y_train = y_train[perm]

In [18]:
X_train = X_train.squeeze(-1)
X_test  = X_test.squeeze(-1)

In [19]:
X.shape

(1642, 30, 126, 1)

In [20]:
X_train.shape

(1313, 30, 126)

In [21]:
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(enumerate(class_weights))
print(f"Class weights computed: {class_weight_dict}")

Class weights computed: {0: np.float64(2.0525), 1: np.float64(0.8552083333333333), 2: np.float64(1.642), 3: np.float64(1.3683333333333334), 4: np.float64(1.3241935483870968), 5: np.float64(0.9122222222222223), 6: np.float64(1.02625), 7: np.float64(0.9122222222222223), 8: np.float64(0.6729508196721311), 9: np.float64(1.1402777777777777), 10: np.float64(0.7894230769230769), 11: np.float64(1.3683333333333334), 12: np.float64(1.2828125), 13: np.float64(0.6957627118644067), 14: np.float64(0.6126865671641791), 15: np.float64(0.9773809523809524), 16: np.float64(0.9773809523809524), 17: np.float64(1.2828125), 18: np.float64(0.9546511627906977), 19: np.float64(1.001219512195122)}


In [None]:
model = Sequential()

# model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(sequence_length, X.shape[2])))
# model.add(BatchNormalization())
# model.add(MaxPooling1D(pool_size=2))
# model.add(Dropout(0.2))

model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(sequence_length, X.shape[2])))
model.add(Dropout(0.1))
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.2))


model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(actions), activation='softmax'))

I0000 00:00:1755928187.151936  159375 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9001 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070 SUPER, pci bus id: 0000:01:00.0, compute capability: 8.9
  super().__init__(**kwargs)


In [23]:
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f'logs/wsl_model_{timestamp}'

In [24]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [25]:
model.summary()

In [26]:
X_train.shape

(1313, 30, 126)

In [26]:
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

MONITOR_METRIC = 'val_accuracy'
MONITOR_MODE = 'max' 

log_dir = "/home/smayan/Desktop/ASL/logs"


model_checkpoint_callback = ModelCheckpoint(
    filepath='best_asl_model.keras',
    save_weights_only=False,        
    monitor=MONITOR_METRIC,
    mode=MONITOR_MODE,
    save_best_only=True,            
    verbose=1
)

early_stopping_callback = EarlyStopping(
    monitor=MONITOR_METRIC,
    patience=20,  
    verbose=1,
    mode=MONITOR_MODE,
    restore_best_weights=True 
)

reduce_lr_callback = ReduceLROnPlateau(
    monitor=MONITOR_METRIC,
    factor=0.2,
    patience=10,
    min_lr=1e-7,
    verbose=1,
    mode=MONITOR_MODE
)


callbacks = [
    TensorBoard(log_dir=log_dir, histogram_freq=1),
    model_checkpoint_callback, # <-- ADDED
    early_stopping_callback,   # <-- MODIFIED
    reduce_lr_callback       # <-- MODIFIED
]

In [27]:
history = model.fit(
    X_train, y_train,
    epochs=150,
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
    class_weight=class_weight_dict,
    verbose=1
)

Epoch 1/150


I0000 00:00:1755928072.422998  144089 cuda_dnn.cc:529] Loaded cuDNN version 91100


[1m36/42[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 6ms/step - accuracy: 0.0859 - loss: 2.9145
Epoch 1: val_accuracy improved from -inf to 0.15502, saving model to best_asl_model.keras
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.0902 - loss: 2.8950 - val_accuracy: 0.1550 - val_loss: 2.8314 - learning_rate: 0.0010
Epoch 2/150
[1m40/42[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.1769 - loss: 2.5645
Epoch 2: val_accuracy improved from 0.15502 to 0.17021, saving model to best_asl_model.keras
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.1779 - loss: 2.5618 - val_accuracy: 0.1702 - val_loss: 2.6007 - learning_rate: 0.0010
Epoch 3/150
[1m41/42[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.2304 - loss: 2.3121
Epoch 3: val_accuracy improved from 0.17021 to 0.27660, saving model to best_asl_model.keras
[1m42/42[0m [32m━━━━━━━━━━━━━

In [28]:
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy*100:.2f}%")


Test Loss: 0.3939
Test Accuracy: 85.41%


In [29]:
model.save(f'augment_main_less_actions_main_wsl_model_{timestamp}.h5')
print(f"\nModel saved as wsl_model_{timestamp}.h5")




Model saved as wsl_model_20250823-111750.h5


In [27]:
# Load trained model and label map
model = tf.keras.models.load_model('/home/smayan/Desktop/ASL/no face/augment_main_less_actions_main_wsl_model_20250823-111459.h5')
# label_map = np.load('/home/smayan/Desktop/ASL/no face/label_map.npy', allow_pickle=True).item()
actions = list(label_map.keys())

# Variables for prediction
sequence = []
sequence_length = 30
threshold = 0.7

# Start webcam / video
cap = cv.VideoCapture('/home/smayan/Desktop/ASL/ssvid.net--breakfast-in-ASL_1080pFHR.mp4')

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Resize for consistency
        frame = cv.resize(frame, (640, 480))

        # Detection
        image, results = mediapipe_detection(frame, holistic)

        # Draw landmarks
        draw_styled_landmarks(image, results)

        # Extract keypoints (same as training)
        keypoints = extract_keypoints(results)

        # Append to sequence
        sequence.append(keypoints)
        sequence = sequence[-sequence_length:]

        if len(sequence) == sequence_length:
            input_seq = np.expand_dims(sequence, axis=0)   # shape (1, 30, 1530)

            # Predict
            res = model.predict(input_seq, verbose=0)[0]
            print("res shape:", res.shape)
            print("argmax:", np.argmax(res))

            predicted_action = actions[np.argmax(res)]
            confidence = np.max(res)

            # Show prediction if above threshold
            if confidence > threshold:
                cv.putText(image, f'{predicted_action}: {confidence:.2f}',
                           (10, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Show probabilities
            for i, (action, prob) in enumerate(zip(actions, res)):
                y_pos = 100 + i * 30
                cv.rectangle(image, (10, y_pos), (int(prob * 300) + 10, y_pos + 25), (0, 255, 0), -1)
                cv.putText(image, f'{action}: {prob:.2f}', (15, y_pos + 18),
                           cv.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)

        # Show output
        cv.imshow('ASL Inference', image)

        # Quit
        if cv.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv.destroyAllWindows()
    

I0000 00:00:1755928194.016961  159375 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1755928194.056495  159787 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.172.08), renderer: NVIDIA GeForce RTX 4070 SUPER/PCIe/SSE2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1755928194.103512  159763 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755928194.120772  159766 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755928194.122894  159770 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755928194.123650  159779 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature 

res shape: (20,)
argmax: 14
res shape: (20,)
argmax: 14
res shape: (20,)
argmax: 14
res shape: (20,)
argmax: 14
res shape: (20,)
argmax: 14
res shape: (20,)
argmax: 14
res shape: (20,)
argmax: 14
res shape: (20,)
argmax: 14
res shape: (20,)
argmax: 14
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 18
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 17
res shape: (20,)
argmax: 17
res shape: (20,)
argmax: 17
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 1
res shape: (20,)
argmax: 17
res shape: (20,)
argmax: 17
res shape: (20,)
argmax: 17
res shape: (20,)
argmax: 17
res shape: 

In [31]:
print("Model output shape:", model.output_shape)  # e.g. (None, 10)
print("Number of actions:", len(actions))
print("Actions:", actions)


Model output shape: (None, 20)
Number of actions: 20
Actions: ['hello', 'student', 'i', 'bye', 'goodbye', 'college', 'wrong', 'how', 'work', 'your', 'want', 'nice', 'to', 'meet', 'doctor', 'time', 'age', 'breakfast', 'sorry', 'love']
