In [1]:
import cv2 as cv
import matplotlib.pyplot as plt
import os
import numpy as np
import mediapipe as mp
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, TimeDistributed, Bidirectional, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from datetime import datetime
from tqdm import tqdm

2025-08-22 19:35:34.376744: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-22 19:35:34.383636: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755871534.391899  296838 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755871534.394286  296838 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755871534.401029  296838 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

In [3]:
def mediapipe_detection(image, model):
    image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv.cvtColor(image, cv.COLOR_RGB2BGR)
    return image, results

In [4]:
def draw_styled_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             )
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

In [5]:
def extract_keypoints(results):
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([ lh, rh])


In [6]:
DATA_PATH = '/home/smayan/Desktop/ASL/dataset/SL'
sequence_length = 30
min_sequences_per_class = 10

In [7]:
# actions = [
#     'a', 'about', 'again', 'all', 'also', 'always', 'and', 'angry', 'animal', 'answer', 
#     'apple', 'ask', 'baby', 'bad', 'bathroom', 'beautiful', 'because', 'bed', 'before', 
#     'big', 'book', 'boy', 'brother', 'but', 'buy', 'bye', 'call', 'can', 'car', 'cat', 
#     'city', 'class', 'clean', 'clothes', 'cold', 'college', 'color', 'come', 'computer', 
#     'cook', 'dad', 'day', 'deaf', 'different', 'doctor', 'dog', 'done', "don't want", 
#     'down', 'drink', 'eat', 'eight', 'enough', 'family', 'fast', 'father', 'feel', 
#     'find', 'fine', 'finish', 'first', 'five', 'food', 'for', 'four', 'friend', 'from', 
#     'get', 'girl', 'give', 'go', 'good', 'goodbye', 'happy', 'hard', 'have', 
#     'head', 'hearing', 'hello', 'help', 'her', 'here', 'home', 'hospital', 'hot', 
#     'house', 'how', 'hungry', 'i', 'if', 'in', 'know', 'language', 'last', 'later', 
#     'learn', 'letter', 'like', 'little bit', 'live', 'look at', 'love', 'make', 'man', 
#     'many', 'me', 'meet', 'milk', 'mom', 'money', 'month', 'more', 'morning', 'mother', 
#     'movie', 'music', 'my', 'name', 'need', 'never', 'new', 'nice', 'night', 'nine', 
#     'no', 'not', 'now', 'old', 'on', 'one', 'open', 'orange', 'our', 'out', 'people', 
#     'phone', 'play', 'please', 'put', 'question', 'read', 'ready', 'red', 'right', 'sad', 
#     'same', 'say', 'school', 'see', 'seven', 'she', 'shirt', 'shoes', 'show', 'sick', 
#     'sign', 'sign language', 'sister', 'sit', 'six', 'sleep', 'slow', 'small', 'sorry', 
#     'stand', 'start', 'stop', 'store', 'story', 'student', 'study', 'talk', 'teach', 
#     'teacher', 'tell', 'ten', 'thank you', 'that']
# # 'the', 'their', 'they', 'thing', 
# #     'think', 'thirsty', 'this', 'three', 'time', 'tired', 'to', 'today', 'tomorrow', 
# #     'two', 'understand', 'up', 'use', 'wait', 'walk', 'want', 'water', 'way', 
# #     'we', 'wear', 'week', 'what', 'when', 'where', 'which', 'white', 'who', 'why', 
# #     'will', 'with', 'woman', 'word', 'work', 'world', 'write', 'wrong', 'year', 'yellow', 
# #     'yes', 'yesterday', 'you', 'your'
# # ]
# label_map = {label: num for num, label in enumerate(actions)}

In [8]:
actions = ['hello', 'student','i','bye','goodbye','college','wrong','how', 'work', 'your', 'want', 'nice', 'to', 'meet', 'doctor', 'time', 'age', 'breakfast']
label_map = {label: num for num, label in enumerate(actions)}


In [9]:
label_map = {action: i for i, action in enumerate(actions)}
np.save('label_map.npy', label_map)


In [10]:
len(actions)

18

In [11]:
sequences, labels = [], []

In [12]:
# cap = cv.VideoCapture('/home/smayan/Desktop/ASL/dataset/SL/wood/63723.mp4')
# with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
#     while cap.isOpened():

#         ret, frame = cap.read()
#         if not ret:
#             print("Video ended or cannot read the frame.")
#             break

#         image, results = mediapipe_detection(frame, holistic)
            

#         draw_styled_landmarks(image, results)

#         cv.imshow('OpenCV feed', image)
#         if cv.waitKey(10) & 0xFF == ord('q'):
#             break

# cap.release()
# cv.destroyAllWindows()

In [13]:
X = np.load('/media/smayan/500GB SSD/X_min.npy')
y = np.load('/media/smayan/500GB SSD/y_min.npy')

In [14]:
X.shape

(737, 30, 126)

In [15]:
y.shape

(737,)

In [16]:
num_features = X.shape[2]
X = X.reshape(X.shape[0], X.shape[1], num_features, 1)

y_categorical = to_categorical(y, num_classes=len(actions))

X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y
)

In [17]:
X_train = X_train.squeeze(-1)
X_test  = X_test.squeeze(-1)

In [18]:
X.shape

(737, 30, 126, 1)

In [19]:
X_train.shape

(589, 30, 126)

In [20]:
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(enumerate(class_weights))
print(f"Class weights computed: {class_weight_dict}")

Class weights computed: {0: 2.047222222222222, 1: 0.8530092592592593, 2: 1.6377777777777778, 3: 1.364814814814815, 4: 1.32078853046595, 5: 0.9098765432098765, 6: 1.023611111111111, 7: 0.9098765432098765, 8: 0.6712204007285975, 9: 1.1373456790123457, 10: 0.7873931623931624, 11: 1.364814814814815, 12: 1.2795138888888888, 13: 0.6939736346516008, 14: 0.6111111111111112, 15: 0.9748677248677249, 16: 0.9748677248677249, 17: 1.2795138888888888}


In [21]:
model = Sequential()

# model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(sequence_length, X.shape[2])))
# model.add(BatchNormalization())
# model.add(MaxPooling1D(pool_size=2))
# model.add(Dropout(0.2))

model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(sequence_length, X.shape[2])))
model.add(Dropout(0.1))
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.2))


model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(actions), activation='softmax'))

I0000 00:00:1755871535.795939  296838 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1231 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070 SUPER, pci bus id: 0000:01:00.0, compute capability: 8.9
  super().__init__(**kwargs)


In [22]:
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f'logs/wsl_model_{timestamp}'

In [23]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [24]:
model.summary()

In [25]:
X_train.shape

(589, 30, 126)

In [26]:
# callbacks = [
#     TensorBoard(log_dir=log_dir, histogram_freq=1),
#     EarlyStopping(monitor='val_loss', restore_best_weights=True),
#     ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, min_lr=1e-7)
# ]

In [27]:
history = model.fit(
    X_train, y_train,
    epochs=150,
    batch_size=32,
    validation_data=(X_test, y_test),
    #callbacks=callbacks,
    class_weight=class_weight_dict,
    verbose=1
)

Epoch 1/150


I0000 00:00:1755871537.822352  297183 cuda_dnn.cc:529] Loaded cuDNN version 91100


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.0868 - loss: 2.8848 - val_accuracy: 0.0743 - val_loss: 2.7777
Epoch 2/150
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1757 - loss: 2.4896 - val_accuracy: 0.1149 - val_loss: 2.6916
Epoch 3/150
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.2322 - loss: 2.2824 - val_accuracy: 0.1892 - val_loss: 2.5348
Epoch 4/150
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2726 - loss: 2.1256 - val_accuracy: 0.1689 - val_loss: 2.4943
Epoch 5/150
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.2905 - loss: 2.0318 - val_accuracy: 0.2095 - val_loss: 2.4294
Epoch 6/150
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.3080 - loss: 1.9546 - val_accuracy: 0.2432 - val_loss: 2.3555
Epoch 7/150
[1m19/19[0m [32m━━━━━━━━━━━━━━

In [28]:
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy*100:.2f}%")


Test Loss: 1.1726
Test Accuracy: 68.24%


In [35]:
model.save(f'main_less_actions_main_wsl_model_{timestamp}.h5')
print(f"\nModel saved as wsl_model_{timestamp}.h5")




Model saved as wsl_model_20250822-193536.h5


In [37]:
# Load trained model and label map
model = tf.keras.models.load_model('/home/smayan/Desktop/ASL/no face/main_wsl_model_20250822-192224.h5')
label_map = np.load('/home/smayan/Desktop/ASL/no face/label_map.npy', allow_pickle=True).item()
actions = list(label_map.keys())

# Variables for prediction
sequence = []
sequence_length = 30
threshold = 0.7

# Start webcam / video
cap = cv.VideoCapture('/home/smayan/Desktop/ASL/ssvid.net--COLLEGE_1080p.mp4')

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Resize for consistency
        frame = cv.resize(frame, (640, 480))

        # Detection
        image, results = mediapipe_detection(frame, holistic)

        # Draw landmarks
        draw_styled_landmarks(image, results)

        # Extract keypoints (same as training)
        keypoints = extract_keypoints(results)

        # Append to sequence
        sequence.append(keypoints)
        sequence = sequence[-sequence_length:]

        if len(sequence) == sequence_length:
            input_seq = np.expand_dims(sequence, axis=0)   # shape (1, 30, 1530)

            # Predict
            res = model.predict(input_seq, verbose=0)[0]
            print("res shape:", res.shape)
            print("argmax:", np.argmax(res))

            predicted_action = actions[np.argmax(res)]
            confidence = np.max(res)

            # Show prediction if above threshold
            if confidence > threshold:
                cv.putText(image, f'{predicted_action}: {confidence:.2f}',
                           (10, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Show probabilities
            for i, (action, prob) in enumerate(zip(actions, res)):
                y_pos = 100 + i * 30
                cv.rectangle(image, (10, y_pos), (int(prob * 300) + 10, y_pos + 25), (0, 255, 0), -1)
                cv.putText(image, f'{action}: {prob:.2f}', (15, y_pos + 18),
                           cv.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)

        # Show output
        cv.imshow('ASL Inference', image)

        # Quit
        if cv.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv.destroyAllWindows()
    

I0000 00:00:1755871992.021573  296838 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1755871992.060794  344366 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.172.08), renderer: NVIDIA GeForce RTX 4070 SUPER/PCIe/SSE2
W0000 00:00:1755871992.102108  344339 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755871992.120030  344347 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755871992.121304  344361 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755871992.121331  344356 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000

res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 1
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape: (18,)
argmax: 11
res shape:

In [31]:
print("Model output shape:", model.output_shape)  # e.g. (None, 10)
print("Number of actions:", len(actions))
print("Actions:", actions)


Model output shape: (None, 18)
Number of actions: 18
Actions: ['hello', 'student', 'i', 'bye', 'goodbye', 'college', 'wrong', 'how', 'work', 'your', 'want', 'nice', 'to', 'meet', 'doctor', 'time', 'age', 'breakfast']
