In [2]:
import cv2
import numpy as np
import mediapipe as mp

mp_holistic = mp.solutions.holistic

# Extract keypoints from frame
def extract_landmarks(results):
    """Flatten pose and hand landmarks into a single feature vector.

    Args:
        results: Object exposing ``pose_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks``; each is either falsy (nothing
            detected) or has a ``.landmark`` iterable of points with
            ``x``, ``y`` and ``z`` attributes.

    Returns:
        1-D array laid out as pose (33 * 3), left hand (21 * 3), right hand
        (21 * 3) -- 225 values when every part has its expected point count.
        A part that was not detected is filled with zeros.
    """
    def _flatten(landmark_list, n_points):
        # A missing detection becomes an all-zero slot of the expected width.
        if not landmark_list:
            return np.zeros(n_points * 3)
        coords = [[point.x, point.y, point.z] for point in landmark_list.landmark]
        return np.array(coords).flatten()

    parts = [
        _flatten(results.pose_landmarks, 33),
        _flatten(results.left_hand_landmarks, 21),
        _flatten(results.right_hand_landmarks, 21),
    ]
    return np.concatenate(parts)  # total 225 features


In [3]:
def extract_sequence_from_video(video_path, max_frames=117):
    """Run MediaPipe Holistic over a video and collect per-frame keypoints.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        max_frames: Upper bound on the number of frames to collect.

    Returns:
        2-D array with one row per processed frame and 225 columns (the
        vector built by ``extract_landmarks``). If no frame could be read
        (unreadable file, or ``max_frames <= 0``) the result has shape
        ``(0, 225)`` so callers can still pad or stack it.
    """
    n_features = 225  # 33 pose + 21 left-hand + 21 right-hand landmarks, 3 coords each
    cap = cv2.VideoCapture(video_path)
    sequence = []

    try:
        with mp_holistic.Holistic(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            # Checking the cap before reading means max_frames <= 0 collects nothing.
            while len(sequence) < max_frames:
                ret, frame = cap.read()
                if not ret:
                    break

                # OpenCV decodes frames as BGR; convert to RGB before processing.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False
                results = holistic.process(image)
                image.flags.writeable = True

                sequence.append(extract_landmarks(results))
    finally:
        # Release the capture handle even if processing raised.
        cap.release()

    if not sequence:
        # np.array([]) would be 1-D with shape (0,), which cannot be vstack-ed
        # with (n, 225) padding downstream.
        return np.zeros((0, n_features))
    return np.array(sequence)


In [6]:
# Smoke-test the extractor on one clip before processing the whole dataset.
sequence = extract_sequence_from_video(
    video_path="D:/code/Mini/data/13. thick/MVI_9605.MOV",
)
print(sequence.shape)  # should be (<=117, 225)


(58, 225)


In [None]:
import os
from tqdm import tqdm

DATASET_PATH = 'D:/code/Mini/data'
OUTPUT_PATH = 'D:/code/Mini/pro v4/extracted_sequences'
MAX_FRAMES = 117  # maximum frame length to extract

# Create output folder if it doesn't exist
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Each sub-folder of DATASET_PATH is one word (class); every video inside it is
# converted to a keypoint sequence and saved as <video stem>.npy under OUTPUT_PATH/<word>.
for word in sorted(os.listdir(DATASET_PATH)):
    word_path = os.path.join(DATASET_PATH, word)
    if not os.path.isdir(word_path):
        # A stray file next to the class folders would make os.listdir(word_path) raise.
        continue

    save_path = os.path.join(OUTPUT_PATH, word)
    os.makedirs(save_path, exist_ok=True)

    for video_file in tqdm(sorted(os.listdir(word_path)), desc=f"Processing '{word}'"):
        video_path = os.path.join(word_path, video_file)
        if not os.path.isfile(video_path):
            continue

        # Best effort: one bad video is reported and skipped, not fatal for the run.
        try:
            sequence = extract_sequence_from_video(video_path, max_frames=MAX_FRAMES)
            if sequence.shape[0] == 0:
                # Unreadable / non-video file: saving an empty array would only
                # break the padding step that loads these files later.
                print(f"Error processing {video_file}: no frames could be read")
                continue
            filename = os.path.splitext(video_file)[0] + '.npy'
            np.save(os.path.join(save_path, filename), sequence)
        except Exception as e:
            print(f"Error processing {video_file}: {e}")


'\nfor word in os.listdir(DATASET_PATH):\n    word_path = os.path.join(DATASET_PATH, word)\n    save_path = os.path.join(OUTPUT_PATH, word)\n    os.makedirs(save_path, exist_ok=True)\n\n    for video_file in tqdm(os.listdir(word_path), desc=f"Processing \'{word}\'"):\n        video_path = os.path.join(word_path, video_file)\n\n        try:\n            sequence = extract_sequence_from_video(video_path, max_frames=MAX_FRAMES)\n            filename = os.path.splitext(video_file)[0] + \'.npy\'\n            np.save(os.path.join(save_path, filename), sequence)\n        except Exception as e:\n            print(f"Error processing {video_file}: {e}")\n'

In [5]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

X, y = [], []
labels = []  # not used by the visible cells; kept so later code relying on it still runs

N_FEATURES = 225  # width of one keypoint vector (33 pose + 2 * 21 hand landmarks, xyz)

# Sorted iteration: os.listdir order is unspecified, and the seeded train/test
# split is only reproducible if the samples are always loaded in the same order.
for word in sorted(os.listdir(OUTPUT_PATH)):
    word_path = os.path.join(OUTPUT_PATH, word)
    if not os.path.isdir(word_path):
        continue

    for file in sorted(os.listdir(word_path)):
        if not file.endswith('.npy'):
            continue
        sequence = np.load(os.path.join(word_path, file))

        # An empty or malformed array cannot be padded with (pad_len, 225) zeros.
        if sequence.ndim != 2 or sequence.shape[0] == 0 or sequence.shape[1] != N_FEATURES:
            print(f"Skipping {file}: unexpected shape {sequence.shape}")
            continue

        # Pad with all-zero frames at the end, or truncate, to exactly MAX_FRAMES rows
        if sequence.shape[0] < MAX_FRAMES:
            pad_len = MAX_FRAMES - sequence.shape[0]
            padding = np.zeros((pad_len, N_FEATURES))
            sequence = np.vstack((sequence, padding))
        else:
            sequence = sequence[:MAX_FRAMES]

        X.append(sequence)
        y.append(word)  # the folder name is the class label

X = np.array(X)
y = np.array(y)
print("X shape:", X.shape)  # should be (num_samples, 117, 225)
print("y shape:", y.shape)


X shape: (1359, 117, 225)
y shape: (1359,)


In [6]:
# Map the folder-name labels to integer ids, then to one-hot rows for the
# categorical cross-entropy loss.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)
y_categorical = to_categorical(y_encoded)

print("Classes:", label_encoder.classes_)  # list of your 50 words


Classes: ['1. loud' '10. Mean' '11. rich' '12. poor' '13. thick' '17. flat'
 '18. curved' '19. male' '2. quiet' '20. female' '23. high' '24. low'
 '25. soft' '26. hard' '27. deep' '28. shallow' '29. clean' '3. happy'
 '30. dirty' '31. strong' '32. weak' '33. dead' '34. alive' '35. heavy'
 '36. light' '37. Hat' '38. Dress' '39. Suit' '39. famous' '4. sad'
 '40. Skirt' '41. Shirt' '42. T-Shirt' '43. Pant' '44. Shoes' '45. Pocket'
 '46. Clothing' '5. Beautiful' '6. Ugly' '7. Deaf' '78. long' '79. short'
 '8. Blind' '80. tall' '81. wide' '82. narrow' '83. big large'
 '84. small little' '85. slow' '86. fast']


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Masking
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.layers import Input

# Stacked bidirectional LSTM classifier over (frames, features) keypoint sequences.
model = Sequential()
# Declare the input with an explicit Input layer instead of `input_shape=` on the
# first layer (Keras warns about the latter), and take the shape from the data
# so it cannot drift from the padding length used to build X.
model.add(Input(shape=X.shape[1:]))  # (117, 225) for the X built above
model.add(Masking(mask_value=0.0))  # mask padded zeros
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
# One softmax unit per class known to the label encoder.
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(optimizer=Adam(learning_rate=0.0005), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


  super().__init__(**kwargs)


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples, stratified on the string labels so every word
# is represented in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_categorical,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The held-out split doubles as the validation set that is reported per epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=60,  # feel free to tune
    batch_size=32,
)


Epoch 1/60
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 148ms/step - accuracy: 0.0170 - loss: 3.9418 - val_accuracy: 0.0551 - val_loss: 3.8936
Epoch 2/60
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 285ms/step - accuracy: 0.0429 - loss: 3.8790 - val_accuracy: 0.0699 - val_loss: 3.7897
Epoch 3/60
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 99ms/step - accuracy: 0.0630 - loss: 3.7877 - val_accuracy: 0.0882 - val_loss: 3.5616
Epoch 4/60
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 95ms/step - accuracy: 0.0890 - loss: 3.5350 - val_accuracy: 0.1029 - val_loss: 3.3348
Epoch 5/60
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 143ms/step - accuracy: 0.1076 - loss: 3.3571 - val_accuracy: 0.1507 - val_loss: 3.0720
Epoch 6/60
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 259ms/step - accuracy: 0.1511 - loss: 3.0748 - val_accuracy: 0.1875 - val_loss: 2.9089
Epoch 7/60
[1m34/34[0m [3

In [9]:
# Save the trained model under a relative path, i.e. into the current working directory.
# NOTE(review): the reload cell further down reads "D:/code/Mini/pro v4/isl_bilstm_model_v1.h5",
# so this only round-trips when the notebook runs from that folder -- confirm.
model.save("isl_bilstm_model_v1.h5")




In [11]:
import pickle

# Store the fitted encoder next to the model so predicted class indices can be
# mapped back to word labels at inference time.
with open("label_encoder_v1.pkl", "wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)


Test: reload the saved model and label encoder, then predict the sign in a single video


In [12]:
from tensorflow.keras.models import load_model
import pickle
# Restore the trained network and the label encoder that was fitted with it.
model = load_model("D:/code/Mini/pro v4/isl_bilstm_model_v1.h5")

# pickle.load can execute arbitrary code: only open encoder files you created yourself.
with open("D:/code/Mini/pro v4/label_encoder_v1.pkl", "rb") as encoder_file:
    label_encoder = pickle.load(encoder_file)




In [13]:
def predict_sign(video_path):
    """Predict the sign label for one video.

    Uses the notebook-level ``model`` and ``label_encoder``.

    Args:
        video_path: Path to the video, passed to ``extract_sequence_from_video``.

    Returns:
        The label that ``label_encoder`` maps the highest-scoring class to.

    Raises:
        ValueError: If no frames could be extracted from the video.
    """
    max_frames, n_features = 117, 225  # the (117, 225) input shape the model was built with

    sequence = extract_sequence_from_video(video_path, max_frames=max_frames)

    # An unreadable video yields an empty array; fail with a clear message
    # instead of an opaque shape mismatch inside np.vstack.
    if sequence.ndim != 2 or sequence.shape[0] == 0:
        raise ValueError(f"No frames could be read from video: {video_path!r}")

    # Pad with all-zero frames at the end, or truncate, to exactly max_frames rows
    pad_len = max_frames - sequence.shape[0]
    if pad_len > 0:
        sequence = np.vstack((sequence, np.zeros((pad_len, n_features))))
    else:
        sequence = sequence[:max_frames]

    batch = np.expand_dims(sequence, axis=0)  # shape: (1, 117, 225)
    prediction = model.predict(batch)
    predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])

    return predicted_label[0]


In [None]:
# Forward slashes, like every other path in this notebook: in a normal string
# literal "\6" is an octal escape (chr(6)) and "\c", "\M", "\d" are invalid
# escapes, so the backslash form did not name the intended file.
predicted_word = predict_sign("D:/code/Mini/data/6. Ugly/MVI_9578 - Copy (2).MOV")
print("Predicted Word:", predicted_word)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Predicted Word: 5. Beautiful
