In [2]:
import cv2
import numpy as np
import mediapipe as mp

# Shorthand for MediaPipe's holistic solution module; used below to create the Holistic tracker.
mp_holistic = mp.solutions.holistic

# Flatten the landmarks detected in one frame into a single feature vector
def extract_landmarks(results):
    """Build a flat keypoint vector from one frame's holistic results.

    Reads ``pose_landmarks``, ``left_hand_landmarks`` and
    ``right_hand_landmarks`` from ``results``. Every group that is present
    contributes the ``x``, ``y``, ``z`` of each of its landmarks, in order;
    a missing (falsy) group contributes zeros instead (33 * 3 values for the
    pose, 21 * 3 values for each hand).

    Returns:
        1-D NumPy array laid out as pose, then left hand, then right hand.
    """
    def _flatten(group, point_count):
        # Nothing detected for this group -> zero block of the expected width.
        if not group:
            return np.zeros(point_count * 3)
        coords = [[point.x, point.y, point.z] for point in group.landmark]
        return np.array(coords).flatten()

    parts = [
        _flatten(results.pose_landmarks, 33),
        _flatten(results.left_hand_landmarks, 21),
        _flatten(results.right_hand_landmarks, 21),
    ]
    return np.concatenate(parts)  # total 225 features


In [3]:
def extract_sequence_from_video(video_path, max_frames=110):
    """Run MediaPipe Holistic over a video and collect per-frame keypoints.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        max_frames: Stop reading once this many frames have been collected.

    Returns:
        NumPy array with one row per processed frame, each row being the
        vector returned by ``extract_landmarks``. A video that opens but
        yields no frames produces an empty array.

    Raises:
        OSError: If OpenCV cannot open ``video_path``.
    """
    cap = cv2.VideoCapture(video_path)
    sequence = []

    try:
        # Fail loudly on a missing/unreadable file instead of silently
        # returning an empty sequence that breaks padding later on.
        if not cap.isOpened():
            raise OSError(f"Could not open video: {video_path}")

        with mp_holistic.Holistic(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            while True:
                ret, frame = cap.read()
                if not ret:
                    # End of stream (or a read failure): stop collecting.
                    break

                # OpenCV delivers BGR frames; convert to RGB before processing.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Mark the frame read-only while MediaPipe processes it, then restore.
                image.flags.writeable = False
                results = holistic.process(image)
                image.flags.writeable = True

                sequence.append(extract_landmarks(results))

                if len(sequence) >= max_frames:
                    break
    finally:
        # Always free the capture handle, even if processing raised.
        cap.release()

    return np.array(sequence)


In [3]:
# Smoke test: extract keypoints from one sample video and inspect the resulting shape.
sequence = extract_sequence_from_video("D:/code/Mini/Final data/40. I/MVI_0001.MOV")
print(sequence.shape)  # should be (frames, 225) with frames <= 110


(66, 225)


In [None]:
import os
from tqdm import tqdm

DATASET_PATH = 'D:/code/Mini/Final data'
OUTPUT_PATH = 'D:/code/Mini/pro v5/extracted_sequences'
MAX_FRAMES = 110  # maximum frame length to extract

# Create output folder if it doesn't exist
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Each sub-folder of DATASET_PATH holds the videos of one word (class).
# Listings are sorted so the processing order is stable between runs.
for word in sorted(os.listdir(DATASET_PATH)):
    word_path = os.path.join(DATASET_PATH, word)
    # A stray file at the top level would make os.listdir(word_path) raise
    # and abort the whole run, so only descend into directories.
    if not os.path.isdir(word_path):
        continue

    save_path = os.path.join(OUTPUT_PATH, word)
    os.makedirs(save_path, exist_ok=True)

    for video_file in tqdm(sorted(os.listdir(word_path)), desc=f"Processing '{word}'"):
        video_path = os.path.join(word_path, video_file)
        if not os.path.isfile(video_path):
            continue

        try:
            sequence = extract_sequence_from_video(video_path, max_frames=MAX_FRAMES)
            # A file that yields no frames gives an empty array; saving it
            # would break the padding step when the sequences are loaded.
            if sequence.shape[0] == 0:
                print(f"Skipping {video_file}: no frames could be read")
                continue
            filename = os.path.splitext(video_file)[0] + '.npy'
            np.save(os.path.join(save_path, filename), sequence)
        except Exception as e:
            # Best effort: report the failing video and keep going.
            print(f"Error processing {video_file}: {e}")


'MAX_FRAMES = 110  # maximum frame length to extract\n\n# Create output folder if it doesn\'t exist\nos.makedirs(OUTPUT_PATH, exist_ok=True)\n\nfor word in os.listdir(DATASET_PATH):\n    word_path = os.path.join(DATASET_PATH, word)\n    save_path = os.path.join(OUTPUT_PATH, word)\n    os.makedirs(save_path, exist_ok=True)\n\n    for video_file in tqdm(os.listdir(word_path), desc=f"Processing \'{word}\'"):\n        video_path = os.path.join(word_path, video_file)\n\n        try:\n            sequence = extract_sequence_from_video(video_path, max_frames=MAX_FRAMES)\n            filename = os.path.splitext(video_file)[0] + \'.npy\'\n            np.save(os.path.join(save_path, filename), sequence)\n        except Exception as e:\n            print(f"Error processing {video_file}: {e}")'

In [8]:
MAX_FRAMES = 110  # maximum frame length to extract
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

NUM_FEATURES = 225  # 33 pose + 21 left-hand + 21 right-hand landmarks, 3 coordinates each

X, y = [], []
labels = []  # not used in this cell; kept in case other cells reference it

# Sorted listings give a stable sample order, so the seeded train/test split
# is reproducible regardless of the order the filesystem returns entries in.
for word in sorted(os.listdir(OUTPUT_PATH)):
    word_path = os.path.join(OUTPUT_PATH, word)
    if not os.path.isdir(word_path):
        continue

    for file in sorted(os.listdir(word_path)):
        if not file.endswith('.npy'):
            continue
        sequence = np.load(os.path.join(word_path, file))

        # Guard against empty or malformed arrays (e.g. saved from an
        # unreadable video); they cannot be padded to (MAX_FRAMES, NUM_FEATURES).
        if sequence.ndim != 2 or sequence.shape[0] == 0 or sequence.shape[1] != NUM_FEATURES:
            print(f"Skipping {file}: unexpected shape {sequence.shape}")
            continue

        # Pad/truncate
        if sequence.shape[0] < MAX_FRAMES:
            pad_len = MAX_FRAMES - sequence.shape[0]
            padding = np.zeros((pad_len, NUM_FEATURES))
            sequence = np.vstack((sequence, padding))
        else:
            sequence = sequence[:MAX_FRAMES]

        X.append(sequence)
        y.append(word)  # the folder name is the class label

X = np.array(X)
y = np.array(y)
print("X shape:", X.shape)  # should be (num_samples, 110, 225)
print("y shape:", y.shape)


X shape: (2198, 110, 225)
y shape: (2198,)


In [10]:
# Map each folder-name label to an integer id, then one-hot encode the ids.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)
y_categorical = to_categorical(y_encoded)

print("Classes:", label_encoder.classes_)  # the distinct word labels found in y


Classes: ['18. City' '19. House' '20. Street or Road' '21. Train Station'
 '22. Restaurant' '23. Court' '24. School' '25. Office' '26. University'
 '27. Park' '39. Key' '40. I' '40. Paint' '41. Letter' '41. you'
 '42. Paper' '42. he' '43. Lock' '43. she' '44. Telephone' '44. it'
 '45. Bag' '45. we' '46. Box' '46. you (plural)' '47. Gift' '47. they'
 '48. Card' '48. Hello' '49. How are you' '49. Ring' '50. Alright'
 '50. Tool' '51. Good Morning' '52. Good afternoon' '58. Son'
 '59. Daughter' '60. Mother' '61. Father' '62. Parent' '63. Baby'
 '64. Man' '65. Woman' '66. Brother' '67. Sister' '84. Teacher'
 '85. Student' '86. Lawyer' '87. Doctor' '88. Patient' '89. Waiter'
 '90. Secretary' '91. Priest']


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Masking
from tensorflow.keras.optimizers import Adam

# Two stacked bidirectional LSTMs followed by a small dense classification head.
model = Sequential([
    Masking(mask_value=0.0, input_shape=(110, 225)),  # mask padded zeros
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(label_encoder.classes_), activation='softmax'),  # one output per class
])

model.compile(
    optimizer=Adam(learning_rate=0.0005),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()


  super().__init__(**kwargs)


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples, stratified by word label so every class
# appears in both splits; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_categorical,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The held-out split doubles as the validation set during training.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=60,  # feel free to tune
    batch_size=32,
)


Epoch 1/60
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 90ms/step - accuracy: 0.0132 - loss: 3.9992 - val_accuracy: 0.0318 - val_loss: 3.9652
Epoch 2/60
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 79ms/step - accuracy: 0.0199 - loss: 3.9636 - val_accuracy: 0.0227 - val_loss: 3.9570
Epoch 3/60
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 80ms/step - accuracy: 0.0269 - loss: 3.9464 - val_accuracy: 0.0477 - val_loss: 3.7876
Epoch 4/60
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 80ms/step - accuracy: 0.0324 - loss: 3.7739 - val_accuracy: 0.0477 - val_loss: 3.6009
Epoch 5/60
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 82ms/step - accuracy: 0.0563 - loss: 3.6082 - val_accuracy: 0.0818 - val_loss: 3.3408
Epoch 6/60
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 81ms/step - accuracy: 0.0796 - loss: 3.3865 - val_accuracy: 0.1159 - val_loss: 3.1346
Epoch 7/60
[1m55/55[0m [32m━━━━

In [13]:
# Save the trained model as an .h5 file; the path is relative, so it lands in the current working directory.
model.save("isl_bilstm_model_v2.h5")




In [14]:
import pickle

# Persist the fitted label encoder so predicted class ids can be mapped back to words later.
with open("label_encoder_v2.pkl", "wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)


In [15]:
from tensorflow.keras.models import load_model
import pickle
# Load model and encoder
# NOTE(review): the save cells write to relative paths, while these paths are
# absolute; they refer to the same files only if the notebook's working
# directory is "D:/code/Mini/pro v5" — confirm.
model = load_model("D:/code/Mini/pro v5/isl_bilstm_model_v2.h5")

# pickle.load can execute arbitrary code; only load encoder files you created yourself.
with open("D:/code/Mini/pro v5/label_encoder_v2.pkl", "rb") as f:
    label_encoder = pickle.load(f)




In [16]:
def predict_sign(video_path, max_frames=110):
    """Predict the sign label for a single video.

    Args:
        video_path: Path of the video to classify.
        max_frames: Sequence length fed to the model. It must match the
            length the loaded model was built with (110 in this notebook).

    Returns:
        The predicted class label, decoded with ``label_encoder``.

    Raises:
        ValueError: If no frames could be extracted from the video.
    """
    sequence = extract_sequence_from_video(video_path, max_frames=max_frames)

    # An empty result cannot be padded into a (frames, features) matrix.
    if sequence.ndim != 2 or sequence.shape[0] == 0:
        raise ValueError(f"No frames could be extracted from {video_path}")

    # Pad or truncate to exactly max_frames rows. The pad length must be
    # computed from the same target as the comparison, otherwise short clips
    # end up with a length the model was not built for.
    frame_count, feature_count = sequence.shape
    if frame_count < max_frames:
        padding = np.zeros((max_frames - frame_count, feature_count))
        sequence = np.vstack((sequence, padding))
    else:
        sequence = sequence[:max_frames]

    sequence = np.expand_dims(sequence, axis=0)  # shape: (1, max_frames, features)
    prediction = model.predict(sequence)
    predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])

    return predicted_label[0]


In [19]:
# Try the full pipeline on one video; the expected word is the name of its folder.
# NOTE(review): this file lives in the dataset folder used for extraction, so it
# may have been part of the training split — not an unseen-data check.
predicted_word = predict_sign("D:/code/Mini/Final data/47. they/MVI_0026.MOV")
print("Predicted Word:", predicted_word)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Predicted Word: 47. they
