In [None]:
!pip install ultralytics
from ultralytics import YOLO

In [2]:
!pip install mediapipe

Collecting mediapipe
  Downloading mediapipe-0.10.21-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.2-py3-none-any.whl.metadata (1.6 kB)
Downloading mediapipe-0.10.21-cp311-cp311-manylinux_2_28_x86_64.whl (35.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sounddevice-0.5.2-py3-none-any.whl (32 kB)
Installing collected packages: protobuf, sounddevice, mediapipe
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling prot

In [3]:
import os
import mediapipe as mp
import cv2
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

2025-06-29 14:15:23.707842: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751206524.175211      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751206524.289399      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Location of the input videos and the root directory for everything this
# notebook writes.
KAGGLE_DATA_PATH = '/kaggle/input/include/Adjectives_1of8'
WORKING_DIR = '/kaggle/working/'

# Output sub-directories under WORKING_DIR; created up front so later cells
# can write into them without checking.
CROPPED_FRAMES_DIR = os.path.join(WORKING_DIR, 'yolo_cropped_frames')
LANDMARKS_DIR = os.path.join(WORKING_DIR, 'extracted_landmarks')
for _output_dir in (CROPPED_FRAMES_DIR, LANDMARKS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

print(f"Kaggle Data Path: {KAGGLE_DATA_PATH}")
print(f"Working Directory: {WORKING_DIR}")

Kaggle Data Path: /kaggle/input/include/Adjectives_1of8
Working Directory: /kaggle/working/


#### Extracting Landmarks

In [10]:
def _label_from_path(video_path):
    """Return the lower-cased label taken from a '<n>. <word>' path component.

    Components are scanned from the root down and the first one containing
    '. ' wins (e.g. '.../1. loud/MVI_9290.MOV' -> 'loud'). Returns None when
    no component matches, so the caller can skip the file instead of reusing
    the label left over from the previous file.
    """
    for component in video_path.split('/'):
        if '. ' in component:
            return component.split('. ')[1].lower()
    return None


# Collect [label, path] pairs for every .MOV file under KAGGLE_DATA_PATH.
file_paths = []

for root, dirs, files in os.walk(KAGGLE_DATA_PATH):
    for file in files:
        if not file.endswith('.MOV'):
            continue
        file_path = os.path.join(root, file)
        word = _label_from_path(file_path)
        if word is None:
            # Without a label the video cannot be used for supervised training.
            print(f"Warning: no '<n>. <word>' folder found in {file_path}. Skipping.")
            continue
        file_paths.append([word, file_path])

# reshape(-1, 2) keeps the (n_videos, 2) layout even when nothing was found.
file_paths = np.array(file_paths, dtype=str).reshape(-1, 2)
file_paths.shape

(104, 2)

In [5]:
def extract_landmarks(frame, holistic_model, frame_number=None):
    """Run the holistic model on one frame and return a flat landmark vector.

    The result is the concatenation, in this order, of:
      * pose landmarks as (x, y, z, visibility) per landmark,
      * face landmarks as (x, y, z) per landmark,
      * left-hand landmarks as (x, y, z) per landmark,
      * right-hand landmarks as (x, y, z) per landmark.
    A part for which the model returned nothing contributes a zero block of
    its expected size (33*4 for pose, 468*3 for face, 21*3 per hand).

    Args:
        frame: Image passed to ``cv2.cvtColor`` with ``COLOR_BGR2RGB`` before
            being handed to the model.
        holistic_model: Object whose ``process`` method returns results with
            ``pose_landmarks``, ``face_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks`` attributes.
        frame_number: Accepted from the caller but not used here.

    Returns:
        1-D ``numpy.ndarray`` holding the concatenated landmark values.
    """
    detections = holistic_model.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    def _flatten(landmark_list, zero_size, fields=("x", "y", "z")):
        # Nothing detected for this part: stand in with zeros of the expected size.
        if not landmark_list:
            return np.zeros(zero_size)
        rows = [
            [getattr(point, field) for field in fields]
            for point in landmark_list.landmark
        ]
        return np.array(rows).flatten()

    hand_size = 21 * 3
    parts = [
        _flatten(detections.pose_landmarks, 33 * 4, fields=("x", "y", "z", "visibility")),
        _flatten(detections.face_landmarks, 468 * 3),
        _flatten(detections.left_hand_landmarks, hand_size),
        _flatten(detections.right_hand_landmarks, hand_size),
    ]
    return np.concatenate(parts)

In [6]:
mp_holistic = mp.solutions.holistic


def _fit_sequence_length(sequence, sequence_length):
    """Return `sequence` with exactly `sequence_length` rows.

    Shorter inputs are zero-padded at the end; longer inputs keep only their
    first `sequence_length` rows.
    """
    missing = sequence_length - sequence.shape[0]
    if missing > 0:
        return np.pad(sequence, ((0, missing), (0, 0)),
                      mode='constant', constant_values=0)
    return sequence[:sequence_length, :]


def process_videos(file_paths, sequence_length=60):
    """Extract a fixed-length landmark sequence from each video and save them.

    For every (word, file_path) pair the video is read frame by frame, each
    frame is converted to a landmark vector with ``extract_landmarks``, and
    the per-video sequence is zero-padded or truncated to `sequence_length`
    frames. The stacked sequences and their labels are written to
    ``pose_landmarks_landmarks.npy`` and ``pose_landmarks_labels.npy`` inside
    ``LANDMARKS_DIR``.

    Args:
        file_paths: Iterable of (word, file_path) pairs.
        sequence_length: Number of frames kept per video (default 60).

    Returns:
        None. Results are written to disk; nothing is written when no video
        yields any frames.

    NOTE(review): one Holistic instance with ``static_image_mode=False`` is
    shared by all videos, so tracking state from one video may influence the
    first frames of the next. Consider one instance per video if that matters.
    """
    video_landmarks = []
    labels = []

    # The context manager closes the MediaPipe graph even if a video raises.
    with mp_holistic.Holistic(
        static_image_mode=False,
        model_complexity=2,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5,
    ) as holistic_model:
        for i, (word, file_path) in enumerate(file_paths):
            print(f"Processing video {i+1}/{len(file_paths)}: {file_path} for landmarks (word: {word})")

            cap = cv2.VideoCapture(file_path)
            current_video_landmarks = []
            try:
                if not cap.isOpened():
                    print(f"Error: Could not open video file {file_path}. Skipping.")
                    continue

                frame_count = 0
                # Frames past `sequence_length` would be truncated away below,
                # so stop reading (and running the model) once we have enough.
                while len(current_video_landmarks) < sequence_length:
                    ret, frame = cap.read()
                    if not ret:
                        break

                    landmarks_frame = extract_landmarks(frame, holistic_model, frame_count)
                    if landmarks_frame is not None:
                        current_video_landmarks.append(landmarks_frame)

                    frame_count += 1
            finally:
                # Always free the decoder handle, including on errors and skips.
                cap.release()

            if not current_video_landmarks:
                print(f"No valid frames or landmarks extracted from {file_path}. Skipping video.")
                continue

            video_landmarks.append(
                _fit_sequence_length(np.array(current_video_landmarks), sequence_length)
            )
            labels.append(word)

    if not video_landmarks:
        # Avoid overwriting earlier output with empty arrays.
        print("No landmark sequences were extracted; nothing was saved.")
        return

    video_landmarks = np.array(video_landmarks)
    labels = np.array(labels)

    np.save(os.path.join(LANDMARKS_DIR, 'pose_landmarks_landmarks.npy'), video_landmarks)
    np.save(os.path.join(LANDMARKS_DIR, 'pose_landmarks_labels.npy'), labels)

    print(f"\nSaved {video_landmarks.shape[0]} sequences and labels to {LANDMARKS_DIR}")
    print(f"Shape of extracted sequences (X): {video_landmarks.shape}")
    # Integer-encoding the labels would not change their shape, so report it directly.
    print(f"Shape of encoded labels (y): {labels.shape}")

In [9]:
# Extract landmarks for every discovered video; process_videos writes the
# resulting sequence and label arrays as .npy files into LANDMARKS_DIR.
process_videos(file_paths)

Processing video 1/104: /kaggle/input/include/Adjectives_1of8/Adjectives/1. loud/MVI_9290.MOV for landmarks (word: loud)


W0000 00:00:1751204516.881070     120 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1751204517.068334     120 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1751204517.072744     122 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1751204517.075403     119 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1751204517.078152     121 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1751204517.108377     119 inference_feedback_manager.cc:114] Feedback manager 

Processing video 2/104: /kaggle/input/include/Adjectives_1of8/Adjectives/1. loud/MVI_5258.MOV for landmarks (word: loud)
Processing video 3/104: /kaggle/input/include/Adjectives_1of8/Adjectives/1. loud/MVI_9536.MOV for landmarks (word: loud)
Processing video 4/104: /kaggle/input/include/Adjectives_1of8/Adjectives/1. loud/MVI_5257.MOV for landmarks (word: loud)
Processing video 5/104: /kaggle/input/include/Adjectives_1of8/Adjectives/1. loud/MVI_5177.MOV for landmarks (word: loud)
Processing video 6/104: /kaggle/input/include/Adjectives_1of8/Adjectives/1. loud/MVI_5259.MOV for landmarks (word: loud)
Processing video 7/104: /kaggle/input/include/Adjectives_1of8/Adjectives/1. loud/MVI_9535.MOV for landmarks (word: loud)
Processing video 8/104: /kaggle/input/include/Adjectives_1of8/Adjectives/1. loud/MVI_9449.MOV for landmarks (word: loud)
Processing video 9/104: /kaggle/input/include/Adjectives_1of8/Adjectives/1. loud/MVI_9450.MOV for landmarks (word: loud)
Processing video 10/104: /kaggle

In [10]:
# Load the landmark sequences (X) and their labels (y) from LANDMARKS_DIR.
X, y = (
    np.load(os.path.join(LANDMARKS_DIR, npy_name))
    for npy_name in ('pose_landmarks_landmarks.npy', 'pose_landmarks_labels.npy')
)

In [6]:
# Load X and y from files under /kaggle/input/npy-files-input, replacing any
# values assigned to these names earlier in the session.
# NOTE(review): presumably a previously saved copy of the extraction output — confirm.
X, y = map(np.load, (
    "/kaggle/input/npy-files-input/pose_landmarks_landmarks.npy",
    "/kaggle/input/npy-files-input/pose_landmarks_labels.npy",
))

In [11]:
# Peek at the first entry of X (one 2-D array per video).
X[0]

array([[ 0.48630583,  0.35479909, -0.52718693, ...,  0.41376522,
         1.00503421,  0.0080107 ],
       [ 0.48621017,  0.35488755, -0.52777106, ...,  0.40040633,
         0.99176592,  0.01896957],
       [ 0.48612928,  0.35488304, -0.53528559, ...,  0.41364419,
         1.00260413,  0.00648379],
       ...,
       [ 0.48132306,  0.35261399, -0.5267334 , ...,  0.38702124,
         0.97029626,  0.01263577],
       [ 0.48132119,  0.35275587, -0.54307491, ...,  0.38727611,
         0.9713316 ,  0.01174397],
       [ 0.4813233 ,  0.35286936, -0.52748704, ...,  0.38933939,
         0.97365218,  0.01501061]])

In [8]:
# Expected layout: (videos, frames per video, values per frame), where
# values per frame = 33*4 (pose) + 468*3 (face) + 2 * 21*3 (hands) = 1662.
X.shape

(104, 60, 1662)

In [9]:
# One label per entry of X.
y.shape

(104,)

In [25]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import Conv1D, TimeDistributed, Flatten, MaxPooling1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Reshape
# The keras.* imports stay last so the names they share with the
# tensorflow.keras imports above resolve exactly as they did before.
from keras.models import Sequential
from keras.layers import Input, LSTM, Dropout, Dense, TimeDistributed
from keras.optimizers import Adam

# Hold out 20% of the videos, stratified so every class appears in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Map string labels to integers; the encoder is fitted on the training labels
# only and then applied to the test labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

categories = label_encoder.classes_

print("Encoded Categories:")
print(categories)

num_classes = len(np.unique(y_train_encoded))

model = Sequential()
# The input shape must be declared with an Input layer: Sequential.add() does
# not accept an `input_shape` keyword, so passing it there raises TypeError.
model.add(Input(shape=(x_train.shape[1], x_train.shape[2])))
# Optional per-frame convolutional front-end (disabled):
# model.add(TimeDistributed(Reshape((x_train.shape[2], 1))))
# model.add(TimeDistributed(Conv1D(filters=64, kernel_size=3, activation='relu')))
# model.add(TimeDistributed(MaxPooling1D(pool_size=2)))
# model.add(TimeDistributed(Flatten()))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation='softmax'))

optimizer = Adam(learning_rate=0.0001)
# Labels are integer-encoded, hence the sparse variant of the loss.
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE(review): the held-out test split doubles as validation data, so the
# accuracy reported below is not an unbiased estimate if it guides tuning.
model.fit(x_train, y_train_encoded, validation_data=(x_test, y_test_encoded), epochs=100, batch_size=8)

loss, accuracy = model.evaluate(x_test, y_test_encoded)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Encoded Categories:
['beautiful' 'blind' 'deaf' 'happy' 'loud' 'quiet' 'sad' 'ugly']
Epoch 1/100


  super().__init__(**kwargs)


RuntimeError: pybind11::error_already_set: MISMATCH of original and normalized active exception types: ORIGINAL ResourceExhaustedError REPLACED BY KeyboardInterrupt: <EMPTY MESSAGE>

At:
  /usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/errors_impl.py(377): __init__
  /usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/execute.py(53): quick_execute
  /usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/context.py(1683): call_function
  /usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py(251): call_flat
  /usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py(216): call_preflattened
  /usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/concrete_function.py(1322): _call_flat
  /usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py(919): _call
  /usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py(833): __call__
  /usr/local/lib/python3.11/dist-packages/tensorflow/python/util/traceback_utils.py(150): error_handler
  /usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py(219): function
  /usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py(371): fit
  /usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py(117): error_handler
  /tmp/ipykernel_35/2487387059.py(51): <cell line: 0>
  /usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py(3553): run_code
  /usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py(3473): run_ast_nodes
  /usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py(3257): run_cell_async
  /usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py(78): _pseudo_sync_runner
  /usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py(3030): _run_cell
  /usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py(2975): run_cell
  /usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py(528): run_cell
  /usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py(383): do_execute
  /usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py(730): execute_request
  /usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py(406): dispatch_shell
  /usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py(499): process_one
  /usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py(510): dispatch_queue
  /usr/lib/python3.11/asyncio/events.py(84): _run
  /usr/lib/python3.11/asyncio/base_events.py(1936): _run_once
  /usr/lib/python3.11/asyncio/base_events.py(608): run_forever
  /usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py(205): start
  /usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py(712): start
  /usr/local/lib/python3.11/dist-packages/traitlets/config/application.py(992): launch_instance
  /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py(37): <module>
  <frozen runpy>(88): _run_code
  <frozen runpy>(198): _run_module_as_main


In [None]:
# Persist the trained model to a file in the current working directory
# (the .h5 extension selects Keras' HDF5 format).
model.save("demo_lstm_85.h5")