In [2]:
!pip install mediapipe

Collecting mediapipe
  Using cached mediapipe-0.10.21-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting absl-py (from mediapipe)
  Using cached absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting flatbuffers>=2.0 (from mediapipe)
  Using cached flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting jax (from mediapipe)
  Downloading jax-0.6.1-py3-none-any.whl.metadata (13 kB)
Collecting jaxlib (from mediapipe)
  Downloading jaxlib-0.6.1-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting numpy<2 (from mediapipe)
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting opencv-contrib-python (from mediapipe)
  Using cached opencv_contrib_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Using cached protobuf-4.25.7-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting soun

In [None]:
import cv2
import torch
import numpy as np
import mediapipe as mp
import pathlib

# Dataset locations: source pictures and destination for the landmark arrays.
asl_dataset_path = '/exchange/dspro2/silent-speech/ASL_Pictures_Dataset'
output_path = '/exchange/dspro2/silent-speech/ASL_Landmarks_Dataset'

# Create one output folder per dataset split. Missing parents are created and
# already-existing folders are left untouched.
subsets = ['Train', 'Validation', 'Test']
for subset in subsets:
    pathlib.Path(output_path, subset).mkdir(parents=True, exist_ok=True)

# Shorthand for the MediaPipe Hands solution module.
mp_hands = mp.solutions.hands

print("Started processing")

# Convert every image of the split dataset into hand-landmark features.
#
# Layout read:    <asl_dataset_path>/<subset>/<label>/<image>
# Layout written: <output_path>/<subset>/X_<label>.npy  (n_samples, NUM_FEATURES)
#                 <output_path>/<subset>/y_<label>.npy  (n_samples,) label strings

# Suffixes are compared in lower case so files such as "IMG_1.JPG" are not
# silently skipped on case-sensitive filesystems.
IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png"}
# MediaPipe Hands reports 21 landmarks per hand, each with x, y and z.
NUM_FEATURES = 21 * 3
# Print a progress line every this many images (and on the last image).
PROGRESS_EVERY = 2000


def _list_images(folder):
    """Return the image files directly inside ``folder`` in sorted order.

    Directory listing order depends on the filesystem. Sorting makes the
    "flip every second image" decision below reproducible between runs.
    """
    return sorted(p for p in folder.iterdir() if p.suffix.lower() in IMAGE_SUFFIXES)


def _extract_landmarks(hands, image_bgr):
    """Detect a hand in a BGR image and return its flattened landmarks.

    Args:
        hands: An open ``mp_hands.Hands`` instance.
        image_bgr: Image array as returned by ``cv2.imread`` (BGR order).

    Returns:
        A 1-D array ``[x0, y0, z0, x1, y1, z1, ...]`` for the first detected
        hand, or ``None`` when no hand was detected.
    """
    results = hands.process(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
    if not results.multi_hand_landmarks:
        return None
    landmarks = results.multi_hand_landmarks[0].landmark
    return np.array([(lm.x, lm.y, lm.z) for lm in landmarks]).flatten()


with mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.7) as hands:
    for subset in subsets:
        subset_path = pathlib.Path(asl_dataset_path) / subset
        out_folder = pathlib.Path(output_path) / subset

        # Sorted so the log and processing order are stable across runs.
        for label_folder in sorted(subset_path.iterdir()):
            if not label_folder.is_dir():
                continue

            label = label_folder.name
            image_paths = _list_images(label_folder)
            total_images = len(image_paths)
            unreadable = 0
            no_hand = 0

            X, y = [], []

            for idx, image_path in enumerate(image_paths):
                processed_images = idx + 1
                if processed_images % PROGRESS_EVERY == 0 or processed_images == total_images:
                    print(f"[{subset}/{label}] Processed {processed_images}/{total_images} images...")

                image = cv2.imread(str(image_path))
                if image is None:
                    # cv2.imread signals an unreadable/corrupt file with None.
                    unreadable += 1
                    continue

                # Flip every second image to simulate left-handedness
                if idx % 2 == 0:
                    image = cv2.flip(image, 1)

                coordinates = _extract_landmarks(hands, image)
                if coordinates is None:
                    no_hand += 1
                    continue

                X.append(coordinates)
                y.append(label)

            # Save per label per subset. The explicit reshape/dtype keeps the
            # arrays well-formed even when no sample survived: X is always
            # (n, NUM_FEATURES) and y is always a string array, so downstream
            # concatenation across labels does not break on empty labels.
            X = np.asarray(X, dtype=np.float64).reshape(-1, NUM_FEATURES)
            y = np.asarray(y, dtype=str)
            np.save(out_folder / f"X_{label}.npy", X)
            np.save(out_folder / f"y_{label}.npy", y)

            print(
                f"[{subset}/{label}] Kept {len(X)}/{total_images} samples "
                f"({unreadable} unreadable, {no_hand} without a detected hand)"
            )

print(f"✅ Landmark conversion complete. Data saved in {output_path}")


Started processing


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1747990059.855955    7832 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1747990059.875990    7832 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1747990060.004098    7810 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


[Train/A] Processed 2000/3565 images...
