In [1]:
import cv2
import numpy as np
import torchvision.transforms as transforms
import pickle
import os
import mediapipe as mp
import csv
import random
import torch
import matplotlib.pyplot as plt
import torch.nn as nn

In [None]:
# Folder that holds the saved landmark-model artifacts (the label encoder and
# the model weights are read from here by later cells). It is resolved against
# the directory the notebook is run from.
checkpoints_folder_path = os.path.join(
    os.getcwd(),
    'checkpoint',
    'landmark',
)

In [None]:
# Load the fitted label encoder that maps class indices back to label names.
# The file is opened in a `with` block so the handle is closed deterministically.
# NOTE(security): pickle can execute arbitrary code while loading; only load
# checkpoint files that come from a trusted source.
with open(os.path.join(checkpoints_folder_path, 'label_encoder.pkl'), 'rb') as label_encoder_file:
    le = pickle.load(label_encoder_file)

# Number of output classes the classifier must produce.
num_classes = len(le.classes_)


In [None]:
# === 2. MLP Model ===

class SignLanguageMLP(nn.Module):
    """Fully connected classifier: input -> 128 -> 64 -> num_classes.

    ReLU followed by dropout (p=0.3) is applied after each of the two hidden
    layers; the final layer returns raw, un-normalised class scores.

    Parameters:
        input_size (int): Number of input features per sample.
        num_classes (int): Number of output classes.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        # Attribute names are the state_dict keys, so they must stay fc1/fc2/fc3
        # for saved checkpoints to load.
        self.fc1 = nn.Linear(in_features=input_size, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, x):
        """Return the class scores for a batch of feature vectors ``x``."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        hidden = self.dropout(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)

In [None]:
# Restore the trained classifier from the checkpoint folder.
best_model_path = os.path.join(checkpoints_folder_path, 'best_model.pth')

# Fail here with a clear message instead of leaving `model` undefined, which
# would only surface later as a NameError inside the webcam loop.
if not os.path.exists(best_model_path):
    raise FileNotFoundError(f"Model checkpoint not found: {best_model_path}")

# input_size=63 matches the flattened landmark vector (21 landmarks * 3 coordinates).
model = SignLanguageMLP(input_size=63, num_classes=num_classes)
# map_location keeps loading on the CPU regardless of where the weights were saved.
# NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
model.load_state_dict(torch.load(best_model_path, map_location=torch.device('cpu')))
# Inference mode: disables dropout.
model.eval()

## UTILS

In [None]:

# Preprocessing function to extract landmarks
def extract_hand_landmarks(results):
    """Flatten the first detected hand into a list ``[x0, y0, z0, x1, y1, z1, ...]``.

    Parameters:
        results: Object exposing ``multi_hand_landmarks``; each entry has a
            ``landmark`` iterable whose items carry ``x``, ``y`` and ``z``.

    Returns:
        list or None: Coordinates of the first hand only, or None when
        ``multi_hand_landmarks`` is empty or None.
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        return None

    # Only the first hand is used; any additional hands are ignored.
    first_hand = detected_hands[0]
    return [
        coordinate
        for point in first_hand.landmark
        for coordinate in (point.x, point.y, point.z)
    ]

def center_landmarks(landmarks):
    """
    Translate hand landmarks so the wrist (landmark 0) is the origin, then
    scale them so the farthest landmark lies at distance 1 from the wrist.

    Parameters:
        landmarks (list or np.ndarray): Flat sequence of 63 values (21 landmarks * 3 coordinates)

    Returns:
        np.ndarray: Normalised landmarks, flattened back to shape (63,)
    """
    # One row per landmark: (21, 3)
    points = np.array(landmarks).reshape((21, 3))

    # Express every landmark relative to the wrist
    offsets = points - points[0]

    # Largest Euclidean distance of any landmark from the wrist
    reach = np.linalg.norm(offsets, axis=1).max()

    # Skip the scaling when all points coincide, to avoid dividing by zero
    if reach > 0:
        offsets = offsets / reach

    return offsets.flatten()

In [None]:
# Live webcam inference: detect a hand in each frame, classify its landmarks
# with the loaded model and overlay the predicted label. Press 'q' to quit.

# Setup
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# Start video capture
cap = cv2.VideoCapture(0)

# try/finally guarantees the camera and the window are released even if an
# exception is raised while processing a frame.
try:
    with mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.7, max_num_hands=2) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Mirror the image and convert BGR -> RGB before running hand detection.
            frame = cv2.flip(frame, 1)
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(rgb_frame)

            if results.multi_hand_landmarks:
                # Only the first detected hand is drawn and classified.
                hand_landmarks = results.multi_hand_landmarks[0]
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                landmarks = extract_hand_landmarks(results)
                if landmarks:
                    # Center and scale landmarks, then add a batch dimension
                    landmarks = center_landmarks(landmarks)
                    features = torch.tensor(landmarks, dtype=torch.float32).unsqueeze(0)

                    # Make prediction
                    with torch.no_grad():
                        output = model(features)
                        _, predicted = torch.max(output, 1)
                    predicted_label = le.inverse_transform([predicted.item()])[0]

                    # Display prediction
                    cv2.putText(frame, f'Predicted: {predicted_label}', (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Show every frame, including those without a hand, so the preview
            # does not freeze when no hand is in view.
            cv2.imshow('LIS alphabet', frame)

            # Single key poll per frame; 'q' quits.
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    cap.release()
    cv2.destroyAllWindows()

# NOTE(review): the message says "Data collection" although this cell runs
# inference; text kept unchanged, consider rewording.
print("Data collection finished!")


I0000 00:00:1744562585.001270   20319 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1744562585.005143   20463 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.2.8-1ubuntu1~24.04.1), renderer: AMD Radeon Graphics (radeonsi, renoir, LLVM 19.1.1, DRM 3.59, 6.11.0-19-generic)
W0000 00:00:1744562585.030738   20451 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1744562585.066582   20452 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Data collection finished!
