In [1]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
import mediapipe as mp
import tensorflow as tf
from tensorflow.keras.layers import Layer
from sklearn.preprocessing import LabelEncoder

# 1. Load Images

In [2]:
def load_images_from_folder(folder):
    images = []
    labels = []
    for label in os.listdir(folder):
        dir_path = os.path.join(folder, label)
        for image in os.listdir(dir_path):
            img_path = os.path.join(dir_path, image)
            img = cv2.imread(img_path)
            img = cv2.resize(img, (240, 240))  # Resize all images to 540x960
            images.append(img)
            labels.append(label)
    return images, labels

dataset_folder = "../datasets/Filtered Raw Images"
images, labels = load_images_from_folder(dataset_folder)

In [3]:
len(images)

4557

In [6]:
# Ensure images and labels are arrays
images = np.array(images)
labels = np.array(labels)

display(images.shape, labels.shape)

(4557, 960, 540, 3)

(4557,)

In [7]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=42)
display(len(X_train), len(X_test))

3645

912

# 2. Mediapipe Integration

In [6]:
mp_hands = mp.solutions.hands
# Initialize hand landmark detection with specific parameters
hands = mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5, max_num_hands=2)

def detect_landmarks(image):
    landmarks_coordinate = []
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(image_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            for idx, landmark in enumerate(hand_landmarks.landmark):
                print(f'x{idx} = {landmark.x}')
                landmarks_coordinate.append(landmark.x)
                print(f'y{idx} = {landmark.y}')
                landmarks_coordinate.append(landmark.y)
                print(f'z{idx} = {landmark.z}')
                landmarks_coordinate.append(landmark.z)
    return landmarks_coordinate
                

# Example usage
example_image = X_train[0]
landmarks = detect_landmarks(example_image)
print(landmarks)


x0 = 0.7126893997192383
y0 = 0.5958437919616699
z0 = -2.822761757670378e-07
x1 = 0.7068435549736023
y1 = 0.5655960440635681
z1 = 0.0005654096021316946
x2 = 0.6719934940338135
y2 = 0.5394784808158875
z2 = -0.006420113146305084
x3 = 0.6180561780929565
y3 = 0.5358845591545105
z3 = -0.014703338965773582
x4 = 0.5825240612030029
y4 = 0.5438482165336609
z4 = -0.024391671642661095
x5 = 0.6728824973106384
y5 = 0.5193976759910583
z5 = -0.014605844393372536
x6 = 0.5981832146644592
y6 = 0.5157224535942078
z6 = -0.0312117338180542
x7 = 0.5640261769294739
y7 = 0.5215676426887512
z7 = -0.04208071902394295
x8 = 0.5449411273002625
y8 = 0.5251739025115967
z8 = -0.04884830862283707
x9 = 0.6578308343887329
y9 = 0.5311128497123718
z9 = -0.023794526234269142
x10 = 0.5807985067367554
y10 = 0.5289844870567322
z10 = -0.03685164824128151
x11 = 0.5745165348052979
y11 = 0.5382302403450012
z11 = -0.04019805043935776
x12 = 0.579787015914917
y12 = 0.5417948961257935
z12 = -0.04272283613681793
x13 = 0.642976999282836



# 3. Custom Layer untuk DNN

## 3.1. Version 1

In [174]:
# Define the LandmarksDetector layer
class LandmarksDetector(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(LandmarksDetector, self).__init__(**kwargs)

    def call(self, inputs):
        all_landmarks_coordinate = []
        for image in inputs:
            skip = False
            landmarks_coordinate = []
            image = tf.cast(image, tf.uint8)
            # image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            image_rgb = image[..., ::-1]  # Convert BGR to RGB
            image_rgb = tf.image.convert_image_dtype(image_rgb, dtype=tf.uint8)

            #---------------------------------------------------------------------
            results = hands.process(image_rgb.numpy())

            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    print(len(hand_landmarks.landmark))
                    if len(hand_landmarks.landmark) == 21 or len(hand_landmarks.landmark) == 42:
                        for idx, landmark in enumerate(hand_landmarks.landmark):
                            landmarks_coordinate.append(landmark.x)
                            landmarks_coordinate.append(landmark.y)
                            landmarks_coordinate.append(landmark.z)
                    else:
                        skip = True
                if len(landmarks_coordinate) > 63 and not skip:
                    all_landmarks_coordinate.append(landmarks_coordinate[:63])
                    all_landmarks_coordinate.append(landmarks_coordinate[63:])
                elif not skip:
                    all_landmarks_coordinate.append(landmarks_coordinate)
        return tf.convert_to_tensor(all_landmarks_coordinate, dtype=tf.float32)
        # return all_landmarks_coordinate

## 3.2. Version 2

In [184]:
class LandmarksDetectorV2(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(LandmarksDetectorV2, self).__init__(**kwargs)
        self.holistic = mp.solutions.holistic.Holistic(static_image_mode=True)

    def call(self, inputs):
        def get_landmarks(image):
            image = tf.cast(image, tf.uint8)
            image_rgb = image[..., ::-1]  # Convert BGR to RGB
            image_rgb = tf.image.convert_image_dtype(image_rgb, dtype=tf.uint8)

            def _get_landmarks(image_rgb_np):
                results = self.holistic.process(image_rgb_np)
                if results.pose_landmarks:
                    landmarks = [(lm.x, lm.y, lm.z) for lm in results.pose_landmarks.landmark]
                    return np.array(landmarks, dtype=np.float32).flatten()
                else:
                    return np.zeros(33 * 3, dtype=np.float32)

            landmarks = tf.numpy_function(_get_landmarks, [image_rgb], tf.float32)
            
            return landmarks

        landmarks = tf.map_fn(get_landmarks, inputs, dtype=tf.float32)
        return landmarks


## 3.3. Version 3

In [8]:
class LandmarksDetectorV3(tf.keras.layers.Layer):
    def __init__(self, scaler_path, **kwargs):
        super(LandmarksDetectorV3, self).__init__(**kwargs)
        self.mp_hands = mp.solutions.hands
        self.hands = self.mp_hands.Hands()
        self.scaler = joblib.load(scaler_path)  # Load the scaler

    def call(self, inputs):
        def get_landmarks(image):
            image = tf.cast(image, tf.uint8)
            image_rgb = image[..., ::-1]  # Convert BGR to RGB
            image_rgb = tf.image.convert_image_dtype(image_rgb, dtype=tf.uint8)

            def _get_landmarks(image_rgb_np):
                results = self.hands.process(image_rgb_np)
                landmarks_coordinate = np.zeros(63, dtype=np.float32)  # Fixed size array of zeros
                if results.multi_hand_landmarks:
                    hand_landmarks = results.multi_hand_landmarks[0]
                    for idx, landmark in enumerate(hand_landmarks.landmark):
                        if idx < 21:  # We only consider the first 21 landmarks
                            landmarks_coordinate[idx * 3] = landmark.x
                            landmarks_coordinate[idx * 3 + 1] = landmark.y
                            landmarks_coordinate[idx * 3 + 2] = landmark.z
                return landmarks_coordinate

            landmarks = tf.numpy_function(_get_landmarks, [image_rgb], tf.float32)
            landmarks.set_shape([63])  # Explicitly set the shape of the output tensor
            return landmarks

        landmarks = tf.map_fn(get_landmarks, inputs, dtype=tf.float32)
        landmarks.set_shape([inputs.shape[0], 63])  # Explicitly set the shape of the output tensor

        # Apply the scaler to the landmarks
        def apply_scaler(landmarks_np):
            return self.scaler.transform(landmarks_np.reshape(1, -1)).flatten()

        scaled_landmarks = tf.numpy_function(apply_scaler, [landmarks], tf.float32)
        scaled_landmarks.set_shape([inputs.shape[0], 63])  # Ensure the shape is correct after scaling

        return scaled_landmarks

    def compute_output_shape(self, input_shape):
        return (input_shape[0], 63)

In [17]:
image = X_train[0][0][0]
image_rgb = image[..., ::-1]  # Convert BGR to RGB
image_rgb = tf.image.convert_image_dtype(image_rgb, dtype=tf.uint8)

display(image, image_rgb)

array([10,  1,  1], dtype=uint8)

<tf.Tensor: shape=(3,), dtype=uint8, numpy=array([ 1,  1, 10], dtype=uint8)>

In [18]:
# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands()

# Load an example image
image = X_train[:10]
# print(type(image), image.shape)

# Expand dimensions to create a batch of size 1
image_batch = tf.convert_to_tensor(image)
# image_batch = tf.expand_dims(image, 0)
print(type(image_batch), image_batch.shape)

# Create an instance of the LandmarksDetector layer
landmarks_detector = LandmarksDetectorV3()

# Pass the image through the LandmarksDetector layer
landmarks = landmarks_detector(image_batch)

# Convert the landmarks tensor to a NumPy array for inspection
# landmarks_array = landmarks
landmarks_array = landmarks.numpy()

# Print the shape of the extracted landmarks
print("Extracted Landmarks Shape:", landmarks_array.shape)
print("Extracted Landmarks type:", type(landmarks_array))
print("Extracted Landmarks:", landmarks_array)

# Don't forget to close the hands instance
hands.close()


<class 'tensorflow.python.framework.ops.EagerTensor'> (10, 960, 540, 3)


  return py_builtins.overload_of(f)(*args)


Extracted Landmarks Shape: (10, 63)
Extracted Landmarks type: <class 'numpy.ndarray'>
Extracted Landmarks: [[ 6.50429189e-01  7.65751958e-01  5.50131404e-07  6.11938357e-01
   7.33383536e-01 -1.35441273e-02  5.35249114e-01  7.22817481e-01
  -2.79281605e-02  4.65442121e-01  7.31120050e-01 -3.91142108e-02
   4.14158881e-01  7.32273340e-01 -5.05237058e-02  5.13722479e-01
   7.15550065e-01 -5.68768531e-02  4.23396856e-01  7.09593713e-01
  -8.50891545e-02  3.65506083e-01  7.05735862e-01 -9.82945263e-02
   3.19084823e-01  7.02936769e-01 -1.04017541e-01  5.05995333e-01
   7.45270610e-01 -5.81868589e-02  4.07936394e-01  7.56167054e-01
  -8.43833536e-02  3.41932833e-01  7.66833425e-01 -9.30837318e-02
   2.91373193e-01  7.76903272e-01 -9.70777795e-02  5.10187984e-01
   7.75411189e-01 -5.61929680e-02  4.48203862e-01  7.83267856e-01
  -7.50082582e-02  4.74850684e-01  7.78621018e-01 -5.97336926e-02
   5.01763761e-01  7.75916219e-01 -4.30598781e-02  5.23284853e-01
   8.01351249e-01 -5.47791123e-02  

In [194]:
print("Extracted Landmarks:", len(landmarks_array[0]))

Extracted Landmarks: 63


# 4. Training Model with Custom Layer

In [9]:
num_classes = len(np.unique(y_train))

In [22]:
# Define your DNN model
def create_model(num_classes):
    inputs = tf.keras.Input(shape=(None, None, None))
    print(inputs, end="\n\n")
    x = LandmarksDetectorV3()(inputs)
    print(x, end="\n\n")
    x = tf.keras.layers.Flatten()(x)
    print(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model



# Initialize the model
model = create_model(num_classes)

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()
# Train the model


KerasTensor(type_spec=TensorSpec(shape=(None, None, None, None), dtype=tf.float32, name='input_6'), name='input_6', description="created by layer 'input_6'")

KerasTensor(type_spec=TensorSpec(shape=(None, 63), dtype=tf.float32, name=None), name='landmarks_detector_v3_5/map/TensorArrayV2Stack/TensorListStack:0', description="created by layer 'landmarks_detector_v3_5'")

KerasTensor(type_spec=TensorSpec(shape=(None, 63), dtype=tf.float32, name=None), name='flatten_3/Reshape:0', description="created by layer 'flatten_3'")
Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, None, None, Non   0         
                             e)]                                 
                                                                 
 landmarks_detector_v3_5 (L  (None, 63)                0         
 andmarksDetectorV3)                                          

In [23]:
# Encode string labels to integer labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
np.unique(y_train_encoded)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int64)

In [24]:
# Assuming you have your dataset and labels prepared
train_images = X_train
train_labels = y_train_encoded
model.fit(train_images, train_labels, epochs=10, batch_size=32)

# Don't forget to close the hands instance
hands.close()

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


NameError: name 'hands' is not defined

In [27]:
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test_encoded)

# Print the test loss and accuracy
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")



Test Loss: 1.261136770248413
Test Accuracy: 0.5866228342056274


In [33]:
!pip install tf2onnx

Collecting tf2onnx
  Downloading tf2onnx-1.16.1-py3-none-any.whl.metadata (1.3 kB)
Downloading tf2onnx-1.16.1-py3-none-any.whl (455 kB)
   ---------------------------------------- 0.0/455.8 kB ? eta -:--:--
   -------- ------------------------------- 92.2/455.8 kB 2.6 MB/s eta 0:00:01
   -------- ------------------------------- 92.2/455.8 kB 2.6 MB/s eta 0:00:01
   ------------------- -------------------- 225.3/455.8 kB 1.7 MB/s eta 0:00:01
   ------------------- -------------------- 225.3/455.8 kB 1.7 MB/s eta 0:00:01
   --------------------- ------------------ 245.8/455.8 kB 1.1 MB/s eta 0:00:01
   -------------------------- ------------- 307.2/455.8 kB 1.1 MB/s eta 0:00:01
   ------------------------------- -------- 358.4/455.8 kB 1.1 MB/s eta 0:00:01
   ---------------------------------- ----- 389.1/455.8 kB 1.1 MB/s eta 0:00:01
   ---------------------------------------- 455.8/455.8 kB 1.1 MB/s eta 0:00:00
Installing collected packages: tf2onnx
Successfully installed tf2onnx-1.16.



In [35]:
model.save('Handsign_detection')

INFO:tensorflow:Assets written to: Handsign_detection\assets


INFO:tensorflow:Assets written to: Handsign_detection\assets
