**INTRO TO HOW YOU USE THIS:**
1. WHEN YOU RUN THE SCRIPT IT WILL OPEN AN EXTERNAL CAMERA WINDOW – THIS IS WHAT WE ARE USING TO LOG THE DATA.
2. IF YOU WISH TO ENTER LOGGING MODE (WHERE YOU CAN COLLECT THE DATAPOINTS FOR THE HANDSIGN YOU'RE MAKING) PRESS "1"
3. MAKE THE HAND SIGN YOU WANT AND PRESS THE CORRESPONDING KEY ON THE KEYBOARD AND IT WILL BE LOGGED (A MESSAGE WILL APPEAR BRIEFLY INDICATING WHAT LETTER YOU PRESSED)
4. TO EXIT LOGGING MODE, PRESS "0"
5. TO EXIT THE PROGRAM ALTOGETHER, PRESS "ESC"

_A note on how the data is structured in the .csv: each row represents one logged hand sign, i.e. one key press made while the program is in logging mode. The first column is the letter that was pressed. The following 42 columns are the X and Y 'coordinates' of each of the 21 hand landmarks. The first two coordinate columns are always 0,0, because they belong to the palm node from which the relative position of every other landmark is calculated. While this may seem odd, it makes much more sense to describe the finger positions relative to the palm than by where they appear pixel-wise in the image: pixel positions change whenever the hand moves, which would make the model very rigid and prone to overfitting._

In [40]:
# pip install tensorflow mediapipe opencv-python

In [41]:
import csv
import copy
import itertools

import cv2 as cv
import numpy as np
import mediapipe as mp
import tensorflow as tf

In [42]:
class KeyPointClassifier:
    """Callable wrapper around a TensorFlow Lite interpreter.

    The interpreter is created from ``model_path`` and its tensors are
    allocated once at construction; calling the instance runs one inference.

    Args:
        model_path: Path of the ``.tflite`` model handed to the interpreter.
        num_threads: Thread count handed to the interpreter.
    """

    def __init__(self, model_path='./data/keypoint_classifier.tflite',
                 num_threads=1):
        interpreter = tf.lite.Interpreter(
            model_path=model_path,
            num_threads=num_threads,
        )
        interpreter.allocate_tensors()

        self.interpreter = interpreter
        self.input_details = interpreter.get_input_details()
        self.output_details = interpreter.get_output_details()

    def __call__(self, landmark_list):
        """Run one inference and return the index of the highest output value.

        Args:
            landmark_list: Feature values for one sample; wrapped in a
                one-element batch and converted to ``float32`` before being
                written to the first input tensor.

        Returns:
            The ``np.argmax`` of the squeezed first output tensor.
        """
        input_index = self.input_details[0]['index']
        batch = np.array([landmark_list], dtype=np.float32)
        self.interpreter.set_tensor(input_index, batch)
        self.interpreter.invoke()

        output_index = self.output_details[0]['index']
        raw_output = self.interpreter.get_tensor(output_index)

        return np.argmax(np.squeeze(raw_output))

In [43]:
def main():
    """Run the webcam loop: detect hands, classify the sign, optionally log it.

    Each frame is mirrored, passed through MediaPipe Hands, and every
    detected hand is converted to a normalised landmark vector. That vector
    is handed to ``logging_csv`` (which only writes in logging mode when a
    letter key was pressed) and to the keypoint classifier, whose label is
    drawn on the preview window.

    Keys: ESC (code 27) leaves the loop; every other key code is forwarded
    to ``select_mode``.

    The camera and the MediaPipe graph are released even when an exception
    is raised inside the loop.
    """
    cap_device = 0
    cap_width = 960
    cap_height = 540

    use_brect = True

    # Load the model and the labels before opening the camera, so a missing
    # or broken file cannot leave the capture device open.
    keypoint_classifier = KeyPointClassifier()

    with open('./data/keypoint_classifier_labels.csv', encoding='utf-8-sig') as f:
        # Skip empty rows (e.g. a trailing blank line) instead of failing
        # with IndexError on row[0].
        keypoint_classifier_labels = [row[0] for row in csv.reader(f) if row]

    cap = cv.VideoCapture(cap_device)
    try:
        cap.set(cv.CAP_PROP_FRAME_WIDTH, cap_width)
        cap.set(cv.CAP_PROP_FRAME_HEIGHT, cap_height)

        # NOTE(review): static_image_mode=True on a live video stream makes
        # MediaPipe treat every frame as an unrelated image, per its docs --
        # confirm this is intended.
        with mp.solutions.hands.Hands(
            model_complexity=0,
            static_image_mode=True,
            max_num_hands=2,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.7,
        ) as hands:
            mode = 0

            while True:
                key = cv.waitKey(10)
                if key == 27:  # ESC
                    break
                if key != -1:
                    # NOTE(review): prints an empty line on every key press;
                    # looks like leftover debug output -- confirm before
                    # removing.
                    print()
                number, mode = select_mode(key, mode)

                ret, image = cap.read()
                if not ret:
                    break
                image = cv.flip(image, 1)  # mirror the frame horizontally
                debug_image = copy.deepcopy(image)

                image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
                image.flags.writeable = False
                results = hands.process(image)
                image.flags.writeable = True

                if results.multi_hand_landmarks is not None:
                    for hand_landmarks, handedness in zip(
                            results.multi_hand_landmarks,
                            results.multi_handedness):
                        brect = calc_bounding_rect(debug_image, hand_landmarks)
                        landmark_list = calc_landmark_list(
                            debug_image, hand_landmarks)

                        pre_processed_landmark_list = pre_process_landmark(
                            landmark_list)

                        # Writes a row only in logging mode with a letter key.
                        logging_csv(number, mode, pre_processed_landmark_list)

                        hand_sign_id = keypoint_classifier(
                            pre_processed_landmark_list)

                        debug_image = draw_bounding_rect(
                            use_brect, debug_image, brect)
                        debug_image = draw_landmarks(debug_image, landmark_list)
                        debug_image = draw_info_text(
                            debug_image,
                            brect,
                            handedness,
                            keypoint_classifier_labels[hand_sign_id],
                            ""
                        )

                debug_image = draw_info(debug_image, mode, number)

                cv.imshow('Hand Gesture Recognition', debug_image)
    finally:
        # Always give the camera back and close the preview window.
        cap.release()
        cv.destroyAllWindows()


def logging_csv(number, mode, landmark_list):
    """Append one labelled landmark row to ``./data/keypoint.csv``.

    A row is written only when ``mode`` is 1 and ``number`` lies in 0..25;
    any other combination is a no-op. The row starts with the lowercase
    letter for ``number`` (0 -> 'a' ... 25 -> 'z'), followed by the values
    of ``landmark_list``.

    Failures while opening or writing the file are reported with ``print``
    and not re-raised.

    Returns:
        None.
    """
    # Not in logging mode, or no letter key selected: nothing to write.
    if mode != 1 or not (0 <= number <= 25):
        return

    csv_path = './data/keypoint.csv'
    try:
        with open(csv_path, 'a', newline="") as csv_file:
            row = [chr(number + 97)]
            row.extend(landmark_list)
            csv.writer(csv_file).writerow(row)
    except FileNotFoundError:
        print(f"Error: CSV path '{csv_path}' not found.")
    except Exception as e:
        print(f"Error while logging to CSV: {e}")
    return

def select_mode(key, mode):
    """Translate a key code into a letter index and/or a mode switch.

    Args:
        key: Key code to interpret (the caller passes the ``cv.waitKey``
            result, which is -1 when no key was pressed).
        mode: Current mode; returned unchanged unless '0' or '1' is pressed.

    Returns:
        ``(number, mode)`` where ``number`` is 0..25 for the letters a..z
        (either case) and -1 for any other key, and ``mode`` is 0 after
        '0', 1 after '1', otherwise the value passed in.
    """
    number = -1
    if ord('a') <= key <= ord('z'):
        number = key - ord('a')
    elif ord('A') <= key <= ord('Z'):
        # With Caps Lock or Shift held, the key arrives as an uppercase
        # code; treat it as the same letter rather than silently ignoring it.
        number = key - ord('A')
    elif key == ord('0'):
        mode = 0  # nothing mode
    elif key == ord('1'):
        mode = 1  # logging mode
    return number, mode

def pre_process_landmark(landmark_list):
    """Convert landmark points to a flat, origin-relative, normalised list.

    The first point is used as the origin: its x/y are subtracted from the
    x/y of every point (so the first two output values are always 0). The
    points are then flattened into one list and divided by the largest
    absolute value, which puts every value in the range [-1, 1].

    The input list is not modified.

    Args:
        landmark_list: Sequence of ``[x, y]`` points.

    Returns:
        A flat list of floats. Returns ``[]`` for an empty input, and all
        zeros when every point coincides with the first one (the maximum is
        0 there, so dividing would raise ``ZeroDivisionError``).
    """
    if len(landmark_list) == 0:
        return []

    base_x, base_y = landmark_list[0][0], landmark_list[0][1]

    # Relative coordinates, flattened in one pass.
    flattened = []
    for point in landmark_list:
        flattened.append(point[0] - base_x)
        flattened.append(point[1] - base_y)
        # Any values past x/y are carried over unchanged.
        flattened.extend(point[2:])

    max_value = max(map(abs, flattened))
    if max_value == 0:
        # Degenerate hand: every landmark sits on the origin point.
        return [0.0] * len(flattened)

    return [value / max_value for value in flattened]

def draw_landmarks(image, landmark_point):
    """Draw the hand skeleton and its key points onto ``image``.

    Every connection is drawn as a thick black line with a thin white line
    on top. Every landmark is drawn as a white filled circle with a black
    outline; indices 4, 8, 12, 16 and 20 (the fingertips) use radius 8, the
    rest radius 5. Nothing is drawn for an empty list.

    Args:
        image: Image to draw on (modified in place).
        landmark_point: Sequence of ``[x, y]`` points, indexed by landmark id.

    Returns:
        The same ``image`` object.
    """
    if len(landmark_point) == 0:
        return image

    # Finger chains first (thumb, index, middle, ring, pinky), then the palm
    # outline; lines are drawn in exactly this order.
    connections = (
        (2, 3), (3, 4),
        (5, 6), (6, 7), (7, 8),
        (9, 10), (10, 11), (11, 12),
        (13, 14), (14, 15), (15, 16),
        (17, 18), (18, 19), (19, 20),
        (0, 1), (1, 2), (2, 5), (5, 9),
        (9, 13), (13, 17), (17, 0),
    )
    for start, end in connections:
        p_start = landmark_point[start]
        p_end = landmark_point[end]
        cv.line(image, tuple(p_start), tuple(p_end), (0, 0, 0), 6)
        cv.line(image, tuple(p_start), tuple(p_end), (255, 255, 255), 2)

    fingertip_ids = [4, 8, 12, 16, 20]
    for idx, point in enumerate(landmark_point):
        radius = 8 if idx in fingertip_ids else 5
        cv.circle(image, tuple(point), radius, (255, 255, 255), -1)
        cv.circle(image, tuple(point), radius, (0, 0, 0), 1)

    return image


def draw_bounding_rect(use_brect, image, brect):
    """Optionally draw a 1-pixel black rectangle around a hand.

    Args:
        use_brect: When falsy, nothing is drawn.
        image: Image to draw on (modified in place).
        brect: ``[x1, y1, x2, y2]`` corners of the rectangle.

    Returns:
        The same ``image`` object.
    """
    if not use_brect:
        return image

    top_left = (brect[0], brect[1])
    bottom_right = (brect[2], brect[3])
    cv.rectangle(image, top_left, bottom_right, (0, 0, 0), 1)
    return image


def draw_info_text(image, brect, handedness, hand_sign_text,
                   finger_gesture_text):
    """Draw the caption bar above a hand's bounding box.

    A filled black strip, 22 pixels tall, is drawn on top of the box and
    the handedness label is written into it, followed by
    ``':' + hand_sign_text`` when that text is not empty. When
    ``finger_gesture_text`` is not empty, an outlined
    "Finger Gesture:..." line is also drawn at position (10, 60).

    Args:
        image: Image to draw on (modified in place).
        brect: ``[x1, y1, x2, y2]`` bounding box of the hand.
        handedness: Object whose ``classification[0].label`` is the
            handedness text.
        hand_sign_text: Label of the recognised sign, or ``""``.
        finger_gesture_text: Label of the finger gesture, or ``""``.

    Returns:
        The same ``image`` object.
    """
    left, top, right = brect[0], brect[1], brect[2]
    cv.rectangle(image, (left, top), (right, top - 22), (0, 0, 0), -1)

    caption = handedness.classification[0].label[0:]
    if hand_sign_text != "":
        caption = caption + ':' + hand_sign_text
    cv.putText(image, caption, (left + 5, top - 4),
               cv.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1, cv.LINE_AA)

    if finger_gesture_text != "":
        gesture_caption = "Finger Gesture:" + finger_gesture_text
        # Thick black pass first, thin white pass on top for an outline.
        for colour, thickness in (((0, 0, 0), 4), ((255, 255, 255), 2)):
            cv.putText(image, gesture_caption, (10, 60),
                       cv.FONT_HERSHEY_SIMPLEX, 1.0, colour, thickness,
                       cv.LINE_AA)

    return image

def calc_bounding_rect(image, landmarks):
    """Compute the pixel bounding box of a hand's landmarks.

    Each landmark's ``x``/``y`` is scaled by the image width/height,
    truncated to an int and capped at ``width - 1`` / ``height - 1``. The
    box is then obtained from ``cv.boundingRect``.

    Args:
        image: Image whose ``shape[0]``/``shape[1]`` give height/width.
        landmarks: Object whose ``landmark`` attribute iterates over points
            with ``x`` and ``y`` attributes.

    Returns:
        ``[x, y, x + w, y + h]`` built from the ``cv.boundingRect`` result.
    """
    height, width = image.shape[0], image.shape[1]

    pixel_points = [
        [min(int(lm.x * width), width - 1),
         min(int(lm.y * height), height - 1)]
        for lm in landmarks.landmark
    ]
    # reshape keeps the (N, 2) layout even when there are no landmarks.
    landmark_array = np.array(pixel_points, dtype=int).reshape(-1, 2)

    left, top, box_w, box_h = cv.boundingRect(landmark_array)

    return [left, top, left + box_w, top + box_h]

def calc_landmark_list(image, landmarks):
    """Convert landmarks to a list of ``[x, y]`` pixel coordinates.

    Each landmark's ``x``/``y`` is scaled by the image width/height,
    truncated to an int and capped at ``width - 1`` / ``height - 1``.

    Args:
        image: Image whose ``shape[0]``/``shape[1]`` give height/width.
        landmarks: Object whose ``landmark`` attribute iterates over points
            with ``x`` and ``y`` attributes.

    Returns:
        A list with one ``[x, y]`` pair per landmark, in iteration order.
    """
    height, width = image.shape[0], image.shape[1]
    max_x, max_y = width - 1, height - 1

    return [
        [min(int(lm.x * width), max_x), min(int(lm.y * height), max_y)]
        for lm in landmarks.landmark
    ]

def draw_info(image, mode, number):
    """Draw the current mode, and the selected letter, in the top-left area.

    Nothing is drawn unless ``mode`` is 1 or 2. In that case a "MODE: ..."
    line is written at (10, 90), and when ``number`` is in 0..25 a
    "LETTER: ..." line with the matching lowercase letter is written at
    (10, 110).

    Args:
        image: Image to draw on (modified in place).
        mode: Current mode number.
        number: Selected letter index, or a value outside 0..25 for none.

    Returns:
        The same ``image`` object.
    """
    if not (1 <= mode <= 2):
        return image

    mode_labels = ('Logging Key Point', 'Not Logging')
    font = cv.FONT_HERSHEY_SIMPLEX
    white = (255, 255, 255)

    cv.putText(image, "MODE: " + mode_labels[mode - 1], (10, 90),
               font, 0.6, white, 1, cv.LINE_AA)

    if 0 <= number <= 25:
        cv.putText(image, "LETTER: " + chr(number + 97), (10, 110),
                   font, 0.6, white, 1, cv.LINE_AA)

    return image


# Script entry point: start the capture loop only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()

I0000 00:00:1744667875.017407 4251853 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1 Pro
W0000 00:00:1744667875.024012 4298759 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1744667875.029027 4298759 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.






