<h1><u>Architecture</u></h1>

<img src="architecture.jpeg" />

<h1><u>Implementation</u></h1>

In [None]:
! pip install protobuf
! pip install pynput
! pip install mediapipe

In [1]:
import cv2
import numpy as np
import time
import mediapipe as mp
from google.protobuf.json_format import MessageToDict
from pynput.keyboard import Key, Controller

In [2]:
# Size of the composed output window in pixels. Note the naming: "LENGTH" is the
# horizontal extent (columns) and "WIDTH" the vertical extent (rows) -- App()
# allocates its canvas as (WINDOW_WIDTH, WINDOW_LENGTH, 3).
WINDOW_LENGTH = 640
WINDOW_WIDTH = 480
# Address of a network video stream. Not referenced elsewhere in the visible code.
VIDEO_SOURCE_URL = "http://192.168.1.100:4747/video"
# Geometry of the text panel on the left of the window: "width" is its row count
# and "length" its column count (same naming convention as the window constants).
STATUS_PANEL = {
    "origin": (0, 0),
    "width": WINDOW_WIDTH,
    "length": 170
}
# Shared MediaPipe hand detector, created once and reused for every frame.
HAND_DETECTOR = mp.solutions.hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)
# Landmark indices that detect_hand_landmark_data() joins to the palm centre with a line.
LINE_END_KEYPOINTS = [4, 3, 2, 1, 0, 20, 16, 12, 8]
# Drawing colours per detected hand; the frames they are drawn on are handled as BGR.
LEFT_HAND_COLORS = {"keypoint": (0, 255, 0), "line": (0, 0, 255), "palm_center": (0, 255, 255)}
RIGHT_HAND_COLORS = {"keypoint": (0, 0, 255), "line": (204, 0, 0), "palm_center": (0, 255, 255)}
# (x, y) text origins passed to cv2.putText for each read-out on the status panel.
FPS_ORIGIN = (22, 30)
DETECTED_HAND_ORIGIN = (22, 70)
DETECTED_HAND_CONF_ORIGIN = (22, 100)
PALM_CENTER_ORIGIN = (22, 130)
# Origin of the first finger row; App() places the following rows 30 px apart below it.
ERRECTED_FINGER_ORIGIN = (22, 180)
DRIVING_STATUS_ORIGIN = (22, 350)
DRIVING_DIRECTION_ORIGIN = (22, 415)
# pynput controller used by drive() to press and release keys.
KEYBOARD = Controller()

# Mutable module state: written by set_driving_status_and_direction() and read
# by drive() and App().
driving_direction, driving_status = "N/A", "RELEASED"

In [3]:
def detect_hand_landmark_data(image, draw=True):
    """Detect a hand in a BGR frame and return its landmarks in pixel coordinates.

    Args:
        image: BGR frame as a NumPy array; its first two dimensions are read as
            (height, width).
        draw: When True, keypoints, palm-centre lines and the palm centre are
            drawn onto the returned image.

    Returns:
        dict with the keys
            "image":       a copy of the input frame, annotated when ``draw`` is
                           True and a hand was found. It is never None.
            "hand":        handedness label reported by the detector, or "N/A".
            "score":       handedness confidence as a percentage string such as
                           "97%", or "N/A".
            "palm_center": (x, y) pixel tuple, or the string "N/A".
            "keypoints":   list of (x, y) pixel tuples, empty when no hand is found.
    """
    img_y, img_x = image.shape[0:2]
    annotated = image.copy()

    # MediaPipe expects RGB input. The RGB array is only handed to the detector,
    # so it is marked read-only (as the MediaPipe examples do) and never
    # converted back: the untouched BGR copy is what gets returned.
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    rgb_image.flags.writeable = False
    results = HAND_DETECTOR.process(rgb_image)

    result = {"image": annotated, "hand": "N/A", "score": "N/A", "palm_center": "N/A", "keypoints": []}

    if not results.multi_hand_landmarks:
        return result

    # Default palette so `colors` is always bound, even if no handedness entry
    # accompanies the landmarks.
    colors = LEFT_HAND_COLORS
    for handedness in results.multi_handedness:
        classification = handedness.classification[0]
        result["hand"] = classification.label
        result["score"] = str(int(classification.score * 100)) + "%"
        colors = RIGHT_HAND_COLORS if classification.label == "Right" else LEFT_HAND_COLORS

    for landmarks in results.multi_hand_landmarks:
        # Landmarks are normalised to [0, 1]; scale them to pixel coordinates.
        # Reading the proto fields directly avoids building a dict per keypoint.
        for keypoint in landmarks.landmark:
            point = (int(keypoint.x * img_x), int(keypoint.y * img_y))
            result["keypoints"].append(point)

            if draw:
                cv2.circle(annotated, point, 3, colors["keypoint"], -1)

        # Keypoints of every detected hand are appended to the same list, so the
        # indices below always address the first detected hand.
        keypoints = result["keypoints"]
        # Palm centre: x is halfway between landmarks 17 and 5 (pinky and index
        # knuckles in the MediaPipe hand model), y is halfway between landmarks
        # 1 and 5.
        palm_center_x = keypoints[17][0] + (keypoints[5][0] - keypoints[17][0]) // 2
        palm_center_y = keypoints[1][1] - (keypoints[1][1] - keypoints[5][1]) // 2
        palm_center = (palm_center_x, palm_center_y)
        result["palm_center"] = palm_center

        if draw:
            for kpt in LINE_END_KEYPOINTS:
                cv2.line(annotated, palm_center, keypoints[kpt], colors["line"], 1)
            cv2.circle(annotated, palm_center, 4, colors["palm_center"], -1)

    return result

In [4]:
def get_errected_finger_data(landmark_data):
    """Classify each finger as "UP" or "DOWN" from the hand keypoints.

    Args:
        landmark_data: dict with a "keypoints" list of (x, y) tuples and a
            "hand" label.

    Returns:
        dict mapping "thumb", "index", "middle", "ring" and "little" to "UP" or
        "DOWN". Every value is "N/A" when the keypoint list is empty.
    """
    tip_by_finger = {"index": 8, "middle": 12, "ring": 16, "little": 20}
    states = dict.fromkeys(("thumb", "index", "middle", "ring", "little"), "N/A")

    points = landmark_data["keypoints"]
    if len(points) == 0:
        return states

    # The thumb is judged horizontally: its tip (keypoint 4) is compared with
    # keypoint 3, and the side that counts as "UP" depends on the hand label.
    tip_x, joint_x = points[4][0], points[3][0]
    if landmark_data["hand"] == "Right":
        thumb_is_up = tip_x < joint_x
    else:
        thumb_is_up = tip_x > joint_x
    states["thumb"] = "UP" if thumb_is_up else "DOWN"

    # The other fingers are judged vertically: a tip with a smaller y than the
    # keypoint just before it counts as "UP".
    for finger, tip in tip_by_finger.items():
        tip_is_higher = points[tip][1] < points[tip - 1][1]
        states[finger] = "UP" if tip_is_higher else "DOWN"

    return states

In [5]:
def set_driving_status_and_direction(hand, errected_finger_data):
    """Update the module-level driving state from the current hand gesture.

    Gestures:
        * no hand ("N/A")                  -> status "RELEASED", direction "N/A"
        * all five fingers "UP"            -> status "BREAKED" (direction untouched)
        * only index "UP" (thumb ignored)  -> "DRIVING", "<FRONT|BACK>_LEFT"
        * only little "UP" (thumb ignored) -> "DRIVING", "<FRONT|BACK>_RIGHT"
        * all five fingers "DOWN"          -> "DRIVING", "FRONT" or "BACK"
    A "Left" hand selects FRONT; any other hand label selects BACK. Any other
    finger combination leaves both globals as they were.

    Args:
        hand: handedness label, or "N/A" when no hand was detected.
        errected_finger_data: dict mapping each finger name to "UP"/"DOWN".
    """
    global driving_direction, driving_status

    if hand == "N/A":
        driving_status, driving_direction = "RELEASED", "N/A"
        return

    thumb = errected_finger_data["thumb"]
    # Index, middle, ring, little -- in that order.
    others = tuple(errected_finger_data[name] for name in ("index", "middle", "ring", "little"))

    if thumb == "UP" and others == ("UP", "UP", "UP", "UP"):
        driving_status = "BREAKED"
        return

    if others == ("UP", "DOWN", "DOWN", "DOWN"):
        turn = "_LEFT"
    elif others == ("DOWN", "DOWN", "DOWN", "UP"):
        turn = "_RIGHT"
    elif thumb == "DOWN" and others == ("DOWN", "DOWN", "DOWN", "DOWN"):
        turn = ""
    else:
        # Unrecognised gesture: keep whatever state was set previously.
        return

    driving_status = "DRIVING"
    driving_direction = ("FRONT" if hand == "Left" else "BACK") + turn

In [6]:
def drive(hold_seconds=0.08):
    """Press, hold and release the keys that match the current driving state.

    Reads the module-level ``driving_status`` and ``driving_direction``. Does
    nothing when the status is "RELEASED" or when no key is mapped to the
    current status/direction pair.

    Args:
        hold_seconds: how long the keys are held down before being released.
            The default matches the previous fixed 80 ms hold.
    """
    if driving_status == "BREAKED":
        keys = (Key.space,)
    elif driving_status == "DRIVING":
        # NOTE(review): the *_LEFT directions press Key.right and the *_RIGHT
        # directions press Key.left. The mapping is preserved as found --
        # confirm whether this inversion is intentional.
        keys = {
            "FRONT": (Key.up,),
            "BACK": (Key.down,),
            "FRONT_LEFT": (Key.up, Key.right),
            "FRONT_RIGHT": (Key.up, Key.left),
            "BACK_LEFT": (Key.down, Key.right),
            "BACK_RIGHT": (Key.down, Key.left),
        }.get(driving_direction, ())
    else:
        keys = ()

    # Nothing to press. This also prevents KEYBOARD.press(None) from being
    # reached for an unmapped status or direction.
    if not keys:
        return

    pressed = []
    try:
        for key in keys:
            KEYBOARD.press(key)
            pressed.append(key)
        # Plain sleep rather than cv2.waitKey: waitKey reads keyboard input
        # from the OpenCV window, so using it as a delay here discards key
        # presses (e.g. ESC) that the main loop is waiting for.
        time.sleep(hold_seconds)
    finally:
        # Always release what was pressed so a key cannot stay stuck down if
        # something fails mid-way.
        for key in pressed:
            KEYBOARD.release(key)

In [10]:
def _draw_status_panel(display_image, fps, hand_landmark_data, errected_finger_data):
    """Write the FPS, hand, finger and driving read-outs onto ``display_image``."""
    font = cv2.FONT_HERSHEY_SIMPLEX
    white = (255, 255, 255)
    orange, green, red = (0, 128, 255), (0, 255, 0), (0, 0, 255)

    cv2.putText(display_image, "FPS: {0}".format(fps), FPS_ORIGIN, font, 0.75, white, 2)

    rows = [
        ("Hand: {0}".format(hand_landmark_data["hand"]), DETECTED_HAND_ORIGIN),
        ("Conf: {0}".format(hand_landmark_data["score"]), DETECTED_HAND_CONF_ORIGIN),
        ("Palm: {0}".format(hand_landmark_data["palm_center"]), PALM_CENTER_ORIGIN),
        ("Status:", DRIVING_STATUS_ORIGIN),
        ("Direction:", DRIVING_DIRECTION_ORIGIN),
    ]
    # One row per finger, stacked 30 px apart below ERRECTED_FINGER_ORIGIN.
    finger_x, finger_y = ERRECTED_FINGER_ORIGIN
    for row, finger in enumerate(("thumb", "index", "middle", "ring", "little")):
        text = "{0}: {1}".format(finger.capitalize(), errected_finger_data[finger])
        rows.append((text, (finger_x, finger_y + 30 * row)))
    for text, origin in rows:
        cv2.putText(display_image, text, origin, font, 0.5, white, 1)

    status_color = {"DRIVING": green, "BREAKED": red}.get(driving_status, orange)
    if driving_direction in ("FRONT", "FRONT_LEFT", "FRONT_RIGHT"):
        direction_color = green
    elif driving_direction in ("BACK", "BACK_LEFT", "BACK_RIGHT"):
        direction_color = red
    else:
        direction_color = orange

    cv2.putText(display_image, driving_status,
                (DRIVING_STATUS_ORIGIN[0], DRIVING_STATUS_ORIGIN[1] + 30),
                font, 0.46, status_color, 1)
    cv2.putText(display_image, driving_direction,
                (DRIVING_DIRECTION_ORIGIN[0], DRIVING_DIRECTION_ORIGIN[1] + 30),
                font, 0.46, direction_color, 1)


def App(source=1):
    """Run the gesture-controller loop until ESC is pressed or the capture closes.

    For every frame read from the capture: mirror it, detect the hand, derive
    the finger states, update the driving state, send the matching key presses,
    then show the composed window (status panel on the left, camera view on the
    right) and save it as a JPEG under ``frames/``.

    Args:
        source: device index or stream URL handed to ``cv2.VideoCapture``.
            Defaults to device 1, the previously hard-coded source.
    """
    frame_index = 0
    video_source = cv2.VideoCapture(source)
    video_source.set(cv2.CAP_PROP_FRAME_WIDTH, WINDOW_LENGTH)
    video_source.set(cv2.CAP_PROP_FRAME_HEIGHT, WINDOW_WIDTH)
    prev_frame_time = 0

    # try/finally so the camera and the window are released even when a frame
    # fails to process; otherwise the device stays locked until the kernel restarts.
    try:
        while video_source.isOpened():
            ret, frame = video_source.read()

            if ret:
                current_frame_time = time.time()
                elapsed = current_frame_time - prev_frame_time
                # Two frames can carry the same timestamp on coarse clocks;
                # report 0 instead of dividing by zero.
                fps = int(1 / elapsed) if elapsed > 0 else 0
                prev_frame_time = current_frame_time

                # Mirror the frame horizontally before detection.
                image = cv2.flip(frame, 1)

                hand_landmark_data = detect_hand_landmark_data(image)
                errected_finger_data = get_errected_finger_data(hand_landmark_data)
                set_driving_status_and_direction(hand_landmark_data["hand"], errected_finger_data)

                # Black canvas: the first STATUS_PANEL["length"] columns are the
                # status panel, the rest holds the resized camera view.
                panel_columns = STATUS_PANEL["length"]
                camera_view = cv2.resize(
                    hand_landmark_data["image"],
                    (WINDOW_LENGTH - panel_columns, WINDOW_WIDTH),
                )
                display_image = np.zeros((WINDOW_WIDTH, WINDOW_LENGTH, 3), dtype="uint8")
                display_image[:, panel_columns:] = camera_view

                drive()

                _draw_status_panel(display_image, fps, hand_landmark_data, errected_finger_data)

                cv2.imshow("LIVE | Hand Gesture Game Controller", display_image)
                # Every composed frame is also written to disk.
                # NOTE(review): presumably the frames/ directory must already
                # exist for the write to succeed -- confirm.
                cv2.imwrite("frames/frame-{0}.jpg".format(frame_index), display_image)
                frame_index += 1

            # 27 is the ESC key code.
            if cv2.waitKey(1) == 27:
                break
    finally:
        video_source.release()
        cv2.destroyAllWindows()

# Start the controller; this call returns only after App()'s capture loop ends.
App()

<h1><u>References</u></h1>

<ul>
    <li><a href="https://google.github.io/mediapipe/solutions/hands">Mediapipe Hands</a></li>
    <li><a href="https://pynput.readthedocs.io/en/latest/">Pynput</a></li>
    <li><a href="https://www.gametop.com/download-free-games/dirt-rally-driver-hd/">Dirt Rally Driver Car Game</a></li>
</ul>