### Hand Tracking with OpenCV and Mediapipe!

**Steps to Get Started:** (Also reference README within the github repo!)
1. Clone this repository
2. Ensure you have Python 3.11.9 installed on your computer (we will go through how to set it up)
3. Create a virtual environment using this python version
4. Run `pip install -r requirements.txt` inside the virtual environment (pins package versions so everyone uses the same ones)
5. Run the first code block with imports to ensure it works properly
6. Once you've installed the necessary packages, you're ready to begin!

Helpful Article:
https://medium.com/@Mert.A/how-to-create-a-finger-counter-with-python-and-mediapipe-cc6c3911ad09

Helpful Video:
https://www.youtube.com/watch?v=RRBXVu5UE-U&t=78s

Docs:
https://mediapipe.readthedocs.io/en/latest/solutions/hands.html#:~:text=%7C-,ML%20Pipeline,a%20dedicated%20hand%20renderer%20subgraph.

Setting Up Python in VS Code:
https://code.visualstudio.com/docs/python/python-tutorial

### Necessary Imports

In [4]:
import pyautogui
import cv2
import mediapipe as mp
import numpy as np
import time

# Reaching this line means every import above completed without raising.
print("Everything imported successfully!")

Everything imported successfully!


### Initialize Hand Detection Model with Mediapipe

In [5]:
# Shorthand for MediaPipe's hand-tracking solution module.
mpHands = mp.solutions.hands
# One detector instance shared by the cells below, configured for at most
# one hand and a minimum detection confidence of 0.7.
hands = mpHands.Hands(max_num_hands=1, min_detection_confidence=0.7)
# Drawing helpers used by later cells to render landmarks on a frame.
mpDraw = mp.solutions.drawing_utils

In [6]:
# Screen size reported by pyautogui. Later cells index it as s[0] (width)
# and s[1] (height) when scaling landmark positions to the screen.
s = pyautogui.size() # store size of gui screen

In [7]:
# Webcam preview: show the mirrored camera feed until 'q' is pressed.

# Initialize webcam (index 0 selects the default camera)
cap = cv2.VideoCapture(0)

try:
    while True:
        # Capture a frame from the webcam.
        # ret is a boolean of whether a frame was read, frame is a numpy array in BGR format
        ret, frame = cap.read()
        if not ret:
            break

        # Flip the frame horizontally for a mirror effect
        frame = cv2.flip(frame, 1)

        # Display the frame in a window titled 'Output'
        cv2.imshow('Output', frame)

        # waitKey(1) waits about 1 ms for a key press and lets the window redraw;
        # exit the loop if 'q' is pressed in the OpenCV window
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the webcam and close any OpenCV windows even if the loop was
    # interrupted or raised, so the camera is not left locked.
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)  # 1 ms wait so the GUI event loop can process the window close

-1

### Hand Tracking with Webcam

In [9]:
# Hand tracking: draw MediaPipe hand landmarks on the mirrored webcam feed.

# Initialize webcam
cap = cv2.VideoCapture(0)
try:
    while True:
        # capture a webcam frame; stop if the camera did not deliver one
        ret, frame = cap.read()
        if not ret:
            break

        # flip horizontally for a mirror effect
        frame = cv2.flip(frame, 1)

        # -------- ADD THIS CODE NEXT ---------------
        # OpenCV delivers BGR frames but MediaPipe expects RGB, so convert a
        # copy for detection and keep the BGR frame for display.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # have mediapipe hands predict hand landmarks
        result = hands.process(rgb_frame)

        # check if predicted landmarks were detected
        # if so draw & output them to the opencv window
        if result.multi_hand_landmarks:
            handLandmarks = result.multi_hand_landmarks[0]

            # draw landmarks on the screen
            mpDraw.draw_landmarks(frame, handLandmarks, mpHands.HAND_CONNECTIONS)
        # ------- FINISHED ADDING NEW CODE ----------

        cv2.imshow('Output', frame)
        # if q is pressed, program exits
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even after an error or
    # a kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)  # 1 ms wait so the GUI event loop can process the window close

-1

<img src="hand_tracking_landmarks.webp" alt="Hand Tracker Visualization with Mediapipe" width="400">

### Moving Cursor Demo

In [10]:
# Moving Cursor: steer the mouse pointer with the index fingertip.

# Initialize webcam
cap = cv2.VideoCapture(0)
try:
    while True:
        # capture a webcam frame; stop if the camera did not deliver one
        ret, frame = cap.read()
        if not ret:
            break

        # flip horizontally for a mirror effect
        frame = cv2.flip(frame, 1)

        # OpenCV delivers BGR frames but MediaPipe expects RGB, so convert a
        # copy for detection and keep the BGR frame for display.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # have mediapipe hands predict hand landmarks
        result = hands.process(rgb_frame)

        # check if predicted landmarks were detected
        # if so draw & output them to the opencv window
        if result.multi_hand_landmarks:

            # grab landmarks of first instance of hand detected
            handLandmarks = result.multi_hand_landmarks[0]

            # landmark 8 is the index fingertip; its x/y are scaled by the
            # screen width (s[0]) and height (s[1]) to get a cursor position
            index_tip = handLandmarks.landmark[8]
            index_x = int(index_tip.x * s[0])
            index_y = int(index_tip.y * s[1])
            pyautogui.moveTo(index_x, index_y, _pause=False)

            # draw landmarks onto opencv window as before
            mpDraw.draw_landmarks(frame, handLandmarks, mpHands.HAND_CONNECTIONS)

        cv2.imshow('Output', frame)
        # if q is pressed, program exits
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even if pyautogui or
    # OpenCV raised inside the loop or the kernel was interrupted.
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)  # 1 ms wait so the GUI event loop can process the window close

-1

### Clicking Using Hand Gestures (Pinch Detection)

In [12]:
# Helper functions for more reliable pinch detection (the earlier approach was buggy)

def normalized_distance(handslms, landmark1, landmark2):
    """Return the straight-line distance between two landmarks of a hand.

    Args:
        handslms: Object whose ``landmark`` sequence holds points with
            ``x`` and ``y`` attributes.
        landmark1: Index of the first point in ``handslms.landmark``.
        landmark2: Index of the second point in ``handslms.landmark``.

    Returns:
        The Euclidean distance computed from the points' ``x`` and ``y``
        values exactly as stored (no scaling is applied here).
    """
    first = handslms.landmark[landmark1]
    first_x, first_y = first.x, first.y
    second = handslms.landmark[landmark2]
    second_x, second_y = second.x, second.y

    squared_gap = (first_x - second_x) ** 2 + (first_y - second_y) ** 2
    return squared_gap ** 0.5

def is_pinching(handslms, threshold=0.05):
    """Report whether the thumb tip and index fingertip are pinched together.

    Args:
        handslms: Hand landmarks object accepted by ``normalized_distance``.
        threshold: Distance below which landmarks 4 (thumb tip) and
            8 (index tip) count as touching.

    Returns:
        True when the distance between landmarks 4 and 8 is strictly less
        than ``threshold``, otherwise False.
    """
    # Landmarks 4 and 8 are the thumb tip and the index fingertip.
    return normalized_distance(handslms, 4, 8) < threshold

def advanced_pinch_detection(handslms, threshold=0.04):
    """Detect a pinch only while thumb and index finger both point upward.

    A finger counts as extended when its tip has a smaller ``y`` value than
    the joint it is compared with (thumb tip 4 vs. thumb MCP 2, index tip 8
    vs. index PIP 6).

    Args:
        handslms: Object whose ``landmark`` sequence holds points with
            ``x`` and ``y`` attributes.
        threshold: Distance below which the thumb tip and index tip count
            as touching.

    Returns:
        False if either finger is not extended; otherwise whether the
        distance between landmarks 4 and 8 is strictly below ``threshold``.
    """
    points = handslms.landmark
    tip_of_thumb, thumb_joint = points[4], points[2]
    tip_of_index, index_joint = points[8], points[6]

    thumb_is_up = tip_of_thumb.y < thumb_joint.y
    index_is_up = tip_of_index.y < index_joint.y

    # Guard clause: without both fingers extended there is no pinch to measure.
    if not (thumb_is_up and index_is_up):
        return False

    return normalized_distance(handslms, 4, 8) < threshold

# Confirmation message only; none of the detection functions are called here.
print("Improved click detection functions loaded!")

Improved click detection functions loaded!


In [17]:
# Clicking: move the cursor with the index fingertip and click by pinching.

# Initialize webcam
cap = cv2.VideoCapture(0)
# -------- ADD THIS VARIABLE ---------------
last_click = 0 # keeps track of time of last click
try:
    while True:
        # capture a webcam frame; stop if the camera did not deliver one
        ret, frame = cap.read()
        if not ret:
            break

        # flip horizontally for a mirror effect
        frame = cv2.flip(frame, 1)
        # frame.shape is (height, width, channels)
        frame_h, frame_w, _ = frame.shape

        # OpenCV delivers BGR frames but MediaPipe expects RGB, so convert a
        # copy for detection and keep the BGR frame for display.
        result = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        if result.multi_hand_landmarks:
            # grab landmarks of first instance of hand detected
            handLandmarks = result.multi_hand_landmarks[0]
            thumb_tip = handLandmarks.landmark[4]
            index_tip = handLandmarks.landmark[8]

            # Scale the index fingertip to the SCREEN size and move the cursor
            # there. The position is computed before it is used, so the first
            # frame no longer relies on values left over from another cell.
            index_x = int(index_tip.x * s[0])
            index_y = int(index_tip.y * s[1])
            pyautogui.moveTo(index_x, index_y, _pause=False)

            # Draw line between thumb tip (4) and index tip (8).
            # Drawing happens on the camera image, so scale to the FRAME size.
            thumb_px = (int(thumb_tip.x * frame_w), int(thumb_tip.y * frame_h))
            index_px = (int(index_tip.x * frame_w), int(index_tip.y * frame_h))
            cv2.line(frame, thumb_px, index_px, (0, 255, 0), 3) # color is green and width is 3

            # Detect click gesture from the distance between thumb (landmark 4)
            # and index (landmark 8). If it is within the threshold and more
            # than 0.5 seconds have passed since the last click, simulate a click.
            now = time.time()
            if is_pinching(handLandmarks) and now - last_click > 0.5:
                pyautogui.click()
                last_click = time.time()

            # **NOTE**: You can replace this function with the more advanced one and try it out!

            # ------- FINISHED ADDING NEW CODE ----------

            mpDraw.draw_landmarks(frame, handLandmarks, mpHands.HAND_CONNECTIONS)

        cv2.imshow('Output', frame)
        # if q is pressed, program exits
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even if pyautogui or
    # OpenCV raised inside the loop or the kernel was interrupted.
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)  # 1 ms wait so the GUI event loop can process the window close

-1

### Finger Counter Demo

In [18]:
def sum_dict(dict):
    """Return the sum of all values stored in a dictionary.

    Args:
        dict: Dictionary whose values are numbers. The parameter name
            shadows the builtin ``dict`` inside this function; it is kept
            unchanged so existing callers are unaffected.

    Returns:
        The total of the values, or 0 for an empty dictionary.
    """
    return sum(dict.values())

In [19]:
# Adding Finger Counter: count raised fingers and show the total on screen.

# initialize web cam
cap = cv2.VideoCapture(0)

# (tip, pip) landmark index pairs for the index, middle, ring and pinky fingers
finger_joints = ((8, 6), (12, 10), (16, 14), (20, 18))

try:
    while True:

        # capture a webcam frame; stop if the camera did not deliver one
        ret, frame = cap.read()

        if not ret:
            break

        # flip frame for mirror effect
        frame = cv2.flip(frame, 1)

        # OpenCV delivers BGR frames but MediaPipe expects RGB, so convert a
        # copy for detection and keep the BGR frame for display.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # have mediapipe hands predict hand landmarks
        result = hands.process(rgb_frame)

        # dict of fingers where key is the fingertip landmark and value is 1 or 0 (raised or not)
        fingers = {"4": 0, "8": 0, "12": 0, "16": 0, "20": 0}

        # check if predicted landmarks were detected
        # if so draw & output them to the opencv window
        if result.multi_hand_landmarks:

            handLandmarks = result.multi_hand_landmarks[0]
            points = handLandmarks.landmark

            # The comparisons below only need relative positions, so the
            # landmark coordinates are compared directly without converting
            # them to pixels first.

            # Thumb: counted when its tip (4) has a larger x than landmark 2
            # in the mirrored image.
            # NOTE(review): this x-comparison presumably suits only one hand
            # orientation (left vs. right) - confirm before relying on it.
            if points[4].x > points[2].x:
                fingers['4'] = 1

            # Other fingers: counted when the tip is above its PIP joint
            # (image y grows downward, so "above" means a smaller y).
            for tip, pip in finger_joints:
                if points[tip].y < points[pip].y:
                    fingers[str(tip)] = 1

            mpDraw.draw_landmarks(frame, handLandmarks, mpHands.HAND_CONNECTIONS)

        # count raised fingers
        number_fingers = sum_dict(fingers)

        # draw finger counter visual
        cv2.rectangle(frame, (25, 150), (100, 400), (0, 128, 0), cv2.FILLED)
        cv2.putText(frame, str(number_fingers), (35, 300), cv2.FONT_HERSHEY_PLAIN,
                    3, (0, 71, 71), 2)

        cv2.imshow('Output', frame)
        # if q is pressed, program exits
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even after an error or
    # a kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)  # 1 ms wait so the GUI event loop can process the window close
    

-1

Article if you have trouble understanding! https://medium.com/@Mert.A/how-to-create-a-finger-counter-with-python-and-mediapipe-cc6c3911ad09

Hope you enjoyed this workshop!!


Some food for thought:
- Can you modify this workshop to work for two hands? If so, what different functionality can you add?