### Hand Tracking with OpenCV and Mediapipe!

**Steps to Get Started:** (Also reference README within this folder)
1. Clone this repository
2. Ensure you have Python 3.11.9 installed on your computer (we will go through how to set it up)
3. Create a virtual environment using this python version
4. Run `pip install -r requirements.txt` inside the virtual environment (pins package versions so everyone has the same setup)
5. Run the first code block with imports to ensure it works properly
6. Once you've installed the necessary packages, you're ready to begin!

Helpful Article:
https://medium.com/@Mert.A/how-to-create-a-finger-counter-with-python-and-mediapipe-cc6c3911ad09

Helpful Video:
https://www.youtube.com/watch?v=RRBXVu5UE-U&t=78s

Docs:
https://mediapipe.readthedocs.io/en/latest/solutions/hands.html#:~:text=%7C-,ML%20Pipeline,a%20dedicated%20hand%20renderer%20subgraph.

Setting Up Python in VS Code:
https://code.visualstudio.com/docs/python/python-tutorial

In [17]:
import pyautogui
import cv2
import mediapipe as mp
import numpy as np
import time

print("Everything imported successfully!")

Everything imported successfully!


In [18]:
# Shortcuts into MediaPipe's hand-tracking solution and its drawing helpers
mpHands = mp.solutions.hands
mpDraw = mp.solutions.drawing_utils

# Hand detector shared by every cell below: at most one hand is tracked and a
# detection needs a confidence of at least 0.7 to be accepted
hands = mpHands.Hands(
    max_num_hands=1,
    min_detection_confidence=0.7,
)

I0000 00:00:1758054556.469449  742732 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M2 Pro


In [19]:
# Screen size from pyautogui; later cells multiply landmark x by s[0] and
# landmark y by s[1] to turn hand positions into cursor positions
s = pyautogui.size()

In [20]:
# Initialize webcam
cap = cv2.VideoCapture(0)

try:
    while True:
        # Capture a frame from the webcam
        # ret is a boolean of whether frame was read, frame is numpy array in BGR format
        ret, frame = cap.read()
        if not ret:
            break

        # Flip the frame horizontally for a mirror effect
        frame = cv2.flip(frame, 1)

        # Display the frame in a window titled 'Output'
        cv2.imshow('Output', frame)

        # Exit the loop if 'q' is pressed in the OpenCV window
        # (waitKey(1) waits up to 1 millisecond for a key press)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the webcam and close any OpenCV windows, even if the cell
    # is interrupted (e.g. KeyboardInterrupt), so the camera is not left busy
    cap.release()
    cv2.destroyAllWindows()
    # One more 1 ms wait so the GUI gets a chance to actually close the window
    cv2.waitKey(1)

W0000 00:00:1758054556.518222  766919 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1758054556.541326  766928 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


-1

In [21]:
# Hand tracking
# Initialize webcam
cap = cv2.VideoCapture(0)
try:
    while True:
        # capture webcam frame; ret is False when no frame could be read
        ret, frame = cap.read()
        if not ret:
            break
        # mirror the frame so on-screen movement matches the user's movement
        frame = cv2.flip(frame, 1)
        # -------- ADD THIS CODE NEXT ---------------
        # frame.shape is (height, width, channels) -- rows come first
        h, w, c = frame.shape
        # OpenCV delivers BGR frames but MediaPipe expects RGB input
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # have mediapipe hands predict hand landmarks
        result = hands.process(rgb_frame)
        # iterate through the predicted landmarks, scale them to the window,
        # and draw them on the frame shown in the opencv window
        if result.multi_hand_landmarks:
            landmarks = []
            for handslms in result.multi_hand_landmarks:
                for lm in handslms.landmark:
                    # x is scaled by the frame width, y by the frame height
                    lmx = int(lm.x * w)
                    lmy = int(lm.y * h)
                    landmarks.append([lmx, lmy])
                mpDraw.draw_landmarks(frame, handslms, mpHands.HAND_CONNECTIONS)
        # ------- FINISHED ADDING NEW CODE ----------
        cv2.imshow('Output', frame)
        # if q is pressed, program exits
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # release the webcam and close the window even if the cell is interrupted
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)

-1

<img src="hand_tracking_landmarks.webp" alt="Hand Tracker Visualization with Mediapipe" width="400">

In [22]:
# Moving Cursor
# Initialize webcam
cap = cv2.VideoCapture(0)
try:
    while True:
        # capture webcam frame; ret is False when no frame could be read
        ret, frame = cap.read()
        if not ret:
            break
        # mirror the frame so on-screen movement matches the user's movement
        frame = cv2.flip(frame, 1)
        # frame.shape is (height, width, channels) -- rows come first
        h, w, c = frame.shape
        # OpenCV delivers BGR frames but MediaPipe expects RGB input
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # have mediapipe hands predict hand landmarks
        result = hands.process(rgb_frame)
        # iterate through the predicted landmarks, scale them to the window,
        # and draw them on the frame shown in the opencv window
        if result.multi_hand_landmarks:
            landmarks = []
            for handslms in result.multi_hand_landmarks:
                # take landmark 8 (index finger tip) and move the cursor to it,
                # scaling by the screen size s rather than the frame size
                # ADD THE LINES BELOW NEXT
                index_x = int(handslms.landmark[8].x * s[0])
                index_y = int(handslms.landmark[8].y * s[1])
                pyautogui.moveTo(index_x, index_y, _pause=False)
                for lm in handslms.landmark:
                    # x is scaled by the frame width, y by the frame height
                    lmx = int(lm.x * w)
                    lmy = int(lm.y * h)
                    landmarks.append([lmx, lmy])
                mpDraw.draw_landmarks(frame, handslms, mpHands.HAND_CONNECTIONS)

        cv2.imshow('Output', frame)
        # if q is pressed, program exits
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # release the webcam and close the window even if the cell is interrupted
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)

-1

In [23]:
# Clicking
# Initialize webcam
cap = cv2.VideoCapture(0)
# -------- ADD THIS VARIABLE ---------------
# time.time() value of the most recent click, used to space clicks apart
last_click = 0
try:
    while True:
        # capture webcam frame; ret is False when no frame could be read
        ret, frame = cap.read()
        if not ret:
            break
        # mirror the frame so on-screen movement matches the user's movement
        frame = cv2.flip(frame, 1)
        # OpenCV delivers BGR frames but MediaPipe expects RGB input
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # have mediapipe hands predict hand landmarks
        result = hands.process(rgb_frame)
        if result.multi_hand_landmarks:
            for handslms in result.multi_hand_landmarks:
                # take landmark 8 (index finger tip) and move the cursor to it.
                # index_x / index_y are computed here so the distance check
                # below never relies on values left over from another cell
                index_x = int(handslms.landmark[8].x * s[0])
                index_y = int(handslms.landmark[8].y * s[1])
                pyautogui.moveTo(index_x, index_y, _pause=False)

                # -------- ADD THIS CODE NEXT ---------------
                # Detect click gesture by calculating distance between
                # thumb tip (landmark 4) and index tip (landmark 8)
                thumb_x = int(handslms.landmark[4].x * s[0])
                thumb_y = int(handslms.landmark[4].y * s[1])
                distance = ((index_x - thumb_x) ** 2 + (index_y - thumb_y) ** 2) ** 0.5

                # If the tips are within 40 screen pixels and at least 0.5 s
                # has passed since the last click, simulate a click
                if distance < 40 and time.time() - last_click > 0.5:
                    pyautogui.click()
                    last_click = time.time()
                # ------- FINISHED ADDING NEW CODE ----------

                mpDraw.draw_landmarks(frame, handslms, mpHands.HAND_CONNECTIONS)
        cv2.imshow('Output', frame)
        # if q is pressed, program exits
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # release the webcam and close the window even if the cell is interrupted
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)

KeyboardInterrupt: 

In [None]:
# Better Click Detection Methods

def normalized_distance(handslms, landmark1, landmark2):
    """Return the Euclidean distance between two landmarks of one hand.

    ``landmark1`` and ``landmark2`` are indices into ``handslms.landmark``.
    Only the ``x`` and ``y`` attributes of each landmark are used, exactly as
    stored on the landmark (no scaling to frame or screen pixels).
    """
    first = handslms.landmark[landmark1]
    second = handslms.landmark[landmark2]
    dx = first.x - second.x
    dy = first.y - second.y
    return (dx ** 2 + dy ** 2) ** 0.5

def is_pinching(handslms, threshold=0.05):
    """Report whether the thumb tip and index finger tip are pinched together.

    The gap between landmark 4 (thumb tip) and landmark 8 (index tip) is
    measured with ``normalized_distance``; the result is True when that gap
    is strictly smaller than ``threshold``.
    """
    tip_gap = normalized_distance(handslms, 4, 8)
    return tip_gap < threshold

def advanced_pinch_detection(handslms, threshold=0.04):
    """Detect a pinch, but only while thumb and index finger point upward.

    A finger counts as "extended" here when its tip has a smaller ``y`` than a
    lower joint of the same finger (image ``y`` grows downward, so smaller
    means higher in the frame). This is a simple position check, not a true
    joint-angle computation.

    Args:
        handslms: One detected hand; must expose a ``landmark`` sequence whose
            items have ``x`` and ``y`` attributes.
        threshold: Maximum thumb-tip to index-tip distance, as returned by
            ``normalized_distance``, that still counts as a pinch.

    Returns:
        True when both fingers are extended and the tip distance is strictly
        below ``threshold``; False otherwise.
    """
    thumb_tip = handslms.landmark[4]
    thumb_mcp = handslms.landmark[2]  # Thumb MCP joint
    index_tip = handslms.landmark[8]
    index_pip = handslms.landmark[6]  # Index PIP joint

    # Check if fingers are extended (simple tip-above-joint check)
    thumb_extended = thumb_tip.y < thumb_mcp.y
    index_extended = index_tip.y < index_pip.y

    # Ignore the tip distance unless both fingers are somewhat extended
    if not (thumb_extended and index_extended):
        return False

    return normalized_distance(handslms, 4, 8) < threshold

# Confirm that the click-detection helpers in this cell were defined
print("Improved click detection functions loaded!")

Improved click detection functions loaded!


In [39]:
# Dynamic Line Following Thumb and Index with Normalized Distance
cap = cv2.VideoCapture(0)
last_click = 0          # time.time() value of the most recent click
click_threshold = 0.04  # Normalized distance threshold
near_threshold = 0.08   # Below this the line turns orange ("getting close")
debounce_time = 0.3     # Minimum time between clicks

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Mirror the frame so on-screen movement matches the user's movement
        frame = cv2.flip(frame, 1)
        # frame.shape is (height, width, channels)
        h, w, c = frame.shape

        # Convert to RGB for MediaPipe
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb_frame)

        if result.multi_hand_landmarks:
            for handslms in result.multi_hand_landmarks:
                # Get thumb and index finger positions in FRAME coordinates (for drawing)
                thumb_x = int(handslms.landmark[4].x * w)
                thumb_y = int(handslms.landmark[4].y * h)
                index_x_frame = int(handslms.landmark[8].x * w)
                index_y_frame = int(handslms.landmark[8].y * h)

                # Move cursor to index finger tip (SCREEN coordinates)
                index_x_screen = int(handslms.landmark[8].x * s[0])
                index_y_screen = int(handslms.landmark[8].y * s[1])
                pyautogui.moveTo(index_x_screen, index_y_screen, _pause=False)

                # Use YOUR normalized distance function
                norm_distance = normalized_distance(handslms, 4, 8)

                # Dynamic line that follows thumb and index finger tips
                # Color changes based on normalized distance (colors are BGR)
                if norm_distance < click_threshold:
                    line_color = (0, 0, 255)  # Red when clicking distance
                    line_thickness = 4
                elif norm_distance < near_threshold:
                    line_color = (0, 165, 255)  # Orange when getting close
                    line_thickness = 3
                else:
                    line_color = (0, 255, 0)  # Green when far apart
                    line_thickness = 2

                # Draw the dynamic line connecting thumb and index finger
                cv2.line(frame, (thumb_x, thumb_y), (index_x_frame, index_y_frame), line_color, line_thickness)

                # Use YOUR advanced pinch detection for clicking; the debounce
                # keeps one held pinch from firing a click on every frame
                if advanced_pinch_detection(handslms, click_threshold):
                    current_time = time.time()
                    if current_time - last_click > debounce_time:
                        pyautogui.click()
                        last_click = current_time
                        print(f"Click detected! Distance: {norm_distance:.3f}")

                # Visual feedback circles at finger tips
                if is_pinching(handslms, click_threshold):
                    # Red circles when pinching
                    cv2.circle(frame, (thumb_x, thumb_y), 12, (0, 0, 255), -1)
                    cv2.circle(frame, (index_x_frame, index_y_frame), 12, (0, 0, 255), -1)
                    # Outer rings for emphasis
                    cv2.circle(frame, (thumb_x, thumb_y), 18, (0, 0, 255), 2)
                    cv2.circle(frame, (index_x_frame, index_y_frame), 18, (0, 0, 255), 2)
                else:
                    # Blue circles normally
                    cv2.circle(frame, (thumb_x, thumb_y), 8, (255, 0, 0), -1)
                    cv2.circle(frame, (index_x_frame, index_y_frame), 8, (255, 0, 0), -1)

                # Draw hand landmarks
                mpDraw.draw_landmarks(frame, handslms, mpHands.HAND_CONNECTIONS)

                # Display normalized distance info
                cv2.putText(frame, f"Norm Distance: {norm_distance:.3f}", (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
                cv2.putText(frame, f"Threshold: {click_threshold}", (10, 60),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)

        cv2.imshow('Hand Tracking with Dynamic Line', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the webcam and close the window even if the cell is interrupted
    # (e.g. KeyboardInterrupt), so the camera is free for the next cell
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)

Click detected! Distance: 0.039
Click detected! Distance: 0.037
Click detected! Distance: 0.037


KeyboardInterrupt: 

In [36]:
# Improved Clicking with Better Pinch Detection
cap = cv2.VideoCapture(0)
last_click = 0          # time.time() value of the most recent click
click_threshold = 0.04  # Normalized distance threshold
debounce_time = 0.5     # Minimum time between clicks

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Mirror the frame so on-screen movement matches the user's movement
        frame = cv2.flip(frame, 1)
        # frame.shape is (height, width, channels) -- rows come first
        h, w, c = frame.shape

        # Convert to RGB for MediaPipe
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb_frame)

        if result.multi_hand_landmarks:
            for handslms in result.multi_hand_landmarks:
                # Move cursor to index finger tip (screen coordinates)
                index_x = int(handslms.landmark[8].x * s[0])
                index_y = int(handslms.landmark[8].y * s[1])
                pyautogui.moveTo(index_x, index_y, _pause=False)

                # Advanced pinch detection; the debounce keeps one held pinch
                # from firing a click on every frame
                if advanced_pinch_detection(handslms, click_threshold):
                    current_time = time.time()
                    if current_time - last_click > debounce_time:
                        pyautogui.click()
                        last_click = current_time
                        print("Click detected!")

                # Visual feedback in frame coordinates:
                # x is scaled by the frame width, y by the frame height
                thumb_x = int(handslms.landmark[4].x * w)
                thumb_y = int(handslms.landmark[4].y * h)
                index_x_frame = int(handslms.landmark[8].x * w)
                index_y_frame = int(handslms.landmark[8].y * h)

                # Draw connection line between thumb and index
                cv2.line(frame, (thumb_x, thumb_y), (index_x_frame, index_y_frame), (0, 255, 0), 2)

                # Change color based on pinch state (colors are BGR)
                if is_pinching(handslms, click_threshold):
                    cv2.circle(frame, (thumb_x, thumb_y), 10, (0, 0, 255), -1)  # Red when pinching
                    cv2.circle(frame, (index_x_frame, index_y_frame), 10, (0, 0, 255), -1)
                else:
                    cv2.circle(frame, (thumb_x, thumb_y), 8, (255, 0, 0), -1)  # Blue normally
                    cv2.circle(frame, (index_x_frame, index_y_frame), 8, (255, 0, 0), -1)

                mpDraw.draw_landmarks(frame, handslms, mpHands.HAND_CONNECTIONS)

        cv2.imshow('Output', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the webcam and close the window even if the cell is interrupted
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)

# Alternative Gesture-Based Clicking Methods

def finger_gun_click(handslms):
    """Detect a 'finger gun' pose: index finger up, middle finger folded.

    Only the index and middle fingers are inspected. A finger is "up" when its
    tip (landmarks 8 / 12) has a smaller ``y`` than its PIP joint
    (landmarks 6 / 10), and "folded" when the tip has a larger ``y``.
    """
    points = handslms.landmark
    index_is_up = points[8].y < points[6].y
    middle_is_down = points[12].y > points[10].y
    return index_is_up and middle_is_down

def peace_sign_click(handslms):
    """Detect a peace sign: index and middle fingers up, ring finger folded.

    Each finger is judged by comparing the ``y`` of its tip with the ``y`` of
    its PIP joint: tip above the joint (smaller ``y``) means up, tip below
    means folded. The thumb and pinky are not inspected.
    """
    points = handslms.landmark
    index_is_up = points[8].y < points[6].y
    middle_is_up = points[12].y < points[10].y
    ring_is_down = points[16].y > points[14].y
    return index_is_up and middle_is_up and ring_is_down

def thumb_up_click(handslms):
    """Detect a thumbs-up: thumb tip above its MCP joint, index finger folded.

    "Above" means a smaller ``y`` value. The thumb compares landmark 4 (tip)
    with landmark 2; the index finger compares landmark 8 (tip) with
    landmark 6 and must have its tip below the joint.
    """
    points = handslms.landmark
    thumb_is_up = points[4].y < points[2].y
    index_is_down = points[8].y > points[6].y
    return thumb_is_up and index_is_down

# Confirm that the gesture-detection helpers above were defined
print("Alternative gesture detection functions loaded!")

Click detected!
Click detected!
Click detected!
Click detected!
Click detected!
Click detected!
Click detected!
Click detected!


KeyboardInterrupt: 

In [None]:
def sum_dict(dict):
    """Return the sum of all values stored in a dictionary.

    Args:
        dict: Mapping whose values are numbers (for example the per-finger
            0/1 flags used by the finger counter). The parameter name shadows
            the built-in ``dict``; it is kept so existing callers still work.

    Returns:
        The total of the values, or 0 for an empty dictionary.
    """
    # The built-in sum replaces the manual key-by-key accumulation loop
    return sum(dict.values())

In [30]:
# Adding Finger Counter

# initialize web cam
cap = cv2.VideoCapture(0)

# (tip, PIP joint) landmark indices for the index, middle, ring and pinky
# fingers; each of these counts as raised when the tip is above the joint
finger_joints = ((8, 6), (12, 10), (16, 14), (20, 18))

try:
    while True:

        # capture webcam frame; ret is False when no frame could be read
        ret, frame = cap.read()

        if not ret:
            break

        # flip frame for mirror effect
        frame = cv2.flip(frame, 1)

        # frame.shape is (height, width, channels)
        h, w, c = frame.shape

        # convert to rgb for mediapipe
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # have mediapipe hands predict hand landmarks
        result = hands.process(rgb_frame)

        # dict of fingers, keyed by the tip landmark index: 1 = raised, 0 = not.
        # Reset every frame so the count is 0 when no hand is visible
        fingers = {"4": 0, "8": 0, "12": 0, "16": 0, "20": 0}

        if result.multi_hand_landmarks:
            for handslms in result.multi_hand_landmarks:
                lms = handslms.landmark

                # thumb: compared sideways (x, scaled to frame width) instead of
                # vertically -- the tip (4) must lie to the right of joint 2.
                # NOTE(review): this direction presumably suits only one hand /
                # palm orientation in the mirrored image -- confirm
                if int(lms[4].x * w) > int(lms[2].x * w):
                    fingers['4'] = 1

                # other fingers: raised when the tip is higher in the frame
                # (smaller y, scaled to frame height) than its PIP joint
                for tip, joint in finger_joints:
                    if int(lms[tip].y * h) < int(lms[joint].y * h):
                        fingers[str(tip)] = 1

                mpDraw.draw_landmarks(frame, handslms, mpHands.HAND_CONNECTIONS)

        # count raised fingers
        number_fingers = sum_dict(fingers)

        # filled box on the left of the frame with the finger count drawn on it
        cv2.rectangle(frame, (25, 150), (100, 400), (0, 128, 0), cv2.FILLED)
        cv2.putText(frame, str(number_fingers), (35, 300), cv2.FONT_HERSHEY_PLAIN,
                    3, (0, 71, 71), 2)

        cv2.imshow('Output', frame)
        # if q is pressed, program exits
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # release the webcam and close the window even if the cell is interrupted
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)
    

-1

Article if you have trouble understanding! https://medium.com/@Mert.A/how-to-create-a-finger-counter-with-python-and-mediapipe-cc6c3911ad09