In [1]:
pip install opencv-python mediapipe pyautogui

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Import Libraries and Set Up Hand Detection

In [37]:
import cv2
import mediapipe as mp
import pyautogui
import time
import numpy as np
from collections import deque

# Initialize Mediapipe hand detection with higher confidence thresholds
# (at most one hand is tracked; detection and tracking confidence are both 0.8).
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=1, min_detection_confidence=0.8, min_tracking_confidence=0.8)
mp_draw = mp.solutions.drawing_utils

# Variables for delay management: the time each kind of action last fired.
# They start at "now", so every delay below is also measured from start-up.
last_play_pause_time = time.time()
last_volume_change_time = time.time()
last_gesture_time = time.time()

# Thresholds (delays are in seconds and compared against time.time() differences)
play_pause_delay = 1  # minimum gap between two play/pause key presses
volume_change_delay = 0.2  # minimum gap between two volume key presses
gesture_delay = 1  # minimum gap between two seek (left/right arrow) key presses
movement_threshold = 0.005  # minimum |x| change between fist positions that counts as a move; same units as landmark x

# Gesture States (carried from one frame to the next by the capture loop)
is_open_palm = False  # True once an open palm has triggered play/pause; cleared when no open palm is seen
is_fist = False  # whether the most recently processed hand was classified as a fist
previous_fist_position = None  # landmark 9 of the last fist seen, or None before any fist

# Frame rate control: the capture loop sleeps so one iteration lasts at least
# frame_duration seconds, which caps the loop at roughly target_fps.
target_fps = 24
frame_duration = 1.0 / target_fps

# Smoothing settings: smooth_landmarks() averages over the landmark sets held
# in this bounded queue (the oldest entry is dropped once it is full).
smoothing_window_size = 5
landmark_queue = deque(maxlen=smoothing_window_size)

# Capture video from webcam (device index 0 -- presumably the default camera)
cap = cv2.VideoCapture(0)

def is_open_palm_landmarks(landmarks):
    """Report whether all four non-thumb fingers are extended.

    A finger counts as extended when its tip (landmarks 8, 12, 16, 20) has a
    smaller y value than the joint two indices below it (6, 10, 14, 18).
    Smaller y presumably means higher in the frame (image-style coordinates).
    The thumb is not examined.

    Args:
        landmarks: indexable collection of landmarks where ``landmarks[i][1]``
            is the y coordinate of landmark ``i``.

    Returns:
        A truthy value when all four comparisons hold, a falsy one otherwise.
    """
    verdict = True
    # Stop at the first finger that fails, exactly like a chained ``and``.
    for tip, knuckle in ((8, 6), (12, 10), (16, 14), (20, 18)):
        verdict = landmarks[tip][1] < landmarks[knuckle][1]
        if not verdict:
            break
    return verdict

def is_fist_landmarks(landmarks):
    """Report whether all four non-thumb fingers are curled.

    A finger counts as curled when its tip (landmarks 8, 12, 16, 20) has a
    larger y value than the joint two indices below it (6, 10, 14, 18).
    The thumb is not examined.

    Args:
        landmarks: indexable collection of landmarks where ``landmarks[i][1]``
            is the y coordinate of landmark ``i``.

    Returns:
        A truthy value when all four comparisons hold, a falsy one otherwise.
    """
    verdict = True
    # Stop at the first finger that fails, exactly like a chained ``and``.
    for tip, knuckle in ((8, 6), (12, 10), (16, 14), (20, 18)):
        verdict = landmarks[tip][1] > landmarks[knuckle][1]
        if not verdict:
            break
    return verdict

def is_index_finger_pointing_up(landmarks):
    """Report whether only the index finger is extended.

    The index tip (landmark 8) must have a smaller y value than landmark 6,
    while the middle, ring and pinky tips (12, 16, 20) must each have a larger
    y value than the joint two indices below them (10, 14, 18). The thumb is
    not examined.

    Args:
        landmarks: indexable collection of landmarks where ``landmarks[i][1]``
            is the y coordinate of landmark ``i``.

    Returns:
        A truthy value when all four comparisons hold, a falsy one otherwise.
    """
    verdict = landmarks[8][1] < landmarks[6][1]
    if not verdict:
        return verdict
    # The remaining fingers must all be curled; stop at the first that is not.
    for tip, knuckle in ((12, 10), (16, 14), (20, 18)):
        verdict = landmarks[tip][1] > landmarks[knuckle][1]
        if not verdict:
            break
    return verdict

def is_thumb_down(landmarks):
    """Report whether the thumb tip is below its joint with the other fingers extended.

    The thumb tip (landmark 4) must have a larger y value than landmark 3, and
    each of the four other fingertips (8, 12, 16, 20) must have a smaller y
    value than the joint two indices below it (6, 10, 14, 18).

    The four finger comparisons are the same ones ``is_open_palm_landmarks``
    uses, so any hand that satisfies this check also satisfies that one.

    Args:
        landmarks: indexable collection of landmarks where ``landmarks[i][1]``
            is the y coordinate of landmark ``i``.

    Returns:
        A truthy value when all five comparisons hold, a falsy one otherwise.
    """
    verdict = landmarks[4][1] > landmarks[3][1]
    if not verdict:
        return verdict
    # Thumb condition met; now every other finger has to be extended.
    for tip, knuckle in ((8, 6), (12, 10), (16, 14), (20, 18)):
        verdict = landmarks[tip][1] < landmarks[knuckle][1]
        if not verdict:
            break
    return verdict

def get_fist_direction(current_position, previous_position, threshold=None):
    """Classify the horizontal movement of the fist between two positions.

    Args:
        current_position: indexable position whose element 0 is the x
            coordinate, or None.
        previous_position: the earlier position in the same form, or None.
        threshold: minimum absolute x change that counts as a movement.
            When None (the default) the module-level ``movement_threshold``
            is used, which matches the behaviour of existing two-argument
            calls.

    Returns:
        "left" when x decreased by more than the threshold, "right" when it
        increased by more than the threshold, otherwise None (including when
        either position is None).
    """
    if current_position is None or previous_position is None:
        return None

    if threshold is None:
        # Fall back to the notebook-wide setting only when the caller gave none.
        threshold = movement_threshold

    x_movement = current_position[0] - previous_position[0]
    print(f"X movement: {x_movement}")  # Debugging: print movement amount
    if abs(x_movement) > threshold:
        if x_movement < 0:
            print("Fist moved left")
            return "left"
        elif x_movement > 0:
            print("Fist moved right")
            return "right"
    return None

def smooth_landmarks(landmarks):
    """Record a new landmark set and return the running element-wise average.

    ``landmarks`` is appended to the module-level ``landmark_queue``. That
    queue is a bounded deque, so once it is full the oldest entry is dropped.
    The result is the mean over whatever entries the queue currently holds.

    Args:
        landmarks: landmark coordinates for one frame; every entry in the
            queue is assumed to have the same shape so they can be averaged.

    Returns:
        Array with the same shape as one landmark set, averaged over the queue.
    """
    landmark_queue.append(landmarks)
    history = np.asanyarray(landmark_queue)  # one leading axis per queued frame
    return history.mean(axis=0)

# Main capture loop: read webcam frames, detect a hand, classify its gesture
# and translate it into media key presses via pyautogui.
#
# The loop runs inside try/finally so the camera and the preview window are
# released even when the cell is interrupted or a call inside the loop raises;
# otherwise the webcam would stay locked until the kernel is restarted.
try:
    while cap.isOpened():
        start_time = time.time()

        success, frame = cap.read()
        if not success:
            print("Failed to capture frame")
            break

        # Convert the image to RGB; the frame is flagged read-only while
        # MediaPipe processes it and made writeable again for drawing.
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False

        # Process the image and detect hands
        results = hands.process(image)
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                # Convert landmarks to numpy array for smoothing
                landmarks_array = np.array([(lm.x, lm.y, lm.z) for lm in hand_landmarks.landmark])

                # Smooth the landmarks
                smoothed_landmarks = smooth_landmarks(landmarks_array)

                # Open palm -> play/pause. is_open_palm latches after a trigger
                # so a palm that is held up only fires once.
                if is_open_palm_landmarks(smoothed_landmarks):
                    if not is_open_palm and (time.time() - last_play_pause_time) > play_pause_delay:
                        print("Open palm detected: Play/Pause triggered")
                        pyautogui.press('space')
                        last_play_pause_time = time.time()
                        is_open_palm = True
                    is_fist = False  # Reset fist status
                else:
                    is_open_palm = False

                # Fist -> volume control, driven by horizontal movement
                # between consecutive fist frames.
                if is_fist_landmarks(smoothed_landmarks):
                    print("Fist detected")
                    current_position = smoothed_landmarks[9]  # Use landmark 9 as reference for fist position
                    if is_fist and previous_fist_position is not None:
                        direction = get_fist_direction(current_position, previous_fist_position)
                        if direction == "left" and (time.time() - last_volume_change_time) > volume_change_delay:
                            print("Volume Down triggered")
                            pyautogui.press('volumedown')
                            last_volume_change_time = time.time()
                        elif direction == "right" and (time.time() - last_volume_change_time) > volume_change_delay:
                            print("Volume Up triggered")
                            pyautogui.press('volumeup')
                            last_volume_change_time = time.time()
                    is_fist = True
                    previous_fist_position = current_position  # Update the previous fist position
                else:
                    is_fist = False

                # Check for index finger pointing up
                if is_index_finger_pointing_up(smoothed_landmarks) and (time.time() - last_gesture_time) > gesture_delay:
                    print("Index finger up detected: Fast forward 5 secs")
                    pyautogui.press('right')
                    last_gesture_time = time.time()

                # Check for thumb down
                if is_thumb_down(smoothed_landmarks) and (time.time() - last_gesture_time) > gesture_delay:
                    print("Thumb down detected: Rewind 5 secs")
                    pyautogui.press('left')
                    last_gesture_time = time.time()

                # Draw hand landmarks
                mp_draw.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
        else:
            # No hand in this frame. Drop the smoothing history and the fist
            # tracking state so that a hand re-entering elsewhere is not
            # averaged with stale landmarks or compared with an old fist
            # position (which would register as a large movement and press a
            # volume key). is_open_palm is deliberately left alone so a single
            # missed detection does not re-trigger play/pause for a held palm.
            landmark_queue.clear()
            is_fist = False
            previous_fist_position = None

        # Display the video feed
        cv2.imshow('Hand Gesture Control', image)

        # Break loop with 'q' key
        if cv2.waitKey(1) & 0xFF == ord('q'):
            print("Quitting program")
            break

        # Frame rate control: pad the iteration up to frame_duration seconds
        end_time = time.time()
        elapsed_time = end_time - start_time
        if elapsed_time < frame_duration:
            time.sleep(frame_duration - elapsed_time)
finally:
    cap.release()
    cv2.destroyAllWindows()


Open palm detected: Play/Pause triggered
Index finger up detected: Fast forward 5 secs
Index finger up detected: Fast forward 5 secs
Index finger up detected: Fast forward 5 secs
Index finger up detected: Fast forward 5 secs
Fist detected
Fist detected
X movement: -0.004591739177703835
Fist detected
X movement: -0.004750436544418368
Fist detected
X movement: -0.0017773985862731823
Fist detected
X movement: -0.001065969467163086
Fist detected
X movement: -0.0009798467159271018
Fist detected
X movement: 0.00032973289489746094
Fist detected
X movement: 7.787942886350319e-05
Fist detected
X movement: -8.02934169769065e-05
Fist detected
X movement: -0.00036832690238952637
Fist detected
X movement: -0.00031139254570006214
Fist detected
X movement: -0.00023632049560551316
Fist detected
X movement: 0.00011552572250367321
Fist detected
X movement: -3.1620264053344727e-05
Fist detected
X movement: 0.0003204762935638539
Fist detected
X movement: 0.0003007471561431996
Fist detected
X movement: -2.

Capture Video and Detect Gestures

In [32]:
# Capture loop variant that maps thumbs up / thumbs down to seek keys.
#
# NOTE(review): is_thumbs_up_landmarks and is_thumbs_down_landmarks are not
# defined in the cells visible in this notebook; confirm they exist before
# running on a fresh kernel. Also, this loop only starts while `cap` is open,
# and the capture loop in the preceding cell releases `cap` when it ends.
#
# The loop runs inside try/finally so the camera and the preview window are
# released even when the cell is interrupted or a call inside the loop raises.
try:
    while cap.isOpened():
        start_time = time.time()

        success, frame = cap.read()
        if not success:
            print("Failed to capture frame")
            break

        # Convert the image to RGB; the frame is flagged read-only while
        # MediaPipe processes it and made writeable again for drawing.
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False

        # Process the image and detect hands
        results = hands.process(image)
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                # Convert landmarks to numpy array for smoothing
                landmarks_array = np.array([(lm.x, lm.y, lm.z) for lm in hand_landmarks.landmark])

                # Smooth the landmarks
                smoothed_landmarks = smooth_landmarks(landmarks_array)

                # Open palm -> play/pause. is_open_palm latches after a trigger
                # so a palm that is held up only fires once.
                if is_open_palm_landmarks(smoothed_landmarks):
                    if not is_open_palm and (time.time() - last_play_pause_time) > play_pause_delay:
                        print("Open palm detected: Play/Pause triggered")
                        pyautogui.press('space')
                        last_play_pause_time = time.time()
                        is_open_palm = True
                    is_fist = False  # Reset fist status
                else:
                    is_open_palm = False

                # Fist -> volume control, driven by horizontal movement
                # between consecutive fist frames.
                if is_fist_landmarks(smoothed_landmarks):
                    print("Fist detected")
                    current_position = smoothed_landmarks[9]  # Use landmark 9 as reference for fist position
                    if is_fist and previous_fist_position is not None:
                        direction = get_fist_direction(current_position, previous_fist_position)
                        if direction == "left" and (time.time() - last_volume_change_time) > volume_change_delay:
                            print("Volume Down triggered")
                            pyautogui.press('volumedown')
                            last_volume_change_time = time.time()
                        elif direction == "right" and (time.time() - last_volume_change_time) > volume_change_delay:
                            print("Volume Up triggered")
                            pyautogui.press('volumeup')
                            last_volume_change_time = time.time()
                    is_fist = True
                    previous_fist_position = current_position  # Update the previous fist position
                else:
                    is_fist = False

                # Check for thumbs up
                if is_thumbs_up_landmarks(smoothed_landmarks) and (time.time() - last_gesture_time) > gesture_delay:
                    print("Thumbs up detected: Fast forward 5 secs")
                    pyautogui.press('right')
                    last_gesture_time = time.time()

                # Check for thumbs down
                if is_thumbs_down_landmarks(smoothed_landmarks) and (time.time() - last_gesture_time) > gesture_delay:
                    print("Thumbs down detected: Rewind 5 secs")
                    pyautogui.press('left')
                    last_gesture_time = time.time()

                # Draw hand landmarks
                mp_draw.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
        else:
            # No hand in this frame. Drop the smoothing history and the fist
            # tracking state so that a hand re-entering elsewhere is not
            # averaged with stale landmarks or compared with an old fist
            # position (which would register as a large movement and press a
            # volume key). is_open_palm is deliberately left alone so a single
            # missed detection does not re-trigger play/pause for a held palm.
            landmark_queue.clear()
            is_fist = False
            previous_fist_position = None

        # Display the video feed
        cv2.imshow('Hand Gesture Control', image)

        # Break loop with 'q' key
        if cv2.waitKey(1) & 0xFF == ord('q'):
            print("Quitting program")
            break

        # Frame rate control: pad the iteration up to frame_duration seconds
        end_time = time.time()
        elapsed_time = end_time - start_time
        if elapsed_time < frame_duration:
            time.sleep(frame_duration - elapsed_time)
finally:
    cap.release()
    cv2.destroyAllWindows()

Open palm detected: Play/Pause triggered
Thumbs down detected: Rewind 5 secs
Thumbs down detected: Rewind 5 secs
Thumbs up detected: Fast forward 5 secs
Thumbs up detected: Fast forward 5 secs
Open palm detected: Play/Pause triggered
Thumbs up detected: Fast forward 5 secs
Thumbs up detected: Fast forward 5 secs
Thumbs up detected: Fast forward 5 secs
Thumbs up detected: Fast forward 5 secs
Thumbs up detected: Fast forward 5 secs
Thumbs up detected: Fast forward 5 secs
Thumbs up detected: Fast forward 5 secs
Thumbs down detected: Rewind 5 secs
Thumbs down detected: Rewind 5 secs
Thumbs down detected: Rewind 5 secs
Thumbs down detected: Rewind 5 secs
Thumbs up detected: Fast forward 5 secs
Thumbs up detected: Fast forward 5 secs
Thumbs up detected: Fast forward 5 secs
Thumbs down detected: Rewind 5 secs
Thumbs down detected: Rewind 5 secs
Thumbs down detected: Rewind 5 secs
Fist detected
Fist detected
X movement: 0.001312774419784557
Fist detected
X movement: 0.001286184787750222
Fist d