In [1]:
# Importing packages 
import mediapipe as mp
import numpy as np
import os
import uuid
import cv2
import math
import subprocess

# necessary utilities
mp_drawing = mp.solutions.drawing_utils # for the nodes on our hands
mp_hands = mp.solutions.hands #for the hand model of mediapipe

In [2]:
def get_hand_label(index, hand, results):
    output = None
    for idx ,c in enumerate(results.multi_handedness):
        if(c.classification[0].index == index):
            # here we extracted necessary information of the correct hand with the help of index
            label  = c.classification[0].label
            score = round(c.classification[0].score,2)
            text = f"{label} ({score})"
            
            # using coordinates of the wrist
            coor = [hand.landmark[mp_hands.HandLandmark.WRIST].x, hand.landmark[mp_hands.HandLandmark.WRIST].y]

            # print(coor_to_print)
            output = text, coor
            
    return output

In [3]:
def print_joint_angle(image, results, joint_list):
    
    # Loop through hands
    for hand in results.multi_hand_landmarks:
        #Loop through joint sets 
        for joint in joint_list:
            a = np.array([hand.landmark[joint[0]].x, hand.landmark[joint[0]].y]) # First coord
            b = np.array([hand.landmark[joint[1]].x, hand.landmark[joint[1]].y]) # Second coord
            c = np.array([hand.landmark[joint[2]].x, hand.landmark[joint[2]].y]) # Third coord
            
            radians = np.arctan2(c[1] - b[1], c[0]-b[0]) - np.arctan2(a[1]-b[1], a[0]-b[0])
            angle = np.abs(radians*180.0/np.pi)
            
            if angle > 180.0:
                angle = 360-angle
                
            # window dimesions
            dim = [image.shape[1],image.shape[0]]
                
            cv2.putText(image, str(round(angle, 2)), tuple(np.multiply(b, dim).astype(int)),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2, cv2.LINE_AA)
    return image

In [4]:
# Hand detection
def print_hand_labels(image, num, hand, results):
    if(get_hand_label(num, hand, results)):
        text , coordinates = get_hand_label(num, hand, results)
        
        # window dimesions
        dim = [image.shape[1],image.shape[0]]
        
        # changing coordinates according to the window size
        coor_to_print = tuple(np.multiply(np.array((hand.landmark[mp_hands.HandLandmark.WRIST].x, hand.landmark[mp_hands.HandLandmark.WRIST].y)),
                    dim).astype(int))
        
        cv2.putText(image, text, coor_to_print, cv2.FONT_HERSHEY_SIMPLEX, 1, (1,0,0),2,cv2.LINE_AA)
        return text

In [5]:
# volume change in mac
def set_volume(volume):
    # Run AppleScript command to set the volume
    applescript = f'set volume output volume {int(volume)}'
    subprocess.Popen(['osascript', '-e', applescript], stdout=subprocess.PIPE, stderr=subprocess.PIPE)


# set_volume(80) 
#to set volume to 80%

In [6]:
def Calculate_volume_percentage(results):
    # Get the landmarks for thumb tip and index finger tip
    # hand_landmarks = results.multi_hand_landmarks[0]
    thumb_tip = results.multi_hand_landmarks[0].landmark[mp_hands.HandLandmark.THUMB_TIP]
    index_finger_tip = results.multi_hand_landmarks[0].landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP]

    # Get the landmarks for the middle, ring, and pinky fingers
    middle_finger_tip = results.multi_hand_landmarks[0].landmark[mp_hands.HandLandmark.MIDDLE_FINGER_TIP]
    ring_finger_tip = results.multi_hand_landmarks[0].landmark[mp_hands.HandLandmark.RING_FINGER_TIP]
    pinky_finger_tip = results.multi_hand_landmarks[0].landmark[mp_hands.HandLandmark.PINKY_TIP]
    wrist = results.multi_hand_landmarks[0].landmark[mp_hands.HandLandmark.WRIST]

    # Calculate the distance between thumb tip and index finger tip
    distance_thumb_index = math.sqrt((thumb_tip.x - index_finger_tip.x)**2 + (thumb_tip.y - index_finger_tip.y)**2)

    # Calculate the distances between the fingers and the wrist
    distance_middle_wrist = math.sqrt((middle_finger_tip.x - wrist.x)**2 + (middle_finger_tip.y - wrist.y)**2 )
    distance_ring_wrist = math.sqrt((ring_finger_tip.x - wrist.x)**2 + (ring_finger_tip.y - wrist.y)**2 )
    distance_pinky_wrist = math.sqrt((pinky_finger_tip.x - wrist.x)**2 + (pinky_finger_tip.y - wrist.y)**2 )

    # print(distance_thumb_index)
    # print(distance_middle_wrist,distance_ring_wrist, distance_pinky_wrist)

    # Set a threshold for finger proximity to the wrist and each other
    threshold_distance_rest_fingers = 0.300  # You may need to adjust this threshold based on your specific scenario

    distance_thumb_index_max = 0.3
    distance_thumb_index_min = 0.03

    # Check conditions and adjust the device volume
    if  (distance_middle_wrist+distance_pinky_wrist+distance_ring_wrist)/3 <= threshold_distance_rest_fingers:
        # Adjust the device volume based on the distance between thumb and index finger
        if(distance_thumb_index > distance_thumb_index_max):
            volume_percentage = 100
        elif(distance_thumb_index < distance_thumb_index_min):
            volume_percentage = 0
        else:
            volume_percentage = int((distance_thumb_index/distance_thumb_index_max)*100)
        # Implement code to change the device volume based on volume_percentage
        # print("Change volume to", volume_percentage, "%")
    else:
        volume_percentage = -1
        
    return volume_percentage


In [7]:
# For printing and changing volume of the device
def print_change_volume(image, results, hand):
    volume_percentage = Calculate_volume_percentage(results)
    
    if(volume_percentage != -1):
        set_volume(volume_percentage)
        
        text = f"{volume_percentage}%"
        # window dimesions
        dim = [image.shape[1],image.shape[0]]
        
        # dim_x = hand.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].x
        # changing coordinates according to the window size
        coor_to_print = tuple(np.multiply(np.array((hand.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].x, hand.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].y)),
                    dim).astype(int))
        
        cv2.putText(image, text, coor_to_print, cv2.FONT_HERSHEY_SIMPLEX, 1, (1,0,0),2,cv2.LINE_AA)
        return image
        

In [8]:
#  Setting up the webcam
cap = cv2.VideoCapture(0)

# min_detection_condfidence for the accuracy on first time detection of our hand
# min_tracking_confidence for the accuracy while hand tracking after detection
with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5) as hands:
    while cap.isOpened():
        # here frame is the image we get from our webcam
        ret, frame = cap.read()
        
        # flip image on the horizontal
        image = cv2.flip(frame,1)
        
        # cv2 captures image in BGR format which we want to convert into RGB
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # Set flag to prevent any changes in the image
        image.flags.writeable = False
        
        # Detection using the mediapipe model
        results = hands.process(image)
        
        # Set flag to true to make changes in the image
        image.flags.writeable = True
        
        # RGB TO BGR
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        # print(results)
        
        # rendering the results onto our image
        if(results.multi_hand_landmarks):
            for num, hand in enumerate(results.multi_hand_landmarks):
                mp_drawing.draw_landmarks(image, hand, mp_hands.HAND_CONNECTIONS,mp_drawing.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
                                        mp_drawing.DrawingSpec(color=(250, 44, 250), thickness=2, circle_radius=2))
                
                # for hand detection
                text = print_hand_labels(image, num, hand, results)
                    
                # for angle prediction
                joint_list=[[8,5,12]]
                print_joint_angle(image, results, joint_list)
                
                # for volume percentage
                print(text)
                if(text != None):
                    if(text[0:4] == 'Left'):
                        print_change_volume(image, results, hand)

        # output the feed of our webcam to our screen
        cv2.imshow('Hand Tracking',image)
        
        # Save our image to a folder
        cv2.imwrite(os.path.join('output images', '{}.jpg'.format(uuid.uuid1())), image)
        
        # Stop when we press 'q'
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    
cap.release() 
cv2.destroyAllWindows()

I0000 00:00:1706519583.701277       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


Left (0.98)
Left (0.98)
None
Left (0.97)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)
Left (0.98)

In [9]:
# print(results.multi_hand_landmarks[0])
# results.multi_hand_landmarks[0].landmark[mp_hands.HandLandmark.MIDDLE_FINGER_TIP]
# results.multi_hand_landmarks

# results.multi_handedness[1].classification[0]

: 