# In [1]:
import cv2
import mediapipe as mp
import math 
import numpy

from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume

# Access the system's audio output device using PyCaw and obtain its
# IAudioEndpointVolume interface, which exposes the master volume in dB.
devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
volume = cast(interface, POINTER(IAudioEndpointVolume))

# Start from a quiet level (-37 dB). The value is clamped into the range the
# device reports, because the endpoint rejects levels outside that range and
# not every device's range includes -37 dB.
range_min_db, range_max_db = volume.GetVolumeRange()[:2]
volume.SetMasterVolumeLevel(max(range_min_db, min(-37.0, range_max_db)), None)

# Initialize webcam (device index 0) and fail early with a clear message
# instead of crashing later on the first empty frame.
capture = cv2.VideoCapture(0)
if not capture.isOpened():
    raise RuntimeError("Could not open webcam (device index 0)")

# Mediapipe hand tracking setup. Only one hand is tracked: a single volume
# level is driven by the gesture, so a second hand would only produce
# conflicting values (and extra per-frame work).
mp_draw = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=1)

# Main loop: read webcam frames, measure the pixel distance between the thumb
# tip and the index fingertip of the detected hand, map that distance onto the
# system volume range (in dB), and draw the overlay. The loop ends when 'q' is
# pressed, the window is closed, or the camera stops delivering frames.

WINDOW_NAME = "Motion Sound Control"
THUMB_TIP_ID = 4                 # landmark index of the thumb tip
INDEX_TIP_ID = 8                 # landmark index of the index fingertip
DISTANCE_RANGE = [30, 250]       # fingertip distance (pixels) mapped to min..max volume
BAR_TOP, BAR_BOTTOM = 150, 400   # y pixel coordinates of the volume bar (full / empty)

# The dB range belongs to the endpoint bound to `volume`, so query it once
# instead of on every frame.
volRange = volume.GetVolumeRange()
minVol = volRange[0]  # Typically -65 dB
maxVol = volRange[1]  # Typically 0 dB

try:
    while True:
        success, img = capture.read()
        if not success or img is None:
            # No frame available (camera unplugged, busy, or stream ended);
            # converting a missing frame would raise inside OpenCV.
            break

        imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Detecting the hand
        results = hands.process(imgRGB)

        # If a hand has been detected, read the two fingertip positions needed.
        # Only the first hand drives the volume, so several visible hands do
        # not overwrite each other's level within one frame.
        if results.multi_hand_landmarks:
            handLms = results.multi_hand_landmarks[0]
            h, w, _ = img.shape

            # Convert normalized landmark coordinates to pixel values
            thumb_tip = handLms.landmark[THUMB_TIP_ID]
            index_tip = handLms.landmark[INDEX_TIP_ID]
            x1, y1 = int(thumb_tip.x * w), int(thumb_tip.y * h)
            x2, y2 = int(index_tip.x * w), int(index_tip.y * h)

            # Draw visual markers for thumb and index finger
            cv2.circle(img, (x1, y1), 10, (0, 255, 0), cv2.FILLED)     # Green for thumb
            cv2.circle(img, (x2, y2), 10, (255, 0, 0), cv2.FILLED)     # Blue for index
            cv2.line(img, (x1, y1), (x2, y2), (0, 0, 255), 3)          # Red line between them

            # Calculate the distance between the two fingertips
            distance = math.hypot(x2 - x1, y2 - y1)

            # Interpolate distance to volume levels. numpy.interp clamps
            # inputs outside DISTANCE_RANGE to the end values, so no extra
            # clamping is needed.
            vol = numpy.interp(distance, DISTANCE_RANGE, [minVol, maxVol])          # For system volume (dB)
            volPer = numpy.interp(distance, DISTANCE_RANGE, [0, 100])               # For UI %
            volBar = numpy.interp(distance, DISTANCE_RANGE, [BAR_BOTTOM, BAR_TOP])  # For volume bar UI

            # Set the actual system volume (plain float for the COM call)
            volume.SetMasterVolumeLevel(float(vol), None)

            # Display volume percentage
            cv2.putText(img, f'{int(volPer)}%', (50, 430), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 0), 3)
            # Draw volume bar outline
            cv2.rectangle(img, (50, BAR_TOP), (85, BAR_BOTTOM), (123, 213, 122), 3)
            # Fill the volume bar according to gesture
            cv2.rectangle(img, (50, int(volBar)), (85, BAR_BOTTOM), (0, 231, 23), cv2.FILLED)

        # Display the webcam feed with overlays
        cv2.imshow(WINDOW_NAME, img)

        # Exit condition: press 'q' or click the window's close button
        key = cv2.waitKey(1) & 0xFF
        window_open = cv2.getWindowProperty(WINDOW_NAME, cv2.WND_PROP_VISIBLE) >= 1
        if key == ord('q') or not window_open:
            break
finally:
    # Always release the webcam, the hand tracker and the OpenCV windows,
    # even when the loop is left through an exception or Ctrl+C.
    capture.release()
    hands.close()
    cv2.destroyAllWindows()

"""
Summary:
- distance = 30–250 pixels (thumb tip to index fingertip) ---> mapped linearly onto the system volume range in dB
- system volume range = typically from -65 dB to 0 dB
- distance <= 30 --> minimum volume (shown as 0%), distance >= 250 --> maximum volume (shown as 100%)
"""
