In [8]:
import mediapipe as mp #needs python 3.10 or less!!
import cv2

In [9]:
# Shortcuts into the MediaPipe solutions namespace.
mp_hands = mp.solutions.hands
# Draws the landmark graph over each hand. Only a visual reference for now;
# may or may not also be needed later to draw the piano lines.
mp_draw = mp.solutions.drawing_utils

# Hand tracker used on every webcam frame.
hands = mp_hands.Hands(
    static_image_mode=False,         # input is a continuous video, not stills
    max_num_hands=2,                 # track up to two hands
    min_detection_confidence=0.5,    # starting value - worth experimenting with
    min_tracking_confidence=0.5,     # starting value - worth experimenting with
)

In [10]:
# ---- piano layout constants ----

# Landmark indices used to look up each fingertip: thumb, index, middle, ring, pinky.
finger_tip_ids = [4, 8, 12, 16, 20]

# White keys: 15 of them, so the keyboard runs from C1 up to C3.
white_keys = 15
# Taken from frame.shape[1] // white_keys; has to change if the app size changes.
white_width = 42
# Vertical limits of the piano area in the frame.
top_y, bottom_y = 240, 480

# NOTE(review): black_key_dictionary below holds 10 entries, and this value is
# not read anywhere in the visible cells - confirm what 13 is meant to count.
black_keys = 13

# Black keys start on the same top line as the white keys but are narrower and
# shorter; the 0.6 ratio was picked by eye.
_black_scale = 0.6
black_width = int(white_width * _black_scale)
black_height = int((bottom_y - top_y) * _black_scale)
black_bottom = top_y + black_height

# White key index (left to right) -> note name.
white_key_dictionary = dict(enumerate([
    'C1', 'D1', 'E1', 'F1', 'G1', 'A1', 'B1',
    'C2', 'D2', 'E2', 'F2', 'G2', 'A2', 'B2',
    'C3',
]))

# Index of the white-key boundary a black key is centred on -> note name.
# Indices 3, 7 and 10 are absent: no black key sits between E-F or B-C.
black_key_dictionary = dict(zip(
    [1, 2, 4, 5, 6, 8, 9, 11, 12, 13],
    ["C#1", "D#1", "F#1", "G#1", "A#1", "C#2", "D#2", 'F#2', "G#2", "A#2"],
))

#this will be used later on to make sure that it fits into the dimensions
#of the key that we're trying to find
def is_point_in_rect(point, rect):
    """Tell whether ``point`` lies inside ``rect``, edges included.

    point -- an (x, y) pair.
    rect  -- a (x1, y1, x2, y2) tuple: first corner followed by the opposite
             corner, with x1 <= x2 and y1 <= y2.

    Returns True when x1 <= x <= x2 and y1 <= y <= y2, otherwise False.
    """
    px, py = point
    left, top, right, bottom = rect
    within_x = left <= px <= right
    within_y = top <= py <= bottom
    return within_x and within_y


In [11]:
# Start the camera. Index 0 is the first device; with more than one camera
# attached the right index can vary.
capture = cv2.VideoCapture(0)
if not capture.isOpened():
    # Release the handle and fail with an exception rather than exit():
    # exit() is an interactive-shell helper and does not report the failure
    # as an error, whereas raising stops the run here with a clear message.
    capture.release()
    raise RuntimeError("Error: Could not open your camera")

In [None]:
# FOR FUTURE: placeholder for the mapping from each key to its sound note.
key_notes = dict()

In [None]:
# ---- key hit-boxes -------------------------------------------------------
# Built once before the loop: the keyboard layout never changes between frames.
# Every entry is (label, (x1, y1, x2, y2)). The labels are what gets reported
# on a press and can later be combined with audio files to produce the sound.
white_labels = ["C", "D", "E", "F", "G", "A", "B",
                "C", "D", "E", "F", "G", "A", "B", "C"]
black_labels = ["C#", "D#", "F#", "G#", "A#",
                "C#", "D#", "F#", "G#", "A#"]

# White key i spans one key width, from i * white_width to (i + 1) * white_width.
white_rectangles = [
    (white_labels[i], (i * white_width, top_y, (i + 1) * white_width, bottom_y))
    for i in range(white_keys)
]

# Each black key is centred on the boundary between two white keys; black_pos
# lists those boundaries (same positions as the keys of black_key_dictionary).
black_pos = [1, 2, 4, 5, 6, 8, 9, 11, 12, 13]
black_rectangles = [
    (black_labels[i], (pos * white_width - black_width // 2, top_y,
                       pos * white_width + black_width // 2, black_bottom))
    for i, pos in enumerate(black_pos)
]

# Black keys are drawn on top of the white keys, so they are tested first.
hit_test_order = black_rectangles + white_rectangles


def _find_pressed_key(point, rectangles):
    """Return the label of the first rectangle containing ``point``, else None.

    point      -- (x, y) pixel position of a fingertip.
    rectangles -- iterable of (label, (x1, y1, x2, y2)) in priority order.
    """
    for label, rect in rectangles:
        if is_point_in_rect(point, rect):
            return label
    return None


# Fingertips that were below the piano's top line on the previous frame, stored
# as (hand index, fingertip id). This has to live outside the loop: a key should
# fire once when a finger crosses the line, not on every frame it stays down,
# and each finger needs its own state so one finger does not mask the others.
# NOTE(review): assumes the order of hands in multi_hand_landmarks stays the
# same between consecutive frames - confirm; if it swaps, a held finger can
# retrigger once.
fingers_below_line = set()

try:
    while True:  # runs until the camera fails or 'i' is pressed
        # ret tells whether the read succeeded, frame is the image
        ret, frame = capture.read()
        if not ret:
            print("Error: Could not read the frame")
            break
        frame = cv2.flip(frame, 1)  # flip horizontally for selfie-view
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb_frame)

        h, w, _ = frame.shape

        # the middle/piano line
        mid_y = round(h / 2)
        cv2.line(img=frame, pt1=(0, mid_y), pt2=(w, mid_y), color=(255, 0, 0), thickness=5)

        # WHITE KEYS: solid white fill (-1), then a black outline 2 px thick
        for _, (x1, y1, x2, y2) in white_rectangles:
            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 255), -1)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 0), 2)
        # BLACK KEYS: solid black fill, drawn over the white keys
        for _, (x1, y1, x2, y2) in black_rectangles:
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 0), -1)

        fingers_below_now = set()

        if result.multi_hand_landmarks:
            for hand_index, hand_landmarks in enumerate(result.multi_hand_landmarks):
                # draw the hand graph (circles and connections) on the frame
                mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                landmarks = hand_landmarks.landmark

                for tip_id in finger_tip_ids:
                    # landmark x/y are scaled by the frame size to get pixels
                    x = int(landmarks[tip_id].x * w)
                    y = int(landmarks[tip_id].y * h)
                    cv2.circle(frame, (x, y), 5, (0, 255, 0), cv2.FILLED)

                    if y <= top_y:
                        # above the line: this finger is lifted, so it is left
                        # out of fingers_below_now and may trigger again later
                        continue

                    finger = (hand_index, tip_id)
                    fingers_below_now.add(finger)
                    if finger in fingers_below_line:
                        continue  # still held down from an earlier frame

                    # this finger just crossed below the line
                    pressed_key = _find_pressed_key((x, y), hit_test_order)
                    if pressed_key:
                        print(f"pressed {pressed_key}")

        # Hands that left the frame drop out here as well, since only the
        # fingers seen below the line on this frame are carried over.
        fingers_below_line = fingers_below_now

        # show the frame; 'Finger Tracking' is the window title
        cv2.imshow('Finger Tracking', frame)

        # Waits up to 5 ms for a key press; 'i' (for Illinois) ends the loop.
        # The wait may need to be increased if polling this often is too costly.
        if cv2.waitKey(5) & 0xFF == ord('i'):
            break
finally:
    # Always free the camera and close the window, even when the loop is
    # interrupted or raises - otherwise the camera stays locked.
    capture.release()
    cv2.destroyAllWindows()