In [1]:
import pickle #imports the 'pickle' module, which provides functionality for serializing and deserializing Python objects
import cv2 #imports the 'cv2' module, which is the OpenCV (Open Source Computer Vision) library for image and video processing
import mediapipe as mp #imports the 'mediapipe' module that offers a collection of pre-built, customizable machine learning models for various media processing tasks
import numpy as np #imports the 'numpy' module and assigns it the alias 'np'

# Load the trained classifier from './model_ASL2.p'. The pickle holds a dict
# whose 'model1' entry is the estimator used for prediction further down.
# NOTE: unpickling can execute arbitrary code -- only load model files that
# you created yourself or that come from a trusted source.
with open('./model_ASL2.p', 'rb') as model_file:  # 'with' closes the handle once loading is done
    model_dict1 = pickle.load(model_file)
model1 = model_dict1['model1']


# Open the default camera (device index 0) as the frame source.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    # Fail here with a clear message instead of crashing later on a missing frame.
    raise RuntimeError("Could not open the camera at index 0")

# Short aliases for the MediaPipe pieces used below.
mp_hands = mp.solutions.hands  # hand landmark detector and HAND_CONNECTIONS
mp_drawing = mp.solutions.drawing_utils  # draw_landmarks()
mp_drawing_styles = mp.solutions.drawing_styles  # default landmark / connection styles

# Hand detector configuration:
#   static_image_mode=True        -> every frame is treated as an independent image,
#                                    so detection runs on each frame.
#                                    NOTE(review): for a live stream False may be
#                                    faster -- confirm before changing.
#   max_num_hands=2               -> report at most two hands per frame.
#   min_detection_confidence=0.9  -> a hand is only reported when the detector's
#                                    confidence is at least 0.9.
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.9)

# Class id -> text shown on screen. The position of each name in the tuple
# is the integer class id it is looked up by.
labels_dict1 = dict(enumerate(("You", "Power")))

BOX_PADDING = 10  # pixels of margin drawn around the outermost landmarks


def _bounding_box(xs, ys, width, height, pad=BOX_PADDING):
    """Return the padded pixel box (x1, y1, x2, y2) that encloses the landmarks.

    xs, ys        -- landmark coordinates as fractions of the frame size
    width, height -- frame size in pixels
    pad           -- margin in pixels; subtracted from the top-left corner and
                     added to the bottom-right corner so the box grows outward
    """
    x1 = int(min(xs) * width) - pad
    y1 = int(min(ys) * height) - pad
    x2 = int(max(xs) * width) + pad
    y2 = int(max(ys) * height) + pad
    return x1, y1, x2, y2


def _hand_features(xs, ys):
    """Return [x0, y0, x1, y1, ...] with each coordinate shifted so min x and min y are 0."""
    min_x, min_y = min(xs), min(ys)  # computed once, not once per landmark
    features = []
    for x, y in zip(xs, ys):
        features.append(x - min_x)
        features.append(y - min_y)
    return features


# Capture frames, detect hands, classify single-hand poses and show the
# annotated frame until 'q' is pressed or the camera stops delivering frames.
try:
    while True:
        # Per-frame buffers: classifier input and the raw landmark coordinates.
        data_aux = []
        x_ = []
        y_ = []

        # 'ret' is False when no frame could be read; 'frame' is unusable then.
        ret, frame = cap.read()
        if not ret:
            print("Could not read a frame from the camera; stopping.")
            break

        H, W, _ = frame.shape  # frame height and width in pixels
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV frames are BGR; convert to RGB for the detector
        results = hands.process(frame_rgb)

        if results.multi_hand_landmarks:
            n = len(results.multi_hand_landmarks)  # number of hands detected

            for hand_landmarks in results.multi_hand_landmarks:
                # Overlay the landmarks and their connections on the displayed frame.
                mp_drawing.draw_landmarks(
                    frame,  # image to draw
                    hand_landmarks,  # model output
                    mp_hands.HAND_CONNECTIONS,  # hand connections
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style())

                # Collect the coordinates of every landmark of every detected hand.
                for landmark in hand_landmarks.landmark:
                    x_.append(landmark.x)
                    y_.append(landmark.y)

            # One box around everything that was detected.
            x1, y1, x2, y2 = _bounding_box(x_, y_, W, H)

            if n == 1:
                # The classifier is only applied to a single hand.
                data_aux = _hand_features(x_, y_)
                prediction1 = model1.predict([np.asarray(data_aux)])
                # predict() returns one value per sample; map its class id to the label text.
                predicted_character1 = labels_dict1[int(prediction1[0])]

                caption = predicted_character1
                box_color = (225, 225, 225)   # light grey (BGR)
                text_color = (255, 255, 255)  # white (BGR)
            else:
                # More than one hand: no prediction, just flag it.
                caption = "two hands detected"
                box_color = (128, 0, 0)   # dark blue (BGR)
                text_color = (128, 0, 0)

            cv2.rectangle(frame, (x1, y1), (x2, y2), box_color, 4)
            # Caption sits just above the box: font scale 1.3, thickness 3, anti-aliased.
            cv2.putText(frame, caption, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 1.3, text_color, 3, cv2.LINE_AA)

        # Show the annotated frame; waitKey(1) lets the window refresh and polls the keyboard.
        cv2.imshow('frame', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Runs on normal exit and on kernel interrupt, so the camera is not left locked.
    cap.release()
    cv2.destroyAllWindows()
    


KeyboardInterrupt: 

In [2]:
# Release the camera handle, then close every OpenCV window that is still open.
cap.release()
cv2.destroyAllWindows()