# In [1]:
import cv2
import mediapipe as mp
import time
import os
import torch
import config

# Interactive data-collection script: shows the webcam feed, detects one hand
# with MediaPipe, and records its landmark coordinates under the class that is
# currently displayed on screen.
#
# Keys (read once per displayed frame):
#   e      - store the landmarks of the hand in the current frame
#   q      - advance to the next class
#   Enter  - save the collected samples to 'pos.tensor' / 'labs.tensor' and quit

# Class names come from the project config; `count % len(nums)` selects the
# class that is currently being recorded.
nums = config.class_names

# Samples collected in this session: one flat [x0, y0, x1, y1, ...] landmark
# vector per sample, and the class index each sample was recorded under.
positions = []
labels = []

# Index of the class to start with; 'q' increments it.
count = 7

# Previously saved data is merged in only when BOTH files are present. Checking
# just the labels file would crash at save time if 'pos.tensor' were missing.
previous_exist = os.path.exists('labs.tensor') and os.path.exists('pos.tensor')

# Initialize the webcam
cap = cv2.VideoCapture(0)

# Initialize MediaPipe hand detection
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

try:
    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()
        if not ret:
            # No frame could be read; leave the loop (nothing is saved here).
            break

        height, width = frame.shape[:2]

        # Untouched copy, so the crop below is free of the drawn overlays.
        copy_frame = frame.copy()

        # MediaPipe is fed RGB, while OpenCV delivers BGR.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Process the frame and detect hands
        result = hands.process(rgb_frame)

        # Landmark vector for this frame; stays empty when no hand is found.
        vec = []

        # Draw hand landmarks and bounding box
        if result.multi_hand_landmarks:
            for hand_landmarks in result.multi_hand_landmarks:
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                # The raw landmark values go into the feature vector; scaled by
                # the frame size they give pixel coordinates for the box.
                xs = []
                ys = []
                for landmark in hand_landmarks.landmark:
                    vec += [landmark.x, landmark.y]
                    xs.append(int(landmark.x * width))
                    ys.append(int(landmark.y * height))

                # Same starting values as a manual min/max scan: the maxima
                # never drop below 0 and the minima never exceed the frame size.
                x_max = max([0, *xs])
                y_max = max([0, *ys])
                x_min = min([width, *xs])
                y_min = min([height, *ys])

                # Square box centred on the hand, with half-side equal to 70%
                # of the larger extent, clamped to the frame.
                x_span = x_max - x_min
                y_span = y_max - y_min
                x_mid = (x_max + x_min) // 2
                y_mid = (y_max + y_min) // 2
                span = int(0.7 * max(x_span, y_span))

                x_min = max(x_mid - span, 0)
                x_max = min(x_mid + span, width)
                y_min = max(y_mid - span, 0)
                y_max = min(y_mid + span, height)

                # Crop of the hand region from the clean frame. It is not
                # resized or stored by this script.
                hand_image = copy_frame[y_min:y_max, x_min:x_max]
                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

        cv2.putText(frame, f'Show : {nums[count % len(nums)]}', (20, 80), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 0), 2, cv2.LINE_AA)

        # Display the resulting frame
        cv2.imshow('Hand Detection', frame)

        tmp = cv2.waitKey(1) & 0xFF

        # Save and leave the loop on 'Enter' key press
        if tmp == ord('\r'):
            # With no samples, torch.tensor([]) would be a float tensor of
            # shape (0,); concatenating or saving that would damage the stored
            # dataset, so the files are left untouched in that case.
            if positions:
                p = torch.tensor(positions)
                l = torch.tensor(labels)
                if previous_exist:
                    # New samples first, previously saved ones after.
                    p = torch.cat((p, torch.load('pos.tensor')), dim=0)
                    l = torch.cat((l, torch.load('labs.tensor')), dim=0)
                torch.save(p, 'pos.tensor')
                torch.save(l, 'labs.tensor')
            break

        # Record the current hand; ignored when no hand was detected.
        if tmp == ord('e') and vec:
            positions.append(vec)
            labels.append(count % len(nums))

        # Move on to the next class.
        if tmp == ord('q'):
            count += 1
finally:
    # Release the webcam, the MediaPipe graph and the windows even when the
    # loop is interrupted or raises.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()


