In [1]:
import cv2
import torch
import torch.nn as nn
import mediapipe as mp
import config

In [2]:


# Initialize and load the model.
# The layer layout must match the one used at training time so the keys in the
# saved state_dict line up. The network takes 42 input features (the flattened
# (x, y) landmark coordinates built by the inference loop) and produces one
# score per entry in config.class_names.
num_classes = len(config.class_names)
model = nn.Sequential(
    nn.Linear(42, 64),
    nn.ReLU(),
    nn.Linear(64, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, num_classes),
    nn.Sigmoid()
)

# Pick the device before loading so the checkpoint can be remapped onto it.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# map_location makes a checkpoint that was saved from a GPU loadable on a
# CPU-only machine; without it torch.load tries to restore the tensors onto
# the device they were saved from.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model.load_state_dict(
    torch.load('hand_images_classifier.pth', map_location=device)
)
model = model.to(device)


In [3]:
# Handles to the MediaPipe drawing utilities and the hands solution module.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# Hand detector limited to a single hand, with a 0.5 detection-confidence
# threshold and static-image mode switched off.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.5,
)

# Switch the classifier to evaluation mode. This is the last expression of the
# cell, so the module's repr is shown as the cell output.
model.eval()

Sequential(
  (0): Linear(in_features=42, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=128, bias=True)
  (3): ReLU()
  (4): Linear(in_features=128, out_features=64, bias=True)
  (5): ReLU()
  (6): Linear(in_features=64, out_features=32, bias=True)
  (7): ReLU()
  (8): Linear(in_features=32, out_features=11, bias=True)
  (9): Sigmoid()
)

In [4]:
# Real-time inference loop: grab frames from the default camera, detect a hand
# with MediaPipe, classify its landmark coordinates with the model, and show
# the annotated frame until 'q' is pressed or the camera stops delivering
# frames.

# Open a connection to the camera
cap = cv2.VideoCapture(0)

font = cv2.FONT_HERSHEY_SIMPLEX

try:
    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()

        if not ret:
            break

        frame_h, frame_w = frame.shape[:2]
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Process the frame and detect hands
        result = hands.process(rgb_frame)

        # Placeholder shown when no hand is detected in this frame.
        prediction = '$'

        # Draw hand landmarks and bounding box
        if result.multi_hand_landmarks:
            for hand_landmarks in result.multi_hand_landmarks:
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                # Build the feature vector per hand so it always has the
                # length the model expects, even if more than one hand is
                # ever returned.
                vec = []
                xs, ys = [], []
                for landmark in hand_landmarks.landmark:
                    vec += [landmark.x, landmark.y]
                    # Landmark coordinates are scaled by the frame size to
                    # obtain pixel positions.
                    xs.append(int(landmark.x * frame_w))
                    ys.append(int(landmark.y * frame_h))

                # Pixel extent of the landmarks. The extra 0 / frame-size
                # terms keep the same starting bounds as a running min/max
                # initialised to the frame limits.
                x_min, x_max = min(min(xs), frame_w), max(max(xs), 0)
                y_min, y_max = min(min(ys), frame_h), max(max(ys), 0)

                # Square box centred on the hand with half-side equal to 60%
                # of the larger extent, clipped to the frame.
                x_mid = (x_max + x_min) // 2
                y_mid = (y_max + y_min) // 2
                span = int(0.6 * max(x_max - x_min, y_max - y_min))

                x_min = max(x_mid - span, 0)
                x_max = min(x_mid + span, frame_w)
                y_min = max(y_mid - span, 0)
                y_max = min(y_mid + span, frame_h)

                # Draw on the frame that is actually displayed; drawing on a
                # separate copy left the box invisible.
                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

                # Prepare the landmark features for the model (batch of one).
                features = torch.tensor([vec], dtype=torch.float32, device=device)

                # Get the prediction
                with torch.no_grad():
                    outputs = model(features)
                    _, preds = torch.max(outputs, 1)
                prediction = config.class_names[preds[0].item()]

        # Display the prediction
        cv2.putText(frame, f'Prediction: {prediction}', (10, 30), font, 1, (0, 255, 0), 2, cv2.LINE_AA)
        cv2.imshow('Camera Feed with Prediction', frame)

        # Break the loop on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the capture and close windows, even if the loop raised or
    # the kernel was interrupted; otherwise the camera stays locked.
    cap.release()
    cv2.destroyAllWindows()


