In [1]:
import cv2
import numpy as np
import torchvision.transforms as transforms
import pickle
import os
import torch
from torch.utils.data import Dataset
import torch.nn.functional as F
import torch.nn as nn
import mediapipe as mp
import torchvision


In [2]:
def process(image, hands, margin=20, border=100, out_size=(150, 150)):
    """Turn a camera frame into a grayscale image for the classifier.

    The frame is passed to ``hands.process``. For the first detected hand
    whose padded bounding box overlaps the frame, the box is cropped,
    converted to grayscale, surrounded by a constant near-black border and
    resized to ``out_size``. That result is also shown in an OpenCV window
    named "Cropped Hand with Border". If no usable hand is found, the whole
    frame is returned in grayscale instead, so the function never returns
    ``None``.

    Args:
        image: Colour frame in RGB channel order (it is converted RGB -> BGR
            here before any OpenCV processing).
        hands: Object exposing ``process(image)`` whose result has a
            ``multi_hand_landmarks`` attribute (e.g. a MediaPipe ``Hands``
            instance).
        margin: Pixels added on every side of the landmark bounding box.
        border: Width in pixels of the constant border added around the crop
            before resizing.
        out_size: ``(width, height)`` passed to ``cv2.resize`` for the
            bordered crop.

    Returns:
        A 2-D grayscale array: the bordered, resized hand crop, or the full
        frame in grayscale when no hand could be cropped.
    """
    results = hands.process(image)

    # Detection ran on the RGB frame; the OpenCV calls below work in BGR.
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    h, w = image.shape[:2]

    for hand_landmarks in results.multi_hand_landmarks or ():
        # Landmark coordinates are scaled by the frame size to get pixels.
        x_coords = [int(lm.x * w) for lm in hand_landmarks.landmark]
        y_coords = [int(lm.y * h) for lm in hand_landmarks.landmark]
        if not x_coords:
            continue

        # Clamp the padded box to the frame. Without this, a negative start
        # index is interpreted by NumPy as "from the end", which yields an
        # empty or wrong-region crop when the hand is near the top/left edge.
        xmin = min(max(min(x_coords) - margin, 0), w)
        xmax = min(max(max(x_coords) + margin, 0), w)
        ymin = min(max(min(y_coords) - margin, 0), h)
        ymax = min(max(max(y_coords) + margin, 0), h)

        cropped_image = image[ymin:ymax, xmin:xmax]
        if cropped_image.size == 0:
            # Hand lies entirely outside the frame; try the next one.
            continue

        cropped_image = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2GRAY)

        # Surround the crop with a constant near-black border (value 1).
        bordered_image = cv2.copyMakeBorder(
            cropped_image, border, border, border, border,
            cv2.BORDER_CONSTANT, value=1
        )
        bordered_image = cv2.resize(bordered_image, out_size)

        # Display the processed image
        cv2.imshow("Cropped Hand with Border", bordered_image)

        return bordered_image

    # No hand detected (or none overlapping the frame): use the whole frame.
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

In [3]:
class CNNModel(nn.Module):
    """Small convolutional classifier for single-channel 150x150 images.

    Three conv -> batch-norm -> ReLU -> max-pool stages are followed by two
    fully connected layers. ``forward`` returns raw class scores (no softmax
    is applied).

    Args:
        num_classes: Number of output scores produced by the last layer.
    """

    def __init__(self, num_classes=22):  # Change num_classes as needed
        super().__init__()

        # Stage 1: (1, 150, 150) -> conv 3x3 -> (32, 148, 148) -> pool -> (32, 74, 74)
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=0)
        self.bn1 = nn.BatchNorm2d(32)
        # One 2x2 / stride-2 pooling layer is shared by all three stages.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 2: conv 3x3 -> (64, 72, 72) -> pool -> (64, 36, 36)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(64)

        # Stage 3: conv 3x3 -> (128, 34, 34) -> pool -> (128, 17, 17)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3)
        self.bn3 = nn.BatchNorm2d(128)

        # Classifier head on the flattened (128, 17, 17) feature map.
        flat_features = 128 * 17 * 17
        self.fc1 = nn.Linear(flat_features, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return class scores of shape (batch, num_classes) for input ``x``."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, bn in stages:
            # Conv -> BatchNorm -> ReLU -> Pool
            x = self.pool(F.relu(bn(conv(x))))

        flat = x.view(x.size(0), -1)  # keep the batch axis, flatten the rest
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)


In [4]:

# Locations of the dataset, the trained weights and the index -> letter table,
# all resolved relative to the current working directory.
PATH = os.path.join(os.getcwd(),'..','..','archive', 'LIS-fingerspelling-dataset')
SAVEPATH = os.path.join(os.getcwd(), 'checkpoint','cnn', 'weights')
REVERSEPATH = os.path.join(os.getcwd(), 'checkpoint','cnn','reverselookup.pickle')


# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load lookup tables that were produced by a trusted training run.
with open(REVERSEPATH, 'rb') as handle:
    reverselookup = pickle.load(handle)

if not reverselookup:
    # Raise instead of calling exit(): exit() would shut down the notebook kernel.
    raise RuntimeError("Error: Could not load reverse lookup table.")


# Initialize MediaPipe
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils


# One output score per entry of the lookup table.
model = CNNModel(num_classes=len(reverselookup))
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(SAVEPATH, map_location='cpu', weights_only=True))
# Inference mode: BatchNorm must use its stored running statistics instead of
# per-batch statistics (and must not update them from single webcam frames).
model.eval()

# Define the transform (same as for single-image classification)
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((150, 150)),
    transforms.ToTensor(),
])

cap = cv2.VideoCapture(0)

try:
    if not cap.isOpened():
        print("Error: Could not open webcam.")

    with mp_hands.Hands(
        static_image_mode=False,
        max_num_hands=2,
        min_detection_confidence=0.7,
        min_tracking_confidence=0.7
    ) as hands:
        # No gradients are needed for prediction; skip autograd bookkeeping.
        with torch.no_grad():
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # Convert BGR to RGB and mirror horizontally. Only this copy is
                # mirrored; the window below still shows the original `frame`.
                rgb_frame = cv2.flip(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), 1)

                # Process image to make it look like dataset
                image = process(rgb_frame, hands)
                if image is None:
                    # Defensive fallback: classify the whole frame in grayscale.
                    image = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2GRAY)

                input_tensor = transform(image).unsqueeze(0)  # Add batch dim

                # Predict
                outputs = model(input_tensor)
                predicted = outputs.argmax(dim=1)
                letter = reverselookup[predicted.item()]

                # Display prediction
                cv2.putText(frame, f"Prediction: {letter}", (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

                cv2.imshow('Image Recognition', frame)

                # Quit when the user presses 'q'.
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
finally:
    # Always free the camera and close the windows, even after an exception.
    cap.release()
    cv2.destroyAllWindows()

I0000 00:00:1744673220.187450   50847 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1744673220.201860   50911 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.2.8-1ubuntu1~24.04.1), renderer: AMD Radeon Graphics (radeonsi, renoir, LLVM 19.1.1, DRM 3.59, 6.11.0-19-generic)
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1744673220.265282   50893 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1744673220.301960   50895 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
qt.qpa.plugin: Could not find the Qt platform plugin "wayland" in "/home/thomas/.cache/pypoetry/virtualenvs/lis-L4bSUyJ--py3.12/lib/python3.12/site-packages/cv2/qt/plugins"
W0000 00:00:1744673224.260643   50897 landmark_projection_calculator.cc:186] Using NORM_