In [None]:
import cv2
import threading
import queue
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm
import torchvision.transforms as transforms
import mediapipe as mp
import pyautogui

# MediaPipe hand detector shared by the rest of the script.
mphands = mp.solutions.hands
hands = mphands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Checkpoint location and classifier configuration.
model_path = "./efficientnet1.pth"
# Labels are indexed by the classifier's predicted class id: the letters
# 'A'..'Z' followed by three extra labels.
classes_names = [chr(code) for code in range(ord('A'), ord('Z') + 1)] + ['del', 'nothing', 'space']
img_size = 224
model_name = "efficientnet_b3a"
num_classes = len(classes_names)


class SELFMODEL(nn.Module):
    """A timm model whose final classification layer is swapped for a new one.

    Args:
        model_name: Name passed to ``timm.create_model``. Its first three
            characters decide which attribute holds the layer to replace:
            ``"res"`` -> ``fc``, ``"vit"`` -> ``head``, anything else ->
            ``classifier``.
        out_features: Output size of the replacement ``nn.Linear`` layer.
        pretrained: Forwarded to ``timm.create_model``.
    """

    def __init__(self, model_name, out_features=num_classes, pretrained=True):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)

        # Pick the attribute that carries the final linear layer.
        if model_name.startswith("res"):
            head_attr = "fc"
        elif model_name.startswith("vit"):
            head_attr = "head"
        else:
            head_attr = "classifier"

        # Keep the layer's input width, replace its output width.
        n_features = getattr(self.model, head_attr).in_features
        setattr(self.model, head_attr, nn.Linear(n_features, out_features))

    def forward(self, x):
        """Run ``x`` through the wrapped model and return its output."""
        return self.model(x)


# Map a recognized gesture to a simulated key press
def gesture_control(recognized_gestures):
    """Tap the key bound to the first matching gesture, if any.

    Args:
        recognized_gestures: Container tested with ``in`` against each gesture
            label. The callers in this file pass a single label string.

    The bindings are checked in the order listed below and at most one key is
    tapped (key down immediately followed by key up). Nothing happens when no
    binding matches.
    """
    key_bindings = (
        ('space', 'e'),
        ('O', 'q'),
        ('N', 'w'),
        ('B', 's'),
        ('S', 'a'),
        ('F', 'd'),
    )
    for gesture, key in key_bindings:
        if gesture in recognized_gestures:
            pyautogui.keyDown(key)
            pyautogui.keyUp(key)
            return


# Define the frame processing function
def process_frames():
    """Consume frames from ``frame_queue``, classify each detected hand, show the result.

    For every frame: detect hands with the module-level MediaPipe ``hands``
    object, classify a padded crop around each hand with ``model``, forward the
    predicted label to ``gesture_control``, and display the annotated frame.
    The loop ends when ESC (key code 27) is pressed in the display window.

    NOTE(review): ``frame_queue.get()`` has no timeout, so this function waits
    indefinitely if no more frames are ever queued.
    NOTE(review): the crop handed to ``transform_frame`` keeps the channel
    order of the captured frame (the frame is converted BGR->RGB only for hand
    detection) -- confirm this matches what the classifier was trained on.
    """
    while True:
        frame = frame_queue.get()
        h, w = frame.shape[:2]
        framergb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(framergb)
        hand_landmarks = result.multi_hand_landmarks
        if hand_landmarks:
            # Crop from an untouched copy so rectangles and labels drawn for
            # this or a previous hand never end up in the classifier input.
            clean_frame = frame.copy()
            for index, handLMs in enumerate(hand_landmarks):
                x_min, y_min, x_max, y_max = find_bounding_box(handLMs.landmark, w, h)
                cropped_frame = clean_frame[y_min:y_max, x_min:x_max]
                if cropped_frame.size == 0:
                    # Degenerate box: nothing to classify for this hand.
                    continue
                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                input_tensor = transform_frame(cropped_frame)
                predicted_class = classify_gesture(model, input_tensor, classes_names)
                gesture_name = classes_names[predicted_class]
                gesture_control(gesture_name)
                display_gesture_prediction(frame, gesture_name, index)
        else:
            cv2.putText(frame, "nothing", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        cv2.imshow("Frame", frame)
        if cv2.waitKey(1) & 0xFF == 27:
            break


# Define the helper functions (find_bounding_box, transform_frame, classify_gesture, display_gesture_prediction)


def find_bounding_box(landmarks, w, h, margin=50):
    """Return a padded pixel bounding box around ``landmarks``.

    Args:
        landmarks: Iterable of objects with ``x`` and ``y`` attributes; each is
            scaled by ``w`` / ``h`` and truncated to an integer pixel position.
        w: Frame width in pixels.
        h: Frame height in pixels.
        margin: Padding in pixels added on every side of the tight box.

    Returns:
        ``(x_min, y_min, x_max, y_max)``, with x values clamped to ``[0, w]``
        and y values clamped to ``[0, h]``. An empty ``landmarks`` iterable
        yields the empty box ``(0, 0, 0, 0)``.
    """
    points = [(int(lm.x * w), int(lm.y * h)) for lm in landmarks]
    if not points:
        return 0, 0, 0, 0
    xs, ys = zip(*points)

    def clamp(value, upper):
        # Both corners are clamped on both sides so landmarks that fall
        # outside the frame can never produce negative or oversized indices.
        return max(0, min(upper, value))

    return (
        clamp(min(xs) - margin, w),
        clamp(min(ys) - margin, h),
        clamp(max(xs) + margin, w),
        clamp(max(ys) + margin, h),
    )


def transform_frame(frame):
    """Convert an image array into a normalised, batched tensor.

    The frame is turned into a PIL image, resized to 224x224, converted to a
    tensor, normalised per channel, and given a leading batch dimension.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    steps = [
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    preprocess = transforms.Compose(steps)
    tensor = preprocess(frame)
    # Add the batch axis in front.
    return tensor.unsqueeze(0)


def classify_gesture(model, input_tensor, classes_names):
    """Return the index of the highest-scoring class for one input.

    Args:
        model: Callable (typically an ``nn.Module``) producing class scores
            with the batch on the first axis.
        input_tensor: Batched input; only the first item's scores are used.
        classes_names: Unused; kept so existing callers keep working.

    Returns:
        The predicted class index as a Python ``int``.
    """
    # Run the input on whatever device the model's weights live on, so the
    # call works whether the model is on the CPU or has been moved to a GPU.
    parameters = getattr(model, "parameters", None)
    first_param = next(parameters(), None) if callable(parameters) else None
    if first_param is not None:
        input_tensor = input_tensor.to(first_param.device)

    with torch.no_grad():
        scores = model(input_tensor)
    # Softmax is monotonic, so the argmax of the raw scores is the same class.
    return torch.argmax(scores[0]).item()


def display_gesture_prediction(frame, gesture_name, index):
    """Write the predicted gesture for hand number ``index`` onto ``frame``.

    Each hand gets its own text row (30 px apart) and the second colour
    component grows by 100 per hand index so the rows are distinguishable.
    """
    label = f"{gesture_name} for Hand {index + 1}"
    origin = (10, 30 + 30 * index)
    colour = (255, index * 100, 0)
    cv2.putText(frame, label, origin, cv2.FONT_HERSHEY_SIMPLEX, 1, colour, 2)


# Build the classifier and load its weights. The checkpoint is mapped onto the
# detected `device` rather than a hard-coded CUDA device, so loading also works
# on machines without a GPU.
# NOTE(review): the model itself is not moved to `device` here; doing so also
# requires every input tensor to be moved to the same device.
model = SELFMODEL(model_name=model_name, out_features=num_classes, pretrained=False)
weights = torch.load(model_path, map_location=device)
model.load_state_dict(weights)
model.eval()

# Create a queue for passing frames between capture and processing threads
frame_queue = queue.Queue(maxsize=5)

# Create the video capture object exactly once; stop right away when the
# camera is unavailable instead of continuing without a frame source.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("Error: Camera not found or cannot be opened.")
index = 0


# Set once the processing side has finished so the capture loop can exit.
stop_capture = threading.Event()


# Capture loop executed by the capture thread
def capture_frames():
    """Read frames from ``cap`` and queue them until told to stop.

    The loop ends when a read fails or when ``stop_capture`` is set. Queueing
    uses a short timeout and retries the same frame, so a full queue still
    applies back-pressure but can no longer block this thread forever once
    the consumer has gone away.
    """
    while not stop_capture.is_set():
        ok, frame = cap.read()
        if not ok:
            break
        while not stop_capture.is_set():
            try:
                frame_queue.put(frame, timeout=0.1)
                break
            except queue.Full:
                continue


# Start processing thread
# NOTE(review): process_frames drives the OpenCV window from a non-main
# thread; confirm this is supported by the GUI backend in use.
processing_thread = threading.Thread(target=process_frames)
processing_thread.start()

# Start capture thread
capture_thread = threading.Thread(target=capture_frames)
capture_thread.start()

try:
    # The processing thread decides when the session ends (ESC pressed), so
    # wait for it first and only then stop the capture loop.
    processing_thread.join()
finally:
    stop_capture.set()
    capture_thread.join()
    # Release video capture object and destroy windows
    cap.release()
    cv2.destroyAllWindows()


In [1]:
import torch
import torch.nn as nn
import timm
import cv2
import torchvision.transforms as transforms
import torch.nn.functional as F
import mediapipe as mp
import time
import pyautogui



# MediaPipe hand detector shared by the rest of the script.
mphands = mp.solutions.hands
hands = mphands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5)


# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Checkpoint location and classifier configuration.
model_path = "./efficientnet1.pth"
# Labels are indexed by the classifier's predicted class id: the letters
# 'A'..'Z' followed by three extra labels.
classes_names = [chr(code) for code in range(ord('A'), ord('Z') + 1)] + ['del', 'nothing', 'space']
img_size = 224
model_name = "efficientnet_b3a"
num_classes = len(classes_names)


class SELFMODEL(nn.Module):
    """A timm model whose final classification layer is swapped for a new one.

    Args:
        model_name: Name passed to ``timm.create_model``. Its first three
            characters decide which attribute holds the layer to replace:
            ``"res"`` -> ``fc``, ``"vit"`` -> ``head``, anything else ->
            ``classifier``.
        out_features: Output size of the replacement ``nn.Linear`` layer.
        pretrained: Forwarded to ``timm.create_model``.
    """

    def __init__(self, model_name, out_features=num_classes, pretrained=True):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)

        # Pick the attribute that carries the final linear layer.
        if model_name.startswith("res"):
            head_attr = "fc"
        elif model_name.startswith("vit"):
            head_attr = "head"
        else:
            head_attr = "classifier"

        # Keep the layer's input width, replace its output width.
        n_features = getattr(self.model, head_attr).in_features
        setattr(self.model, head_attr, nn.Linear(n_features, out_features))

    def forward(self, x):
        """Run ``x`` through the wrapped model and return its output."""
        return self.model(x)

def gesture_control(recognized_gestures):
    """Tap the key bound to the first matching gesture, if any.

    Args:
        recognized_gestures: Container tested with ``in`` against each gesture
            label. The callers in this file pass a single label string.

    The bindings are checked in the order listed below and at most one key is
    tapped (key down immediately followed by key up). Nothing happens when no
    binding matches.
    """
    key_bindings = (
        ('space', 'e'),
        ('O', 'q'),
        ('N', 'w'),
        ('B', 's'),
        ('S', 'a'),
        ('F', 'd'),
    )
    for gesture, key in key_bindings:
        if gesture in recognized_gestures:
            pyautogui.keyDown(key)
            pyautogui.keyUp(key)
            return


def find_bounding_box(landmarks, w, h, margin=50):
    """Return a padded pixel bounding box around ``landmarks``.

    Args:
        landmarks: Iterable of objects with ``x`` and ``y`` attributes; each is
            scaled by ``w`` / ``h`` and truncated to an integer pixel position.
        w: Frame width in pixels.
        h: Frame height in pixels.
        margin: Padding in pixels added on every side of the tight box.

    Returns:
        ``(x_min, y_min, x_max, y_max)``, with x values clamped to ``[0, w]``
        and y values clamped to ``[0, h]``. An empty ``landmarks`` iterable
        yields the empty box ``(0, 0, 0, 0)``.
    """
    points = [(int(lm.x * w), int(lm.y * h)) for lm in landmarks]
    if not points:
        return 0, 0, 0, 0
    xs, ys = zip(*points)

    def clamp(value, upper):
        # Both corners are clamped on both sides so landmarks that fall
        # outside the frame can never produce negative or oversized indices.
        return max(0, min(upper, value))

    return (
        clamp(min(xs) - margin, w),
        clamp(min(ys) - margin, h),
        clamp(max(xs) + margin, w),
        clamp(max(ys) + margin, h),
    )

def transform_frame(frame):
    """Convert an image array into a normalised, batched tensor.

    The frame is turned into a PIL image, resized to 224x224, converted to a
    tensor, normalised per channel, and given a leading batch dimension.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    steps = [
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    preprocess = transforms.Compose(steps)
    tensor = preprocess(frame)
    # Add the batch axis in front.
    return tensor.unsqueeze(0)

def classify_gesture(model, input_tensor, classes_names):
    """Return the index of the highest-scoring class for one input.

    Args:
        model: Callable (typically an ``nn.Module``) producing class scores
            with the batch on the first axis.
        input_tensor: Batched input; only the first item's scores are used.
        classes_names: Unused; kept so existing callers keep working.

    Returns:
        The predicted class index as a Python ``int``.
    """
    # Run the input on whatever device the model's weights live on, so the
    # call works whether the model is on the CPU or has been moved to a GPU.
    parameters = getattr(model, "parameters", None)
    first_param = next(parameters(), None) if callable(parameters) else None
    if first_param is not None:
        input_tensor = input_tensor.to(first_param.device)

    with torch.no_grad():
        scores = model(input_tensor)
    # Softmax is monotonic, so the argmax of the raw scores is the same class.
    return torch.argmax(scores[0]).item()

def display_gesture_prediction(frame, gesture_name, index):
    """Write the predicted gesture for hand number ``index`` onto ``frame``.

    Each hand gets its own text row (30 px apart) and the second colour
    component grows by 100 per hand index so the rows are distinguishable.
    """
    label = f"{gesture_name} for Hand {index + 1}"
    origin = (10, 30 + 30 * index)
    colour = (255, index * 100, 0)
    cv2.putText(frame, label, origin, cv2.FONT_HERSHEY_SIMPLEX, 1, colour, 2)


# Build the classifier and load its weights. The checkpoint is mapped onto the
# detected `device` rather than a hard-coded CUDA device, so loading also works
# on machines without a GPU.
# NOTE(review): the model itself is not moved to `device` here; doing so also
# requires every input tensor to be moved to the same device.
model = SELFMODEL(model_name=model_name, out_features=num_classes, pretrained=False)
weights = torch.load(model_path, map_location=device)
model.load_state_dict(weights)
model.eval()

# Open the camera and stop right away when it is unavailable instead of
# entering the capture loop without a frame source.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("Error: Camera not found or cannot be opened.")
index = 0

# Only one frame out of every `downsample_factor` is run through detection
# and classification; 1 means every frame is processed.
downsample_factor = 1
downsample_counter = 0
# Most recent predicted label, replayed on frames that are skipped.
classA = "nothing"

# Main loop: read frames, classify the hands on every sampled frame, replay
# the last gesture on skipped frames. Ends on a failed read or when ESC is
# pressed. Cleanup runs even if an exception escapes the loop.
try:
    while True:
        ok, frame = cap.read()
        # Check the read result before touching the frame: it is not a valid
        # image when the read failed.
        if not ok:
            break
        h, w = frame.shape[:2]

        # Downsample frames
        if downsample_counter % downsample_factor == 0:
            framergb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            result = hands.process(framergb)
            hand_landmarks = result.multi_hand_landmarks
            if hand_landmarks:
                # Crop from an untouched copy so rectangles and labels drawn
                # for this or a previous hand never reach the classifier.
                clean_frame = frame.copy()
                for index, handLMs in enumerate(hand_landmarks):
                    x_min, y_min, x_max, y_max = find_bounding_box(handLMs.landmark, w, h)
                    cropped_frame = clean_frame[y_min:y_max, x_min:x_max]
                    if cropped_frame.size == 0:
                        # Degenerate box: nothing to classify for this hand.
                        continue
                    cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                    input_tensor = transform_frame(cropped_frame)
                    predicted_class = classify_gesture(model, input_tensor, classes_names)
                    classA = classes_names[predicted_class]
                    gesture_control(classA)
                    display_gesture_prediction(frame, classA, index)
            else:
                # No hand on this sampled frame: forget the previous gesture
                # so it is not replayed on the skipped frames that follow.
                classA = "nothing"
                cv2.putText(frame, "nothing", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.imshow("Frame", frame)
            if cv2.waitKey(1) & 0xFF == 27:
                break
        else:
            # Skipped frame: repeat the most recent gesture.
            gesture_control(classA)

        downsample_counter += 1
finally:
    cap.release()
    cv2.destroyAllWindows()
