In [1]:
import os
import cv2
import numpy as np
import mediapipe as mp
import json
import time
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
IMAGES_DIR = './images'
JSON_DIR = './json/'
JSON_DATA_DIR = './json/hand_landmarks_data.json'
JOBLIB_SAVE_DIR = './joblib/'
JOBLIB_DATA_DIR = './joblib/hand_gesture_model.joblib'
GESTURE_MACHINE_DIR = './json/Gesture_Machine.json'
single_hand_labels = ['Hallo', 'danke', 'ich', 'bin', 'S', 'E', 'R', 'T', 'A', 'N', 'viel', 'spass']
double_hand_labels = ['das ist','und', 'Video', 'Arbeit', 'Abgabe']

In [12]:
def capture_images(IMAGES_DIR, single_hand_labels, double_hand_labels, num_images_per_label=50):
    if not os.path.exists(IMAGES_DIR):
        os.makedirs(IMAGES_DIR)

    cap = cv2.VideoCapture(0)

    for label in single_hand_labels:
        print(f"Bereit für Einzelhandzeichen: {label}. Drücke 'g', um Bilder aufzunehmen.")
        while True:
            ret, frame = cap.read()
            cv2.imshow('frame', frame)
            if cv2.waitKey(1) & 0xFF == ord('g'):
                # 3 Sekunden Countdown
                for i in range(3, 0, -1):
                    print(f"Capturing in {i} seconds...")
                    time.sleep(1)

                # Aufnahme für die rechte Hand
                label_dir = os.path.join(IMAGES_DIR, label)
                os.makedirs(label_dir, exist_ok=True)
                for i in range(num_images_per_label):
                    ret, frame = cap.read()
                    img_name = f"{label_dir}/{label}_right_{i}.jpg"
                    cv2.imwrite(img_name, frame)
                    print(f"Captured {img_name}")
                    cv2.imshow('frame', frame)
                    cv2.waitKey(100)

                print(f"Fertig mit dem ersten Satz für {label}. Jetzt die andere Hand bereit machen und 'g' drücken, um fortzufahren.")
            
                while True:
                    if cv2.waitKey(1) & 0xFF == ord('g'):
                        break

                # Aufnahme für die linke Hand
                for i in range(num_images_per_label):
                    ret, frame = cap.read()
                    img_name = f"{label_dir}/{label}_left_{i}.jpg"
                    cv2.imwrite(img_name, frame)
                    print(f"Captured {img_name}")
                    cv2.imshow('frame', frame)
                    cv2.waitKey(100)

                print(f"Fertig mit Einzelhandzeichen: {label}. Drücke 'g' für das nächste Label.")
                break

    # Bildaufnahme für Doppelhandzeichen
    for label in double_hand_labels:
        print(f"Bereit für Doppelhandzeichen: {label}. Drücke 'g', um Bilder aufzunehmen.")
        while True:
            ret, frame = cap.read()
            cv2.imshow('frame', frame)
            if cv2.waitKey(1) & 0xFF == ord('g'):
                # 3 Sekunden Countdown
                for i in range(3, 0, -1):
                    print(f"Capturing in {i} seconds...")
                    time.sleep(1)

                # Aufnahme für Doppelhand
                label_dir = os.path.join(IMAGES_DIR, label)
                os.makedirs(label_dir, exist_ok=True)
                for i in range(num_images_per_label):
                    ret, frame = cap.read()
                    img_name = f"{label_dir}/{label}_{i}.jpg"
                    cv2.imwrite(img_name, frame)
                    print(f"Captured {img_name}")
                    cv2.imshow('frame', frame)
                    cv2.waitKey(100)

                print(f"Fertig mit Doppelhandzeichen: {label}. Drücke 'g' für das nächste Label.")
                break

    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)

capture_images(IMAGES_DIR, single_hand_labels, double_hand_labels)

Bereit für Doppelhandzeichen: Das ist. Drücke 'g', um Bilder aufzunehmen.
Capturing in 3 seconds...
Capturing in 2 seconds...
Capturing in 1 seconds...
Captured ./images/Das ist/Das ist_0.jpg
Captured ./images/Das ist/Das ist_1.jpg
Captured ./images/Das ist/Das ist_2.jpg
Captured ./images/Das ist/Das ist_3.jpg
Captured ./images/Das ist/Das ist_4.jpg
Captured ./images/Das ist/Das ist_5.jpg
Captured ./images/Das ist/Das ist_6.jpg
Captured ./images/Das ist/Das ist_7.jpg
Captured ./images/Das ist/Das ist_8.jpg
Captured ./images/Das ist/Das ist_9.jpg
Captured ./images/Das ist/Das ist_10.jpg
Captured ./images/Das ist/Das ist_11.jpg
Captured ./images/Das ist/Das ist_12.jpg
Captured ./images/Das ist/Das ist_13.jpg
Captured ./images/Das ist/Das ist_14.jpg
Captured ./images/Das ist/Das ist_15.jpg
Captured ./images/Das ist/Das ist_16.jpg
Captured ./images/Das ist/Das ist_17.jpg
Captured ./images/Das ist/Das ist_18.jpg
Captured ./images/Das ist/Das ist_19.jpg
Captured ./images/Das ist/Das ist_20.j

In [16]:
def augment_image(img):
    rows, cols, _ = img.shape

    # herauszoomen
    scale = np.random.uniform(1.0, 1.3)
    img = cv2.resize(img, None, fx=scale, fy=scale)

    # Zufällige Verschiebung (nur nach links und rechts)
    max_dx = 0.1 * cols
    dx = np.random.uniform(-max_dx, max_dx)
    M = np.float32([[1, 0, dx], [0, 1, 0]])
    img = cv2.warpAffine(img, M, (cols, rows))

    return img

def augment_and_save_images(img_dir, num_augmented_images):
    for img_name in os.listdir(img_dir):
        img_path = os.path.join(img_dir, img_name)
        if os.path.isfile(img_path):
            img = cv2.imread(img_path)
            
            for i in range(num_augmented_images):
                augmented_img = augment_image(img)
                 
                augmented_img_name = f"{img_name.split('.')[0]}_aug_{i}.jpg"
                augmented_img_path = os.path.join(img_dir, augmented_img_name)
                
                cv2.imwrite(augmented_img_path, augmented_img)
                print(f"Augmentiertes Bild gespeichert: {augmented_img_path}")

for label in single_hand_labels:
    label_path = os.path.join(IMAGES_DIR, label)
    if os.path.isdir(label_path):
        augment_and_save_images(label_path, num_augmented_images=5)

for label in double_hand_labels:
    label_path = os.path.join(IMAGES_DIR, label)
    if os.path.isdir(label_path):
        augment_and_save_images(label_path, num_augmented_images=5)

Augmentiertes Bild gespeichert: ./images/Hallo/Hallo_left_10_aug_4_aug_0.jpg
Augmentiertes Bild gespeichert: ./images/Hallo/Hallo_left_10_aug_4_aug_1.jpg
Augmentiertes Bild gespeichert: ./images/Hallo/Hallo_left_10_aug_4_aug_2.jpg
Augmentiertes Bild gespeichert: ./images/Hallo/Hallo_left_10_aug_4_aug_3.jpg
Augmentiertes Bild gespeichert: ./images/Hallo/Hallo_left_10_aug_4_aug_4.jpg
Augmentiertes Bild gespeichert: ./images/Hallo/Hallo_right_46_aug_4_aug_0.jpg
Augmentiertes Bild gespeichert: ./images/Hallo/Hallo_right_46_aug_4_aug_1.jpg
Augmentiertes Bild gespeichert: ./images/Hallo/Hallo_right_46_aug_4_aug_2.jpg
Augmentiertes Bild gespeichert: ./images/Hallo/Hallo_right_46_aug_4_aug_3.jpg
Augmentiertes Bild gespeichert: ./images/Hallo/Hallo_right_46_aug_4_aug_4.jpg
Augmentiertes Bild gespeichert: ./images/Hallo/Hallo_left_23_aug_2_aug_0.jpg
Augmentiertes Bild gespeichert: ./images/Hallo/Hallo_left_23_aug_2_aug_1.jpg
Augmentiertes Bild gespeichert: ./images/Hallo/Hallo_left_23_aug_2_aug_

KeyboardInterrupt: 

In [17]:
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3)

if not os.path.exists(JSON_DIR):
    os.makedirs(JSON_DIR)

def save_landmarks_to_json(IMAGES_DIR, json_path):
    if not os.path.exists(IMAGES_DIR):
        print(f"Das Verzeichnis {IMAGES_DIR} existiert nicht.")
        return

    data = []
    labels_collected = []

    # Verarbeite Einzelhandzeichen
    for label in single_hand_labels:
        label_dir = os.path.join(IMAGES_DIR, label)
        if os.path.isdir(label_dir):
            for img_name in os.listdir(label_dir):
                img_path = os.path.join(label_dir, img_name)
                img = cv2.imread(img_path)
                img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                
                results = hands.process(img_rgb)
                if results.multi_hand_landmarks:
                    for hand_landmarks in results.multi_hand_landmarks:
                        landmarks = [[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark]
                        data.append(landmarks)
                        labels_collected.append(label)

    # Verarbeite Doppelhandzeichen
    for label in double_hand_labels:
        label_dir = os.path.join(IMAGES_DIR, label)
        if os.path.isdir(label_dir):
            for img_name in os.listdir(label_dir):
                img_path = os.path.join(label_dir, img_name)
                img = cv2.imread(img_path)
                img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

                results = hands.process(img_rgb)
                if results.multi_hand_landmarks and len(results.multi_hand_landmarks) == 2:
                    for hand_landmarks in results.multi_hand_landmarks:
                        landmarks = [[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark]
                        data.append(landmarks)
                        labels_collected.append(label)

    with open(json_path, 'w') as f:
        json.dump({'data': data, 'labels': labels_collected}, f)

json_path = os.path.join(JSON_DIR, 'hand_landmarks_data.json')
save_landmarks_to_json(IMAGES_DIR, json_path)

print("Handlandmarks erfolgreich gespeichert.")


I0000 00:00:1735154397.964649 1342909 gl_context.cc:357] GL version: 2.1 (2.1 INTEL-18.8.16), renderer: Intel(R) Iris(TM) Graphics 6100
W0000 00:00:1735154398.064961 1351743 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1735154398.099230 1351742 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Handlandmarks erfolgreich gespeichert.


In [18]:
with open(JSON_DATA_DIR, 'r') as f:
    dataset = json.load(f)

data = dataset['data']
labels = dataset['labels']

X = np.array(data)
X = X.reshape(X.shape[0], -1)
y = np.array(labels)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Genauigkeit: {accuracy * 100:.2f}%")
print(f"Präzision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")

if not os.path.exists(JOBLIB_SAVE_DIR):
    os.makedirs(JOBLIB_SAVE_DIR)

joblib.dump(model, JOBLIB_SAVE_DIR + 'hand_gesture_model.joblib')


Genauigkeit: 99.94%
Präzision: 99.94%
Recall: 99.94%
F1-Score: 99.94%


['./joblib/hand_gesture_model.joblib']

In [3]:
class GestureStateMachine:
    def __init__(self, config_file):
        with open(config_file, 'r') as file:
            self.transition_table = json.load(file)
        self.state = 'start'
        self.recognized_gestures = []

    def transition(self, gesture):
        if self.state in self.transition_table:
            transitions = self.transition_table[self.state]
            if gesture in transitions:
                action = transitions[gesture]
                if isinstance(action, list):
                    if "clear_sequence" in action:
                        sequence_length = 5
                        self.recognized_gestures = self.recognized_gestures[:-sequence_length]
                        self.recognized_gestures.append(action[0])
                        self.state = action[1]
                    else:
                        self.recognized_gestures[-1] = action[0]
                        self.state = action[1]
                else:  # Wenn nur ein Zustand angegeben ist
                    self.recognized_gestures.append(gesture)
                    self.state = action
            else:
                default_action = transitions.get('default')
                if default_action:
                    self.recognized_gestures[-1] = default_action[0]
                    self.recognized_gestures.append(gesture)
                    self.state = default_action[1]
                else:
                    self.recognized_gestures.append(gesture)
                    self.state = 'start'
        else:
            self.recognized_gestures.append(gesture)
            self.state = 'start'

    def get_recognized_gestures(self):
        return self.recognized_gestures


In [4]:
model = joblib.load(JOBLIB_DATA_DIR)
gesture_state_machine = GestureStateMachine(GESTURE_MACHINE_DIR)
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.5)

recognized_gestures = []  
last_gesture = None 
last_recognition_time = 0  

cap = cv2.VideoCapture(0)
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    chat_width = 300
    chat_frame = np.ones((frame.shape[0], chat_width, 3), dtype=np.uint8) * 220  

    img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(img_rgb)

    if results.multi_hand_landmarks:
        landmarks = []
        x_coords = []
        y_coords = []

        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            for lm in hand_landmarks.landmark:
                landmarks.append([lm.x, lm.y, lm.z])
                x_coords.append(lm.x)
                y_coords.append(lm.y)

        x_min = int(min(x_coords) * frame.shape[1])
        x_max = int(max(x_coords) * frame.shape[1])
        y_min = int(min(y_coords) * frame.shape[0])
        y_max = int(max(y_coords) * frame.shape[0])
        buffer = 20
        x_min = max(0, x_min - buffer)
        x_max = min(frame.shape[1], x_max + buffer)
        y_min = max(0, y_min - buffer)
        y_max = min(frame.shape[0], y_max + buffer)
        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

        landmarks = np.array(landmarks).flatten()[:63].reshape(1, -1)
        gesture = model.predict(landmarks)[0]
        confidence = max(model.predict_proba(landmarks)[0])

        if gesture in double_hand_labels and len(results.multi_hand_landmarks) == 2:
            label = f'{gesture} ({confidence * 100:.2f}%)' if confidence >= 0.5 else 'Not Detected'
        elif gesture not in double_hand_labels:
            label = f'{gesture} ({confidence * 100:.2f}%)' if confidence >= 0.5 else 'Not Detected'
        else:
            label = 'Not Detected'


        current_time = time.time()
        if confidence >= 0.5 and  label != 'Not Detected' and (current_time - last_recognition_time > 1):
            x_min, y_min = 10, 30
            cv2.putText(frame, label, (x_min, y_min), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 0, 0), 2, cv2.LINE_AA)
            if label and (gesture != last_gesture):
                gesture_state_machine.transition(gesture)
                last_gesture = gesture
                last_recognition_time = current_time
    else:
        last_gesture = None

    recognized_gestures = gesture_state_machine.get_recognized_gestures()
    for i, text in enumerate(recognized_gestures[-10:]):
        y_pos = 30 + i * 30
        cv2.putText(chat_frame, text, (10, y_pos), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 2, cv2.LINE_AA)

    combined_frame = np.hstack((frame, chat_frame))
    cv2.imshow('Live Hand Gesture Recognition', combined_frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
cv2.waitKey(1)

I0000 00:00:1735481513.101360  125244 gl_context.cc:357] GL version: 2.1 (2.1 INTEL-18.8.16), renderer: Intel(R) Iris(TM) Graphics 6100
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1735481513.156743  125500 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1735481513.202811  125501 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


-1