In [6]:
import cv2
import mediapipe as mp
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
import time

In [14]:
# Path to the JSON dataset read by the training cell; that cell expects a
# top-level 'data' list whose items carry 'landmarks' and 'label'.
JSON_PATH = '../images/hand_pose_landmarks_data.json'

In [15]:
# Train a random-forest gesture classifier on the landmark dataset, report
# weighted metrics on a held-out 20 % split and persist the fitted model.

N_FEATURES = 72  # expected length of every 'landmarks' vector
SEED = 42        # shared seed so the split, the metrics and the saved model are reproducible
MODEL_PATH = 'hand_gesture_model.joblib'

with open(JSON_PATH, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Pull the sample list out of the JSON document
data = dataset['data']

X = []
y = []
skipped = 0

# Collect feature vectors and labels, dropping malformed entries
for item in data:
    # The combined landmark vector is used as-is
    landmarks = item['landmarks']

    # Every sample must have the same length, otherwise np.array() would not
    # produce a 2-D matrix; count what is dropped instead of hiding it.
    if len(landmarks) != N_FEATURES:
        skipped += 1
        continue

    X.append(landmarks)
    y.append(item['label'])

if skipped:
    print(f"Übersprungene Einträge mit falscher Länge: {skipped} von {len(data)}")

# Fail early with a clear message instead of an obscure error inside sklearn
if not X:
    raise ValueError(
        f"No usable samples in {JSON_PATH!r}: every entry must contain {N_FEATURES} landmark values."
    )

# Convert to NumPy arrays for training
X = np.array(X)
y = np.array(y)

# Sanity check: should be (n_samples, n_features)
print("Shape of X:", X.shape)

# Hold out 20 % of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Seed the forest as well; an unseeded forest gives different metrics on every run
model = RandomForestClassifier(random_state=SEED)
model.fit(X_train, y_train)

# Evaluate on the held-out split (weighted averages across all classes)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Genauigkeit: {accuracy * 100:.2f}%")
print(f"Präzision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")

# Persist the fitted model for the live-recognition cell
joblib.dump(model, MODEL_PATH)

Shape of X: (5400, 72)
Genauigkeit: 99.35%
Präzision: 99.37%
Recall: 99.35%
F1-Score: 99.36%


['hand_gesture_model.joblib']

In [14]:
# Live gesture recognition: grab webcam frames, build the 72-value feature
# vector (3 pose landmarks + the 21 landmarks of the first detected hand),
# classify it and list the recognised words in a grey side panel.
# Press 'q' in the window to quit.

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that were produced by this notebook.
model = joblib.load('hand_gesture_model.joblib')

N_FEATURES = 72             # 3 pose landmarks * 3 coords + 21 hand landmarks * 3 coords
CONFIDENCE_THRESHOLD = 0.5  # minimum class probability for a gesture to be accepted
RECOGNITION_PAUSE_S = 2     # seconds to wait after a recognition before predicting again

# Set up MediaPipe for hands and pose
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.5)
pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5)

# Camera stream
cap = cv2.VideoCapture(0)

# Labels for two-hand and one-hand signs
double_hand_labels = ['bachelor1', 'bachelor2']
single_hand_labels = ['hallo', 'ich', 'bin', 'S', 'E', 'R', 'T', 'A', 'N']
recognized_gestures = []
last_gesture = None
last_recognition_time = 0  # timestamp of the last accepted recognition

# Pose landmarks that go into the feature vector; resolved once, not per frame
pose_landmarks = ["LEFT_SHOULDER", "RIGHT_SHOULDER", "NOSE"]
pose_landmark_ids = [getattr(mp_pose.PoseLandmark, name) for name in pose_landmarks]

chat_width = 300  # width of the side panel in pixels

try:
    if not cap.isOpened():
        print('Could not open camera 0.')

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_height, frame_width = frame.shape[:2]

        # Grey side panel to the right of the camera image
        chat_frame = np.full((frame_height, chat_width, 3), 220, dtype=np.uint8)

        # MediaPipe is fed an RGB copy of the BGR camera frame
        img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pose_results = pose.process(img_rgb)
        hand_results = hands.process(img_rgb)

        # NOTE(review): the vector is built as pose (9 values) followed by hand
        # (63 values). The training-cell comment reads "63 hand + 9 pose";
        # confirm the order matches the data-collection script.
        combined_landmarks = []

        if pose_results.pose_landmarks:
            for landmark_id in pose_landmark_ids:
                coord = pose_results.pose_landmarks.landmark[landmark_id]
                combined_landmarks.extend([coord.x, coord.y, coord.z])
        else:
            # Zero padding keeps the vector length fixed when no pose is found
            combined_landmarks.extend([0, 0, 0] * len(pose_landmarks))

        detected_hands = hand_results.multi_hand_landmarks or []
        has_hand = bool(detected_hands)

        if has_hand:
            # Only the first detected hand contributes features
            first_hand = detected_hands[0].landmark
            for lm in first_hand:
                combined_landmarks.extend([lm.x, lm.y, lm.z])

            # Bounding box around that hand, padded and clamped to the frame
            x_coords = [lm.x for lm in first_hand]
            y_coords = [lm.y for lm in first_hand]
            buffer = 20
            x_min = max(0, int(min(x_coords) * frame_width) - buffer)
            x_max = min(frame_width, int(max(x_coords) * frame_width) + buffer)
            y_min = max(0, int(min(y_coords) * frame_height) - buffer)
            y_max = min(frame_height, int(max(y_coords) * frame_height) + buffer)
            cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

        # Classify only when a hand is actually visible. Previously an all-zero
        # hand vector was fed to the model, and the reset branch below could
        # never run because the vector was always 72 values long.
        if has_hand and len(combined_landmarks) == N_FEATURES:
            current_time = time.time()

            # Predict only once the pause since the last recognition has elapsed
            if current_time - last_recognition_time > RECOGNITION_PAUSE_S:
                landmarks = np.array(combined_landmarks).reshape(1, -1)

                # One model evaluation yields both the class and its probability;
                # the forest's predict() is the argmax of predict_proba().
                probabilities = model.predict_proba(landmarks)[0]
                best_index = int(np.argmax(probabilities))
                gesture = str(model.classes_[best_index])
                confidence = probabilities[best_index]

                # Two visible hands only admit two-hand signs, otherwise one-hand signs
                if len(detected_hands) == 2:
                    allowed_labels = double_hand_labels
                else:
                    allowed_labels = single_hand_labels

                if confidence >= CONFIDENCE_THRESHOLD and gesture in allowed_labels:
                    label = f'{gesture} ({confidence * 100:.2f}%)'
                    cv2.putText(frame, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 0, 0), 2, cv2.LINE_AA)

                    if gesture != last_gesture:
                        previous_is_ich = bool(recognized_gestures) and recognized_gestures[-1] == 'ich'
                        if gesture == 'ich' and not previous_is_ich:
                            recognized_gestures.append(gesture)
                        elif gesture == 'bin' and previous_is_ich:
                            # 'ich' directly followed by 'bin' is merged into one entry
                            recognized_gestures[-1] = 'ich bin'
                        elif gesture != 'bin' and previous_is_ich:
                            # 'ich' followed by any other sign is rewritten to 'meine'
                            recognized_gestures[-1] = 'meine'
                            recognized_gestures.append(gesture)
                        else:
                            recognized_gestures.append(gesture)
                        last_gesture = gesture
                        last_recognition_time = current_time  # start the pause
        else:
            # No hand in view: forget the last gesture so it can be signed again
            last_gesture = None

        # Show only the ten most recent entries in the side panel
        for i, text in enumerate(recognized_gestures[-10:]):
            y_pos = 30 + i * 30
            cv2.putText(chat_frame, text, (10, y_pos), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 2, cv2.LINE_AA)

        # Camera image and side panel next to each other
        combined_frame = np.hstack((frame, chat_frame))
        cv2.imshow('Live Hand Gesture Recognition', combined_frame)

        # Quit with 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the camera, the MediaPipe graphs and the window, even if
    # the loop raised or the kernel was interrupted.
    cap.release()
    hands.close()
    pose.close()
    cv2.destroyAllWindows()
    # Presumably gives the GUI event loop one more tick so the window really
    # closes (kept from the original cell).
    cv2.waitKey(1)

I0000 00:00:1731068230.273095 1213938 gl_context.cc:357] GL version: 2.1 (2.1 INTEL-18.8.16), renderer: Intel(R) Iris(TM) Graphics 6100
W0000 00:00:1731068230.344782 1365299 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1731068230.358310 1213938 gl_context.cc:357] GL version: 2.1 (2.1 INTEL-18.8.16), renderer: Intel(R) Iris(TM) Graphics 6100
W0000 00:00:1731068230.417361 1365297 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1731068231.273908 1365303 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1731068231.343381 1365301 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


-1