In [14]:
import cv2
import mediapipe as mp
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
import time
import os

In [15]:
# --- Notebook configuration -------------------------------------------------
# Landmark datasets (JSON), one file per gesture family.
SINGLE_HAND_JSON_PATH = '../json/single_hand_json/single_hand_pose_landmarks_data.json'
DOUBLE_HAND_JSON_PATH = '../json/double_hand_json/double_hand_pose_landmarks_data.json'

# Gesture labels that the live recognition accepts from each model.
# Currently excluded from the single-hand list: 'ich', 'bin', 'S', 'E', 'R', 'T', 'A', 'N'
single_hand_labels = ['hallo', 'i love u', 'peace']
double_hand_labels = ['bachelor1', 'bachelor2']

# Directory for the persisted models, and the model files inside it.
JOBLIB_SAVE_PATH = '../joblib/'
SINGLE_JOBLIB_PATH = JOBLIB_SAVE_PATH + 'single_hand_model.joblib'
DOUBLE_JOBLIB_PATH = JOBLIB_SAVE_PATH + 'double_hand_model.joblib'

In [16]:
# Fail fast if one of the dataset JSON files is missing
for required_path in (SINGLE_HAND_JSON_PATH, DOUBLE_HAND_JSON_PATH):
    if os.path.exists(required_path):
        continue
    raise FileNotFoundError(f"Die Datei {required_path} wurde nicht gefunden.")

# Load a landmark dataset and turn it into feature / label arrays
def load_data(json_path, expected_length):
    """Read a landmark JSON file and return its samples as NumPy arrays.

    The file must contain a top-level ``'data'`` list whose entries each have a
    ``'landmarks'`` sequence and a ``'label'``. Entries whose ``'landmarks'``
    length differs from ``expected_length`` are skipped.

    Args:
        json_path: Path of the JSON file to read.
        expected_length: Number of landmark values a sample must have to be kept.

    Returns:
        A tuple ``(X, y)``: ``X`` is the array of kept landmark vectors and
        ``y`` the array of their labels, in file order.
    """
    with open(json_path, 'r') as handle:
        samples = json.load(handle)['data']

    # Keep only samples with the expected feature-vector length.
    usable = [sample for sample in samples if len(sample['landmarks']) == expected_length]

    features = np.array([sample['landmarks'] for sample in usable])
    labels = np.array([sample['label'] for sample in usable])
    return features, labels

# Seed shared by the train/test split and the classifiers so that the reported
# metrics are reproducible on a fresh kernel.
RANDOM_SEED = 42


def _report_metrics(y_true, y_pred, group_name):
    """Compute and print accuracy, weighted precision/recall and F1 for one model.

    Args:
        y_true: Ground-truth labels of the test set.
        y_pred: Labels predicted by the model for the same samples.
        group_name: Name inserted into the printed lines (e.g. 'Einzelhandzeichen').

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` as floats in [0, 1].
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    print(f"Genauigkeit ({group_name}): {accuracy * 100:.2f}%")
    print(f"Präzision ({group_name}): {precision * 100:.2f}%")
    print(f"Recall ({group_name}): {recall * 100:.2f}%")
    print(f"F1-Score ({group_name}): {f1 * 100:.2f}%")
    return accuracy, precision, recall, f1


# Load the single-hand data: 72 values per sample
# (3 pose landmarks + 21 hand landmarks, 3 coordinates each).
X_single, y_single = load_data(SINGLE_HAND_JSON_PATH, 72)

# Load the double-hand data: 135 values per sample
# (3 pose landmarks + 2 x 21 hand landmarks, 3 coordinates each).
X_double, y_double = load_data(DOUBLE_HAND_JSON_PATH, 135)

# Report how many samples survived the length filter
print(f"Anzahl der Einzelhanddaten: {len(X_single)}")
print(f"Anzahl der Doppelhanddaten: {len(X_double)}")

if X_single.size == 0 or y_single.size == 0:
    raise ValueError("Keine Daten für Einzelhandzeichen gefunden.")
if X_double.size == 0 or y_double.size == 0:
    raise ValueError("Keine Daten für Doppelhandzeichen gefunden.")

# Split into training and test sets
X_train_single, X_test_single, y_train_single, y_test_single = train_test_split(
    X_single, y_single, test_size=0.2, random_state=RANDOM_SEED
)
X_train_double, X_test_double, y_train_double, y_test_double = train_test_split(
    X_double, y_double, test_size=0.2, random_state=RANDOM_SEED
)

# Train the models (seeded: an unseeded forest gives different metrics every run)
model_single = RandomForestClassifier(random_state=RANDOM_SEED)
model_single.fit(X_train_single, y_train_single)

model_double = RandomForestClassifier(random_state=RANDOM_SEED)
model_double.fit(X_train_double, y_train_double)

# Evaluate the models on the held-out test sets
y_pred_single = model_single.predict(X_test_single)
accuracy_single, precision_single, recall_single, f1_single = _report_metrics(
    y_test_single, y_pred_single, 'Einzelhandzeichen'
)

y_pred_double = model_double.predict(X_test_double)
accuracy_double, precision_double, recall_double, f1_double = _report_metrics(
    y_test_double, y_pred_double, 'Doppelhandzeichen'
)

# Persist the models to the same paths the live-recognition cell loads them from.
os.makedirs(JOBLIB_SAVE_PATH, exist_ok=True)
joblib.dump(model_single, SINGLE_JOBLIB_PATH)
joblib.dump(model_double, DOUBLE_JOBLIB_PATH)

Anzahl der Einzelhanddaten: 1800
Anzahl der Doppelhanddaten: 600
Genauigkeit (Einzelhandzeichen): 98.89%
Präzision (Einzelhandzeichen): 98.90%
Recall (Einzelhandzeichen): 98.89%
F1-Score (Einzelhandzeichen): 98.89%
Genauigkeit (Doppelhandzeichen): 93.33%
Präzision (Doppelhandzeichen): 93.37%
Recall (Doppelhandzeichen): 93.33%
F1-Score (Doppelhandzeichen): 93.33%


['../joblib/double_hand_model.joblib']

In [None]:
# Load the persisted models
model_single = joblib.load(SINGLE_JOBLIB_PATH)
model_double = joblib.load(DOUBLE_JOBLIB_PATH)

# MediaPipe solution modules for hands and pose
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils

# Feature layout: 3 pose landmarks followed by 21 landmarks per detected hand,
# each contributing (x, y, z). This gives 9 + 63 = 72 values for one hand and
# 9 + 126 = 135 for two hands, the lengths used when loading the training data.
pose_landmarks = ["LEFT_SHOULDER", "RIGHT_SHOULDER", "NOSE"]
COORDS_PER_LANDMARK = 3
LANDMARKS_PER_HAND = 21
POSE_FEATURE_COUNT = len(pose_landmarks) * COORDS_PER_LANDMARK
HAND_FEATURE_COUNT = LANDMARKS_PER_HAND * COORDS_PER_LANDMARK

MIN_CONFIDENCE = 0.5          # minimum class probability for a gesture to count
RECOGNITION_COOLDOWN_S = 1    # seconds to wait after a gesture was added to the chat
CHAT_WIDTH = 300              # width in pixels of the chat panel next to the camera image
BOX_MARGIN = 20               # pixels added around the hand bounding box


def _collect_landmarks(pose_results, hand_results):
    """Build the feature vector for one frame.

    Returns:
        Tuple ``(features, num_hands, xs, ys)``: the flat feature list, the number
        of detected hands, and the normalized x / y coordinates of all hand
        landmarks (used for the bounding box).
    """
    features = []
    if pose_results.pose_landmarks:
        for name in pose_landmarks:
            coord = pose_results.pose_landmarks.landmark[getattr(mp_pose.PoseLandmark, name)]
            features.extend([coord.x, coord.y, coord.z])
    else:
        # No pose detected: pad the pose part with zeros
        features.extend([0, 0, 0] * len(pose_landmarks))

    xs, ys = [], []
    detected_hands = hand_results.multi_hand_landmarks or []
    for hand in detected_hands:
        for lm in hand.landmark:
            xs.append(lm.x)
            ys.append(lm.y)
            features.extend([lm.x, lm.y, lm.z])
    if not detected_hands:
        # No hand detected: pad with zeros for one hand
        features.extend([0, 0, 0] * LANDMARKS_PER_HAND)
    return features, len(detected_hands), xs, ys


def _draw_hand_box(frame, xs, ys):
    """Draw one green rectangle around all detected hand landmarks, clamped to the frame."""
    height, width = frame.shape[0], frame.shape[1]
    left = max(0, int(min(xs) * width) - BOX_MARGIN)
    right = min(width, int(max(xs) * width) + BOX_MARGIN)
    top = max(0, int(min(ys) * height) - BOX_MARGIN)
    bottom = min(height, int(max(ys) * height) + BOX_MARGIN)
    cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 2)


def _predict_gesture(model, features):
    """Return ``(gesture, confidence)`` of the most probable class for one sample."""
    probabilities = model.predict_proba(features)[0]
    best = int(np.argmax(probabilities))
    return model.classes_[best], probabilities[best]


def _append_gesture(transcript, gesture):
    """Add a recognized gesture to the chat transcript, merging 'ich' follow-ups.

    A 'bin' directly after 'ich' becomes 'ich bin'; any other gesture directly
    after 'ich' turns that 'ich' into 'meine' before the new gesture is appended.
    """
    previous = transcript[-1] if transcript else None
    if gesture == 'ich' and previous != 'ich':
        transcript.append(gesture)
    elif gesture == 'bin' and previous == 'ich':
        transcript[-1] = 'ich bin'
    elif gesture != 'bin' and previous == 'ich':
        transcript[-1] = 'meine'
        transcript.append(gesture)
    else:
        transcript.append(gesture)


recognized_gestures = []
last_gesture = None
last_recognition_time = 0  # time of the last gesture added to the chat

hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.5)
pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5)

# Camera stream
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Kamera konnte nicht geöffnet werden.")

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Grey chat panel shown to the right of the camera image
        chat_frame = np.ones((frame.shape[0], CHAT_WIDTH, 3), dtype=np.uint8) * 220

        # MediaPipe expects RGB input, OpenCV delivers BGR
        img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pose_results = pose.process(img_rgb)
        hand_results = hands.process(img_rgb)

        combined_landmarks, num_hands, x_coords, y_coords = _collect_landmarks(pose_results, hand_results)
        if num_hands > 0:
            _draw_hand_box(frame, x_coords, y_coords)

        # 72 features for one hand, 135 for two hands
        expected_length = POSE_FEATURE_COUNT + num_hands * HAND_FEATURE_COUNT
        if num_hands in (1, 2) and len(combined_landmarks) == expected_length:
            landmarks = np.array(combined_landmarks).reshape(1, -1)

            # Only predict once the cooldown since the last recognition has elapsed
            current_time = time.time()
            if current_time - last_recognition_time > RECOGNITION_COOLDOWN_S:
                # Pick the model by the number of detected hands
                if num_hands == 2:
                    gesture, confidence = _predict_gesture(model_double, landmarks)
                    allowed_labels = double_hand_labels
                else:
                    gesture, confidence = _predict_gesture(model_single, landmarks)
                    allowed_labels = single_hand_labels

                if confidence >= MIN_CONFIDENCE and gesture in allowed_labels:
                    label = f'{gesture} ({confidence * 100:.2f}%)'
                    cv2.putText(frame, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 0, 0), 2, cv2.LINE_AA)
                    # Only a change of gesture adds a chat entry and restarts the cooldown
                    if gesture != last_gesture:
                        _append_gesture(recognized_gestures, gesture)
                        last_gesture = gesture
                        last_recognition_time = current_time
        else:
            # No usable hand in view: allow the same gesture to be recognized again
            last_gesture = None

        # Show the last 10 recognized gestures in the chat panel
        for i, text in enumerate(recognized_gestures[-10:]):
            y_pos = 30 + i * 30
            cv2.putText(chat_frame, text, (10, y_pos), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 2, cv2.LINE_AA)

        # Camera image and chat panel side by side
        combined_frame = np.hstack((frame, chat_frame))
        cv2.imshow('Live Hand Gesture Recognition', combined_frame)

        # Quit with 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release camera, MediaPipe graphs and windows even if the loop raised
    cap.release()
    hands.close()
    pose.close()
    cv2.destroyAllWindows()
    # NOTE(review): extra waitKey kept from the original; presumably lets the GUI
    # process the window-close event on some platforms — confirm.
    cv2.waitKey(1)

I0000 00:00:1731254790.864073 1623704 gl_context.cc:357] GL version: 2.1 (2.1 INTEL-18.8.16), renderer: Intel(R) Iris(TM) Graphics 6100
W0000 00:00:1731254790.933538 1632697 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1731254790.978739 1632697 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1731254791.033744 1623704 gl_context.cc:357] GL version: 2.1 (2.1 INTEL-18.8.16), renderer: Intel(R) Iris(TM) Graphics 6100
W0000 00:00:1731254791.772806 1632703 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1731254791.876990 1632703 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


-1