In [10]:
import cv2
import mediapipe as mp
import numpy as np
import time, os
import json
from tensorflow.keras.utils import to_categorical   
from sklearn.model_selection import train_test_split    
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense         
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau       
def collect_and_train(gesture_name, data_path, model_path, secs_for_action=15):
    """
    사용자로부터 제스처 데이터를 수집하고 기존 제스처 파일들과 함께 모델을 학습합니다.
    """

    actions = []  
    seq_length = 30    

    # MediaPipe hands model
    mp_hands = mp.solutions.hands      
    mp_drawing = mp.solutions.drawing_utils       
    hands = mp_hands.Hands(                     
        max_num_hands=2,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5)

    cap = cv2.VideoCapture(0)

    created_time = int(time.time())             
    os.makedirs(data_path, exist_ok=True)         

    # 새로운 데이터 수집
    while cap.isOpened():
        for idx, action in enumerate([gesture_name]): 
            data = []

            ret, img = cap.read()                
            img = cv2.flip(img, 1)          

            cv2.putText(img, f'Collecting {action.upper()} action...', org=(10, 30), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(255, 255, 255), thickness=2)
            cv2.imshow('img', img)       
            cv2.waitKey(3000)

            start_time = time.time()      

            while time.time() - start_time < secs_for_action:   
                ret, img = cap.read()                               
                img = cv2.flip(img, 1)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)        
                result = hands.process(img)                       
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

                if result.multi_hand_landmarks is not None:      
                    for res in result.multi_hand_landmarks:       
                        joint = np.zeros((21, 4))               
                        for j, lm in enumerate(res.landmark):
                            joint[j] = [lm.x, lm.y, lm.z, lm.visibility]    

                        v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19], :3]      
                        v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], :3]    
                        v = v2 - v1      
                        v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]   

                        angle = np.arccos(np.einsum('nt,nt->n',
                            v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:],
                            v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:]))         

                        angle = np.degrees(angle)         

                        angle_label = np.array([angle], dtype=np.float32)           
                        angle_label = np.append(angle_label, idx)                  

                        d = np.concatenate([joint.flatten(), angle_label])          
                        data.append(d)                                              
                        mp_drawing.draw_landmarks(img, res, mp_hands.HAND_CONNECTIONS)      

                cv2.imshow('img', img)              
                if cv2.waitKey(1) == ord('q'):
                    return

            data = np.array(data)
            print(action, data.shape)
            np.save(os.path.join(data_path, f'raw_{action}_{created_time}'), data)

            full_seq_data = []
            for seq in range(len(data) - seq_length):
                full_seq_data.append(data[seq:seq + seq_length])

            full_seq_data = np.array(full_seq_data)
            print(action, full_seq_data.shape)
            np.save(os.path.join(data_path, f'seq_{action}_{created_time}'), full_seq_data)
        break
    cap.release()
    cv2.destroyAllWindows()

    # 기존 데이터 로드 및 병합
    x_data = []
    y_data = []

    for filename in os.listdir(data_path):
        if filename.startswith('seq_') and filename.endswith('.npy'):
            action_name = filename.split('_')[1]
            if action_name not in actions:
                actions.append(action_name)

            action_index = actions.index(action_name)
            data = np.load(os.path.join(data_path, filename))
            x_data.extend(data[:, :, :-1].tolist())
            y_data.extend([action_index] * len(data))

    x_data = np.array(x_data)
    y_data = np.array(y_data)
    y_data = to_categorical(y_data, num_classes=len(actions))

    x_data = x_data.astype(np.float32)
    y_data = y_data.astype(np.float32)

    x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2, random_state=43)

    model = Sequential([
        LSTM(64, activation='relu', input_shape=x_train.shape[1:3]),
        Dense(32, activation='relu'),
        Dense(len(actions), activation='softmax')
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

    history = model.fit(
        x_train,
        y_train,
        validation_data=(x_val, y_val),
        epochs=200,
        callbacks=[
            ModelCheckpoint(model_path, monitor='val_acc', verbose=1, save_best_only=True, mode='auto'),
            ReduceLROnPlateau(monitor='val_acc', factor=0.5, patience=50, verbose=1, mode='auto')
        ]
    )

    # 모델 학습 후 레이블 저장
    with open('./training/gesture_labels.json', 'w') as f:
        json.dump(actions, f)

if __name__ == "__main__":
    gesture_name = input("학습할 제스처 이름을 입력하세요: ")
    data_path = './training'
    model_path = './models/gesture_model.h5'

    collect_and_train(gesture_name, data_path, model_path)

I0000 00:00:1743649386.737998   47640 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1743649386.740982   97034 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.2.8-1ubuntu1~24.04.1), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
W0000 00:00:1743649386.774063   97024 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1743649386.795430   97029 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


굳 (351, 100)
굳 (321, 30, 100)
Epoch 1/200
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - acc: 0.5363 - loss: 17.6913
Epoch 1: val_acc improved from -inf to 0.98165, saving model to ./models/gesture_model.h5




[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 89ms/step - acc: 0.5401 - loss: 17.4177 - val_acc: 0.9817 - val_loss: 0.0878 - learning_rate: 0.0010
Epoch 2/200
[1m25/31[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 7ms/step - acc: 0.9830 - loss: 0.2299
Epoch 2: val_acc improved from 0.98165 to 1.00000, saving model to ./models/gesture_model.h5




[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - acc: 0.9848 - loss: 0.2028 - val_acc: 1.0000 - val_loss: 3.3580e-06 - learning_rate: 0.0010
Epoch 3/200
[1m26/31[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 7ms/step - acc: 1.0000 - loss: 1.3197e-04
Epoch 3: val_acc did not improve from 1.00000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - acc: 1.0000 - loss: 1.1704e-04 - val_acc: 1.0000 - val_loss: 4.9606e-06 - learning_rate: 0.0010
Epoch 4/200
[1m25/31[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 7ms/step - acc: 1.0000 - loss: 1.8702e-04
Epoch 4: val_acc did not improve from 1.00000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - acc: 1.0000 - loss: 1.5730e-04 - val_acc: 1.0000 - val_loss: 4.4971e-06 - learning_rate: 0.0010
Epoch 5/200
[1m25/31[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 7ms/step - acc: 1.0000 - loss: 4.6971e-05
Epoch 5: val_acc did not improve from 1.0000

In [13]:
import cv2
import mediapipe as mp
import numpy as np
from tensorflow.keras.models import load_model
import json
from PIL import Image, ImageDraw, ImageFont

def put_korean_text(img, text, position, font_size=32, color=(0, 0, 255)):
    """
    OpenCV 이미지에 한글 텍스트를 그리기 위해 PIL을 사용합니다.
    img: OpenCV BGR 이미지
    text: 출력할 텍스트 (한글 포함)
    position: 텍스트 위치 (x, y)
    font_size: 텍스트 크기
    color: 텍스트 색상 (B, G, R)
    """
    # OpenCV BGR 이미지를 RGB로 변환하고 PIL 이미지로 변경
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(img_rgb)
    draw = ImageDraw.Draw(pil_img)
    try:
        # Ubuntu 환경에서 Nanum Gothic 폰트 사용
        font = ImageFont.truetype("/usr/share/fonts/truetype/nanum/NanumGothic.ttf", font_size)
    except IOError:
        print("NanumGothic.ttf not found. Using default font.")
        font = ImageFont.load_default()
    # PIL은 색상을 RGB 순서로 사용합니다.
    draw.text(position, text, font=font, fill=(color[2], color[1], color[0]))
    # PIL 이미지를 다시 OpenCV BGR 이미지로 변환
    img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
    return img

def recognize_gesture(model_path):
    """
    실시간으로 제스처를 인식하고 화면에 표시합니다.
    """
    seq_length = 30

    try:
        model = load_model(model_path)
    except FileNotFoundError:
        print(f"Error: Model file not found at {model_path}")
        return

    # 모델의 레이블(제스처 이름) 가져오기
    try:
        with open('./training/gesture_labels.json', 'r') as f:
            actions = json.load(f)
    except FileNotFoundError:
        print("Error: gesture_labels.json file not found.")
        return

    mp_hands = mp.solutions.hands
    mp_drawing = mp.solutions.drawing_utils
    hands = mp_hands.Hands(
        max_num_hands=2,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5)

    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Error: Could not open camera.")
        return

    seq = []
    action_seq = []

    while cap.isOpened():
        ret, img = cap.read()
        if not ret:
            print("Frame not captured.")
            break

        img = cv2.flip(img, 1)
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        result = hands.process(img_rgb)
        img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)

        if result.multi_hand_landmarks:
            for hand_landmarks in result.multi_hand_landmarks:
                joint = np.zeros((21, 4))
                for j, landmark in enumerate(hand_landmarks.landmark):
                    joint[j] = [landmark.x, landmark.y, landmark.z, landmark.visibility]

                v1 = joint[[0, 1, 2, 3, 0, 5, 6, 7, 0, 9, 10, 11, 0, 13, 14, 15, 0, 17, 18, 19], :3]
                v2 = joint[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], :3]
                v = v2 - v1
                norm = np.linalg.norm(v, axis=1)
                norm[norm == 0] = 1e-6  # 0으로 나누는 경우 방지
                v = v / norm[:, np.newaxis]

                angle = np.arccos(np.clip(np.einsum('nt,nt->n',
                    v[[0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18], :],
                    v[[1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 17, 18, 19], :]), -1.0, 1.0))
                angle = np.degrees(angle)

                d = np.concatenate([joint.flatten(), angle])
                seq.append(d)

                mp_drawing.draw_landmarks(img_bgr, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                if len(seq) < seq_length:
                    continue

                input_data = np.expand_dims(np.array(seq[-seq_length:], dtype=np.float32), axis=0)
                y_pred = model.predict(input_data).squeeze()

                if y_pred.ndim == 0:
                    print("No prediction result.")
                    continue

                i_pred = int(np.argmax(y_pred))
                conf = y_pred[i_pred]

                if conf < 0.9:
                    continue

                predicted_action = actions[i_pred]
                action_seq.append(predicted_action)

                if len(action_seq) < 3:
                    continue

                this_action = '?'
                if action_seq[-1] == action_seq[-2] == action_seq[-3]:
                    this_action = predicted_action

                text = f'{this_action.upper()} ({conf:.2f})'
                img_bgr = put_korean_text(img_bgr, text, (10, 60), font_size=32, color=(0, 0, 255))
        else:
            img_bgr = put_korean_text(img_bgr, "손이 감지되지 않았습니다.", (10, 60), font_size=24, color=(0, 255, 0))

        cv2.imshow('img', img_bgr)
        if cv2.waitKey(30) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    model_path = './models/gesture_model.h5'
    recognize_gesture(model_path)


I0000 00:00:1743649760.944437   47640 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1743649760.950415  118932 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.2.8-1ubuntu1~24.04.1), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
W0000 00:00:1743649760.997593  118922 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1743649761.032179  118925 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 620ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3