# Hoàn Thiện Tạo Dataset LSTM

**Mô tả tổng quát:**   
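- **Tạo Dataset**: Trích xuất keypoints từ video (hỗ trợ 2 tay: 2 tay × 21 landmarks × (x, y) = 84 features/frame, pad 0 nếu thiếu tay), normalize theo bounding box của từng tay, cắt thành các sequence N=30 frames, lưu X/y cho LSTM.  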
- **Cải tiến**: Không che mất vẽ (endpoint/landmarks trước, bbox/label sau).  
- **Chạy:** Chạy từng phần riêng (Phần 1 cho test webcam, Phần 2 cho dataset).  

**Tham số chung:**  
- N_FRAMES: 30.  
- GESTURES: 11 cử chỉ của bạn.  
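- DATA_DIR: '../videotrain/' (video đặt theo subfolder, mỗi cử chỉ một thư mục); OUTPUT_DIR: 'dataset/' (nơi lưu X.npy, y.npy, label_encoder.npy).  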

**Lưu ý:** Chạy Phần 1 trước để test detect. Sau đó Phần 2 để tạo dataset (cần video sẵn).

In [8]:
import os
import cv2
import mediapipe as mp
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import random

# Shared parameters: location of the raw videos, number of frames per
# sequence, and the folder that receives the generated arrays.
DATA_DIR = '../videotrain/'
N_FRAMES = 30
OUTPUT_DIR = 'dataset/'

# One entry per gesture class; each name is also used as a sub-folder name
# under DATA_DIR when the dataset is built.
GESTURES = [
    'clickchuotphai',
    'clickchuottrai',
    'dichuyenchuot',
    'dungchuongtrinh',
    'moapp',
    'phongto',
    'thunho',
    'vuotlen',
    'vuotphai',
    'vuottrai',
    'vuotxuong',
]

# MediaPipe Hands detector, configured for video input with up to two hands.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(
    min_tracking_confidence=0.5,
    min_detection_confidence=0.7,
    max_num_hands=2,  # two-hand support
    static_image_mode=False,
)

# Make sure the output folder exists before anything is saved into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Import & setup hoàn tất! Sẵn sàng cho Phần 1 (Detect) hoặc Phần 2 (Dataset).")

Import & setup hoàn tất! Sẵn sàng cho Phần 1 (Detect) hoặc Phần 2 (Dataset).


##  Tạo Dataset cho LSTM (Hỗ Trợ 2 Tay)

In [9]:
def extract_keypoints_from_frame(frame_rgb, multi_landmarks):
    """Build the fixed-length two-hand feature vector for a single frame.

    Each of the two hand slots contributes its landmarks as (x, y) pairs.
    Coordinates are multiplied by the frame width/height and then expressed
    relative to the hand's own bounding box: centered on the box and divided
    by its width/height. A slot without a hand contributes 42 zeros.

    Args:
        frame_rgb: Image array; only its ``shape`` (height, width, channels)
            is read.
        multi_landmarks: Sequence of detected hands, each exposing a
            ``landmark`` iterable of points with ``x`` and ``y`` attributes,
            or a falsy value when nothing was detected.

    Returns:
        1-D ``np.ndarray`` of 84 values. If fewer values were collected the
        vector is zero-padded to 84 and a warning is printed.
    """
    if not multi_landmarks:
        return np.zeros(84)

    frame_h, frame_w, _ = frame_rgb.shape
    features = []

    # Always fill exactly two slots so the layout is stable across frames.
    for slot in range(2):
        hand = multi_landmarks[slot] if slot < len(multi_landmarks) else None
        if not hand:
            features += [0.0] * 42  # zero block for the missing hand
            continue

        points = [(lm.x * frame_w, lm.y * frame_h) for lm in hand.landmark]
        xs = [px for px, _ in points]
        ys = [py for _, py in points]

        # Bounding box; the extremes are seeded with the frame size and 0,
        # so they take part in the min/max alongside the landmark values.
        left, right = min([frame_w, *xs]), max([0, *xs])
        top, bottom = min([frame_h, *ys]), max([0, *ys])

        # Clamp the extent to at least 1 to avoid dividing by zero.
        span_x = max(right - left, 1)
        span_y = max(bottom - top, 1)
        mid_x = (left + right) / 2
        mid_y = (top + bottom) / 2

        for px, py in points:
            features.append((px - mid_x) / span_x)
            features.append((py - mid_y) / span_y)

    vector = np.array(features)
    # Safety net: keep the length at 84 even if a hand had fewer landmarks.
    if vector.shape[0] != 84:
        print(f"WARNING: Keypoints shape sai: {vector.shape} → Pad to 84")
        vector = np.pad(vector, (0, 84 - vector.shape[0]), 'constant')

    return vector

def process_video(video_path, gesture_label):
    """Extract fixed-length keypoint sequences from one video file.

    Every frame is converted from BGR to RGB, passed to the module-level
    ``hands`` detector, and turned into a feature vector by
    ``extract_keypoints_from_frame``. Consecutive, non-overlapping groups of
    ``N_FRAMES`` vectors become one sequence each. Frames left over at the
    end are kept as a final sequence, zero-padded up to ``N_FRAMES`` rows.

    Args:
        video_path: Path of the video to read.
        gesture_label: Label of the gesture shown in the video. Not used
            inside this function; kept so existing callers keep working.

    Returns:
        List of arrays, one per sequence (shape ``(N_FRAMES, 84)`` when every
        frame yields an 84-value vector). Empty if the video cannot be
        opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Không mở được: {video_path}")
        cap.release()
        return []

    sequences = []
    frame_keypoints = []
    frame_count = 0

    # try/finally guarantees the capture is released even when decoding or
    # hand detection raises part-way through the video.
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(frame_rgb)

            keypoints = extract_keypoints_from_frame(frame_rgb, results.multi_hand_landmarks)
            frame_keypoints.append(keypoints)
            frame_count += 1

            # Progress/debug output every 30 frames.
            if frame_count % 30 == 0:
                print(f"Frame {frame_count}: keypoints shape {keypoints.shape}")

            if len(frame_keypoints) == N_FRAMES:
                try:
                    seq_array = np.array(frame_keypoints)
                    sequences.append(seq_array)
                    print(f"Sequence created: shape {seq_array.shape}")
                except ValueError as e:
                    # Inconsistent per-frame shapes: drop this window and
                    # show the most recent shapes to help debugging.
                    print(f"ERROR tạo sequence: {e} - Skip")
                    print("Last 5 shapes:", [kp.shape for kp in frame_keypoints[-5:]])
                frame_keypoints = []
    finally:
        cap.release()

    # Leftover frames (fewer than N_FRAMES): pad with zero rows at the end.
    if frame_keypoints:
        try:
            padded_frames = np.array(frame_keypoints)
            pad_len = N_FRAMES - len(padded_frames)
            padded = np.zeros((pad_len, 84))
            full_padded = np.vstack([padded_frames, padded])
            sequences.append(full_padded)
            print(f"Padded sequence: shape {full_padded.shape}")
        except ValueError as e:
            print(f"ERROR pad: {e} - Skip")

    print(f"Video {os.path.basename(video_path)}: {len(sequences)} sequences (2 tay, 84 features)")
    return sequences

In [10]:
# Build the dataset: walk DATA_DIR/<gesture>/, turn every video into
# sequences via process_video, then encode the labels and save the arrays.
all_sequences = []
all_labels = []

# Compared against the lower-cased file name so "clip.MP4" is accepted too.
video_extensions = ('.mp4', '.avi', '.mov')

# Debug: show whether the data folder is there and a few of its sub-folders.
print(f"DATA_DIR: {DATA_DIR} (exists? {os.path.exists(DATA_DIR)})")
if os.path.isdir(DATA_DIR):
    subdirs = [d for d in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, d))]
    print(f"Subdirs found: {subdirs[:3]}...")  # first three only

for gesture in GESTURES:
    gesture_dir = os.path.join(DATA_DIR, gesture)
    if not os.path.isdir(gesture_dir):
        print(f"Không tìm thấy: {gesture_dir} - Bỏ qua.")
        continue

    # os.listdir returns entries in arbitrary order; sorting makes the
    # sequence order in X/y reproducible between runs and machines.
    video_files = sorted(
        f for f in os.listdir(gesture_dir) if f.lower().endswith(video_extensions)
    )
    if not video_files:
        print(f"{gesture}: Không có video hợp lệ!")
        continue
    print(f"\nProcessing {gesture} ({len(video_files)} videos)")

    for video_file in video_files:
        video_path = os.path.join(gesture_dir, video_file)
        seqs = process_video(video_path, gesture)
        all_sequences.extend(seqs)
        all_labels.extend([gesture] * len(seqs))

# Nothing extracted: report it instead of saving empty arrays.
if not all_sequences:
    print("ERROR: Không có sequences! Kiểm tra video/path.")
else:
    # Stack into arrays: X holds the sequences, y_str the gesture names.
    X = np.array(all_sequences)
    y_str = np.array(all_labels)

    # Integer-encode the gesture names; classes_ keeps the index -> name map.
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y_str)

    # Save features, encoded labels and the class names for later decoding.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    np.save(os.path.join(OUTPUT_DIR, 'X.npy'), X)
    np.save(os.path.join(OUTPUT_DIR, 'y.npy'), y)
    np.save(os.path.join(OUTPUT_DIR, 'label_encoder.npy'), label_encoder.classes_)

    print(f"\nDataset hoàn thiện! X: {X.shape}, y: {y.shape} (classes: {len(np.unique(y))})")

    # Per-gesture sequence counts, to spot class imbalance at a glance.
    from collections import Counter
    class_counts = Counter(y_str)
    print("Số sequences/gesture:", dict(class_counts))

DATA_DIR: ../videotrain/ (exists? True)
Subdirs found: ['clickchuotphai', 'clickchuottrai', 'dichuyenchuot']...

Processing clickchuotphai (4 videos)
Frame 30: keypoints shape (84,)
Sequence created: shape (30, 84)
Frame 60: keypoints shape (84,)
Sequence created: shape (30, 84)
Frame 90: keypoints shape (84,)
Sequence created: shape (30, 84)
Frame 120: keypoints shape (84,)
Sequence created: shape (30, 84)
Frame 150: keypoints shape (84,)
Sequence created: shape (30, 84)
Frame 180: keypoints shape (84,)
Sequence created: shape (30, 84)
Frame 210: keypoints shape (84,)
Sequence created: shape (30, 84)
Frame 240: keypoints shape (84,)
Sequence created: shape (30, 84)
Frame 270: keypoints shape (84,)
Sequence created: shape (30, 84)
Frame 300: keypoints shape (84,)
Sequence created: shape (30, 84)
Frame 330: keypoints shape (84,)
Sequence created: shape (30, 84)
Frame 360: keypoints shape (84,)
Sequence created: shape (30, 84)
Frame 390: keypoints shape (84,)
Sequence created: shape (30,

## Kiểm Tra 5 Samples Ngẫu Nhiên

In label + shape + mẫu keypoints (frame 0, 5 landmarks đầu của Tay1).

In [11]:
# Reload the saved arrays so this cell can be run on its own.
X, y = (np.load(os.path.join(OUTPUT_DIR, fname)) for fname in ('X.npy', 'y.npy'))
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on files produced by this notebook.
label_encoder = np.load(
    os.path.join(OUTPUT_DIR, 'label_encoder.npy'),
    allow_pickle=True,
)


def decode_label(encoded_y):
    """Look up the class name stored at index ``encoded_y``."""
    return label_encoder[encoded_y]


# Pick up to five distinct sequence indices at random.
num_samples = min(5, len(X))
random_indices = random.sample(range(len(X)), num_samples)

print("=== KIỂM TRA 5 SEQUENCES (2 TAY) ===")
for i, idx in enumerate(random_indices):
    seq = X[idx]
    label_name = decode_label(y[idx])
    # First 10 values of frame 0, viewed as 5 (x, y) pairs of the first hand slot.
    tay1_sample = seq[0, :10].reshape(-1, 2)

    print(f"\nSample {i+1} (Index {idx}):")
    print(f"- Hành động: {label_name}")
    print(f"- Shape: {seq.shape} (N={N_FRAMES}, 84 features = 2 tay)")
    print(f"- Mẫu keypoints frame 0 (Tay1, 5 landmarks đầu x,y):")
    print(tay1_sample)
    print("-" * 50)

=== KIỂM TRA 5 SEQUENCES (2 TAY) ===

Sample 1 (Index 94):
- Hành động: mochorme
- Shape: (30, 84) (N=30, 84 features = 2 tay)
- Mẫu keypoints frame 0 (Tay1, 5 landmarks đầu x,y):
[[ 0.04825818  0.5       ]
 [-0.17812611  0.41873236]
 [-0.37210771  0.29886946]
 [-0.5         0.19645368]
 [-0.41722454  0.1369428 ]]
--------------------------------------------------

Sample 2 (Index 192):
- Hành động: vuotxuong
- Shape: (30, 84) (N=30, 84 features = 2 tay)
- Mẫu keypoints frame 0 (Tay1, 5 landmarks đầu x,y):
[[ 0.25750868 -0.5       ]
 [-0.13951701 -0.43041055]
 [-0.42620288 -0.28607234]
 [-0.5        -0.10372433]
 [-0.35340836  0.01363399]]
--------------------------------------------------

Sample 3 (Index 42):
- Hành động: dichuyenchuot
- Shape: (30, 84) (N=30, 84 features = 2 tay)
- Mẫu keypoints frame 0 (Tay1, 5 landmarks đầu x,y):
[[ 0.5         0.5       ]
 [-0.06236291  0.42621233]
 [-0.40525017  0.30315741]
 [-0.5         0.18442923]
 [-0.26436787  0.16593447]]
-----------------