In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import cv2
import mediapipe as mp



In [2]:


# ------------------ Dataset ------------------
class HandSignDataset(Dataset):
    def __init__(self, root_dir, num_frames=60):
        self.samples = []
        self.labels = []
        self.label_names = sorted(os.listdir(root_dir))
        self.num_frames = num_frames

        for label_idx, label in enumerate(self.label_names):
            label_folder = os.path.join(root_dir, label)
            if not os.path.isdir(label_folder):
                continue
            for file in os.listdir(label_folder):
                if file.endswith('.npy'):
                    path = os.path.join(label_folder, file)
                    self.samples.append(path)
                    self.labels.append(label_idx)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        data = np.load(self.samples[idx])

        # Pad hoặc cắt số frame
        if data.shape[0] < self.num_frames:
            pad = np.zeros((self.num_frames - data.shape[0], 2, 21, 2))
            data = np.concatenate([data, pad], axis=0)
        else:
            data = data[:self.num_frames]

        data = data.reshape(self.num_frames, -1)  # (60, 84)
        label = self.labels[idx]
        return torch.tensor(data, dtype=torch.float32), torch.tensor(label)


In [3]:
class HandSignTransformer(nn.Module):
    def __init__(self, input_size=84, num_classes=25, hidden_dim=128, num_heads=4, num_layers=2):
        super().__init__()
        self.input_proj = nn.Linear(input_size, hidden_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, dropout=0.2, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.input_proj(x)  # (B, seq, hidden)
        x = self.transformer(x)
        out = x.mean(dim=1)     # average pooling theo thời gian
        return self.fc(out)


In [4]:
# ------------------ Train function ------------------
def train_transformer(train_dir, val_dir, epochs=20, batch_size=8, lr=1e-3):
    # Dataset
    train_dataset = HandSignDataset(train_dir)
    val_dataset = HandSignDataset(val_dir)

    num_classes = len(train_dataset.label_names)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Model (bạn cần định nghĩa HandSignTransformer trước)
    model = HandSignTransformer(num_classes=num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr)

    # Train loop
    for epoch in range(epochs):
        # ---- TRAIN ----
        model.train()
        total_loss, correct, total = 0, 0, 0
        for x, y in train_loader:
            optimizer.zero_grad()
            out = model(x)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            correct += (out.argmax(1) == y).sum().item()
            total += y.size(0)
        train_acc = 100 * correct / total

        # ---- VALIDATION ----
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        with torch.no_grad():
            for x, y in val_loader:
                out = model(x)
                loss = criterion(out, y)
                val_loss += loss.item()
                val_correct += (out.argmax(1) == y).sum().item()
                val_total += y.size(0)
        val_acc = 100 * val_correct / val_total

        print(f"Epoch [{epoch+1}/{epochs}] "
              f"| Train Loss: {total_loss:.4f} | Train Acc: {train_acc:.2f}% "
              f"| Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

    # Lưu model
    torch.save({
        'model_state': model.state_dict(),
        'labels': train_dataset.label_names
    }, "hand_transformer.pth")
    print(" Model saved as hand_transformer.pth")


In [5]:
import cv2
import numpy as np
import mediapipe as mp
import torch

mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
mp_drawing_pose = mp.solutions.drawing_utils


# --- Hàm lấy cổ và khoảng cách vai để chuẩn hoá ---
def get_neck_point(frame):
    pose = mp_pose.Pose(static_image_mode=False,
                        min_detection_confidence=0.6,
                        min_tracking_confidence=0.5)

    image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = pose.process(image)
    pose.close()

    if not results.pose_landmarks:
        return np.array([0, 0]), 1.0

    keypoints = np.array([[lm.x, lm.y, lm.z] for lm in results.pose_landmarks.landmark])
    shoulder_right = keypoints[11][:2]
    shoulder_left = keypoints[12][:2]
    neck = (shoulder_left + shoulder_right) / 2
    shoulder_dist = np.linalg.norm(shoulder_left - shoulder_right)
    return neck, shoulder_dist if shoulder_dist > 0 else 1.0


# --- Chuẩn hoá keypoint ---
def normalize_keypoints(keypoint, neck_point, shoulder_dist):
    normalized = np.zeros_like(keypoint)
    for i in range(2):
        hand = keypoint[i]
        if np.all(hand == 0):
            continue
        rel = hand - neck_point
        normalized[i] = rel / shoulder_dist
    return normalized


# --- Hàm realtime ---
def test_realtime(model_path="hand_transformer.pth", seq_length=30, conf_thresh=0.8):
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
    labels = checkpoint['labels']

    model = HandSignTransformer(num_classes=len(labels))
    model.load_state_dict(checkpoint['model_state'])
    model.eval()

    hands = mp_hands.Hands(max_num_hands=2,
                           min_detection_confidence=0.5,
                           min_tracking_confidence=0.5)

    cap = cv2.VideoCapture(0)
    sequence = []
    last_label = "..."

    print("Bắt đầu nhận diện realtime (nhấn Q để thoát)")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # --- Tính cổ và vai từ frame ---
        neck_point, shoulder_dist = get_neck_point(frame)
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)

        # --- Trích xuất keypoints bàn tay ---
        keypoints = np.zeros((2, 21, 2))
        if results.multi_hand_landmarks:
            for i, hand in enumerate(results.multi_hand_landmarks[:2]):
                for j, lm in enumerate(hand.landmark):
                    keypoints[i, j] = [lm.x, lm.y]
                mp.solutions.drawing_utils.draw_landmarks(frame, hand, mp_hands.HAND_CONNECTIONS)

        # --- Chuẩn hoá ---
        normalized_kp = normalize_keypoints(keypoints, neck_point, shoulder_dist)
        sequence.append(normalized_kp)
        if len(sequence) > seq_length:
            sequence.pop(0)

        # --- Nếu đủ frame thì predict ---
        if len(sequence) == seq_length:
            data = np.array(sequence).reshape(seq_length, -1)

            # Bỏ qua nếu tay không có (toàn 0)
            if np.sum(np.abs(data)) < 1e-4:
                label = "No gesture"
            else:
                x = torch.tensor(data, dtype=torch.float32).unsqueeze(0)
                with torch.no_grad():
                    preds = torch.softmax(model(x), dim=1)
                    conf, pred_idx = torch.max(preds, dim=1)
                    if conf.item() < conf_thresh:
                        label = "No gesture"
                    else:
                        label = labels[pred_idx.item()]
                        last_label = label

            cv2.putText(frame, f"{label}", (20, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 3)

        cv2.imshow("Hand Sign Transformer", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()



In [7]:
# Train
train_transformer('D:/Semester/Semester5/DPL302/Project/data_split/train','D:/Semester/Semester5/DPL302/Project/data_split/val', epochs=20, batch_size=32)



Epoch [1/20] | Train Loss: 77.3010 | Train Acc: 50.74% | Val Loss: 8.4710 | Val Acc: 80.62%
Epoch [2/20] | Train Loss: 25.9698 | Train Acc: 86.94% | Val Loss: 3.3801 | Val Acc: 91.25%
Epoch [3/20] | Train Loss: 12.6706 | Train Acc: 92.26% | Val Loss: 2.0563 | Val Acc: 95.94%
Epoch [4/20] | Train Loss: 7.8120 | Train Acc: 96.01% | Val Loss: 1.7609 | Val Acc: 95.94%
Epoch [5/20] | Train Loss: 6.3665 | Train Acc: 96.79% | Val Loss: 1.4172 | Val Acc: 96.25%
Epoch [6/20] | Train Loss: 3.1784 | Train Acc: 98.83% | Val Loss: 1.0355 | Val Acc: 97.81%
Epoch [7/20] | Train Loss: 4.7026 | Train Acc: 96.33% | Val Loss: 1.6938 | Val Acc: 95.62%
Epoch [8/20] | Train Loss: 4.8708 | Train Acc: 97.19% | Val Loss: 1.0841 | Val Acc: 96.56%
Epoch [9/20] | Train Loss: 3.7539 | Train Acc: 97.65% | Val Loss: 0.7073 | Val Acc: 98.75%
Epoch [10/20] | Train Loss: 2.6301 | Train Acc: 98.67% | Val Loss: 0.8229 | Val Acc: 97.19%
Epoch [11/20] | Train Loss: 2.6782 | Train Acc: 98.20% | Val Loss: 0.8124 | Val Acc: 9

In [8]:
# Test real-time
test_realtime("hand_transformer.pth")

Bắt đầu nhận diện realtime (nhấn Q để thoát)
