In [1]:
import sys
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
from mediapipe.framework.formats import landmark_pb2
from dataclasses import dataclass, field
from typing import List, Optional
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from glob import glob
import random
from tensorflow.keras.callbacks import Callback 
import json
import torch
import torch.nn as nn
from dataclasses import dataclass
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import BatchNormalization, Dropout, LeakyReLU
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau
from concurrent.futures import ProcessPoolExecutor, as_completed
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic

In [2]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results
# 定义需要显示的特定面部关键点编号
selected_indices = [
    419, 290, 303, 242, 56, 155, 221, 226, 387, 362, 385, 310, 295, 340, 0, 37, 39, 40, 178, 146, 90, 72, 
    448, 380, 274, 398, 87, 98, 64, 324, 222, 1, 13, 22, 159, 145, 157, 89, 
    312, 462, 259, 63, 66, 112, 461, 463, 348, 62, 308, 119, 269, 78, 16, 65, 144, 163, 
    384, 229, 84, 321, 325, 466, 403, 182, 232, 219, 141, 249, 196, 320, 95, 
    304, 77, 272, 224, 239, 268, 316, 405, 86, 186, 
    63, 296, 334, 53, 195, 66, 107, 52, 65
]

# 定义数据类来存储每个关键点的坐标
@dataclass
class Landmark:
    x: float
    y: float
    z: float

# 定义 CustomResults 数据类
@dataclass
class CustomResults:
    face_landmarks: Optional[landmark_pb2.NormalizedLandmarkList] = None
    pose_landmarks: Optional[landmark_pb2.NormalizedLandmarkList] = None
    left_hand_landmarks: Optional[landmark_pb2.NormalizedLandmarkList] = None
    right_hand_landmarks: Optional[landmark_pb2.NormalizedLandmarkList] = None

# 提取并过滤面部关键点并生成 NormalizedLandmarkList
def create_filtered_face_landmarks(landmarks, indices):
    if not landmarks:
        return None
    filtered_landmarks = [landmarks[idx] for idx in indices]
    return landmark_pb2.NormalizedLandmarkList(landmark=filtered_landmarks)
# 使用标准的 MediaPipe 绘图方式来绘制关键点（无连接线）
def draw_landmarks(image, custom_results):
    h, w, _ = image.shape
    
    # 绘制面部关键点（不包含连接线）
    if custom_results.face_landmarks:
        for idx, landmark in zip(selected_indices, custom_results.face_landmarks.landmark):
            x, y = int(landmark.x * w), int(landmark.y * h)
            cv2.circle(image, (x, y), 2, (0, 255, 0), -1)  
            cv2.putText(image, str(idx), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 255, 0), 1, cv2.LINE_AA)

    # 绘制姿势关键点
    if custom_results.pose_landmarks:
        mp_drawing.draw_landmarks(
            image, custom_results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
            mp_drawing.DrawingSpec(color=(88, 22, 88), thickness=2, circle_radius=4),
            mp_drawing.DrawingSpec(color=(66, 22, 66), thickness=2, circle_radius=2)
        )

    # 绘制左手关键点
    if custom_results.left_hand_landmarks:
        mp_drawing.draw_landmarks(
            image, custom_results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
            mp_drawing.DrawingSpec(color=(100, 22, 200), thickness=2, circle_radius=4),
            mp_drawing.DrawingSpec(color=(100, 22, 200), thickness=2, circle_radius=2)
        )

    # 绘制右手关键点
    if custom_results.right_hand_landmarks:
        mp_drawing.draw_landmarks(
            image, custom_results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
            mp_drawing.DrawingSpec(color=(0, 90, 0), thickness=2, circle_radius=4),
            mp_drawing.DrawingSpec(color=(0, 90, 0), thickness=2, circle_radius=2)
        )

def extract_keypoints(custom_results):
    face = np.array([[lm.x, lm.y, lm.z] for lm in custom_results.face_landmarks.landmark]).flatten() \
        if custom_results.face_landmarks else np.full(90 * 3, np.nan)
        
    pose = np.array([[lm.x, lm.y, lm.z, lm.visibility] for lm in custom_results.pose_landmarks.landmark]).flatten() \
        if custom_results.pose_landmarks else np.full(33 * 4, np.nan)
        
    lh = np.array([[lm.x, lm.y, lm.z] for lm in custom_results.left_hand_landmarks.landmark]).flatten() \
        if custom_results.left_hand_landmarks else np.full(21 * 3, np.nan)
        
    rh = np.array([[lm.x, lm.y, lm.z] for lm in custom_results.right_hand_landmarks.landmark]).flatten() \
        if custom_results.right_hand_landmarks else np.full(21 * 3, np.nan)
    
    # 合并所有关键点
    keypoints = np.concatenate([face, pose, lh, rh])
    return keypoints


In [3]:
# 定义词语和标签映射
words = ['hello',  'I or me', 'father', 'mother','see u later']
label_map = {label: num for num, label in enumerate(words)}
DATA_PATH = 'I:/Ece496/custom_data'
sequence_length = int(30 * 3.5)  # 每个视频的帧数



In [44]:
# 初始化路径
DATA_PATH = "I:\\Ece496\\custom_data"
PREPROCESS_PATH = os.path.join(DATA_PATH, "preprocess")
PROCESSED_VIDEOS_PATH = os.path.join(PREPROCESS_PATH, "processed_videos.txt")
SEQUENCE_COUNTER_PATH = os.path.join(PREPROCESS_PATH, "sequence_counter.json")

# 创建 preprocess 目录
if not os.path.exists(PREPROCESS_PATH):
    os.makedirs(PREPROCESS_PATH)

# 加载之前保存的关键点数据
if os.path.exists(os.path.join(PREPROCESS_PATH, "X.npy")) and os.path.exists(os.path.join(PREPROCESS_PATH, "y.npy")):
    X = np.load(os.path.join(PREPROCESS_PATH, "X.npy"), allow_pickle=True)
    y = np.load(os.path.join(PREPROCESS_PATH, "y.npy"), allow_pickle=True)
    sequences = list(X)  # 转为列表，以便后续可以追加新数据
    labels = list(y)  # 转为列表
else:
    sequences = []
    labels = []

# 加载已处理的视频路径
if os.path.exists(PROCESSED_VIDEOS_PATH):
    with open(PROCESSED_VIDEOS_PATH, 'r') as f:
        processed_videos = set(line.strip() for line in f)
else:
    processed_videos = set()

# 加载 sequence_counter 状态
if os.path.exists(SEQUENCE_COUNTER_PATH):
    with open(SEQUENCE_COUNTER_PATH, 'r') as f:
        sequence_counter = json.load(f)
else:
    sequence_counter = {word: 0 for word in words}  # 初始化计数器

for word in words:
    word_path = os.path.join(DATA_PATH, word)
    video_files = glob(os.path.join(word_path, f"{word}_*.mp4"))

    for video_file in video_files:
        if video_file in processed_videos:
            print(f"Skipping already processed video: {video_file}")
            continue

        # 每次处理一个新的文件，就增加该 word 的计数
        sequence_counter[word] += 1
        sequence = sequence_counter[word]  # 获取当前 word 的 sequence 编号

        # 打印当前处理的视频文件、序列编号和标签
        print(f"Processing video: {video_file}, sequence: {sequence}, label: {label_map[word]}")

        # 处理视频文件
        cap = cv2.VideoCapture(video_file)
        if not cap.isOpened():
            print(f"Cannot open video file: {video_file}")
            continue

        window = []

        with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            frame_num = 0
            while frame_num < sequence_length:
                ret, frame = cap.read()
                if not ret:
                    print(f"Failed to read frame from {video_file}")
                    break

                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = holistic.process(frame_rgb)

                custom_results = CustomResults(
                    face_landmarks=create_filtered_face_landmarks(results.face_landmarks.landmark, selected_indices) if results.face_landmarks else None,
                    pose_landmarks=results.pose_landmarks,
                    left_hand_landmarks=results.left_hand_landmarks,
                    right_hand_landmarks=results.right_hand_landmarks
                )

                keypoints = extract_keypoints(custom_results)
                window.append(keypoints)

                frame_num += 1

        cap.release()

        if len(window) == sequence_length:
            sequences.append(window)
            labels.append(label_map[word])
            processed_videos.add(video_file)
            print(f"Added sequence for word '{word}', sequence {sequence}, label: {label_map[word]}")

            # 每次成功处理一个视频后，保存 sequence_counter 状态
            with open(SEQUENCE_COUNTER_PATH, 'w') as f:
                json.dump(sequence_counter, f)

# 保存处理过的视频路径到 txt 文件
with open(PROCESSED_VIDEOS_PATH, 'w') as f:
    for video_path in processed_videos:
        f.write(f"{video_path}\n")

# 转换 `sequences` 和 `labels` 为 NumPy 数组后保存
X = np.array(sequences)
y = np.array(labels)
np.save(os.path.join(PREPROCESS_PATH, "X.npy"), X)
np.save(os.path.join(PREPROCESS_PATH, "y.npy"), y)

print("Total sequences:", len(sequences))
print("X 的形状:", X.shape)
print("y 的形状:", y.shape)


Skipping already processed video: I:\Ece496\custom_data\hello\hello_1_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_2_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_3_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_4_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_5_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_7_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_8_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_10_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_11_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_12_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_13_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_14_Bill.mp4
Skipping already processed video: I:\Ece496\cus

In [5]:
# 假设 label_map 是一个字典，形如 {"hello": 0, "goodbye": 1, ...}
# 创建反向映射：从 label 到单词
reverse_label_map = {v: k for k, v in label_map.items()}

# 打印每个 label 对应的单词
print("Label 对应的单词如下:")
for label, word in reverse_label_map.items():
    print(f"Label {label}: {word}")


Label 对应的单词如下:
Label 0: hello
Label 1: I or me
Label 2: father
Label 3: mother
Label 4: see u later


In [6]:
y = to_categorical(labels, num_classes=len(words)).astype(int)
y

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1]])

In [7]:
print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (510, 105, 528)
y shape: (510, 5)


In [77]:
# 划分数据集
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


# 确认分割后的数据形状
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)
print("y_test shape:", y_test.shape)


X_train shape: (408, 105, 528)
X_val shape: (51, 105, 528)
X_test shape: (51, 105, 528)
y_train shape: (408,)
y_val shape: (51,)
y_test shape: (51,)


In [88]:
from tensorflow.keras.layers import TimeDistributed, Flatten

# 定义 LSTM 模型
model_with_velocity = Sequential()

# 展平多维特征到单个维度，使其成为 (batch_size, time_steps, features)
model_with_velocity.add(TimeDistributed(Flatten(), input_shape=(105, 176, 9)))

# 第一层 LSTM
model_with_velocity.add(LSTM(128, return_sequences=True, activation='tanh'))
model_with_velocity.add(Dropout(0.2))

# 第二层 LSTM
model_with_velocity.add(LSTM(64, return_sequences=True, activation='tanh'))
model_with_velocity.add(Dropout(0.2))

# 第三层 LSTM
model_with_velocity.add(LSTM(32, activation='tanh'))

# 全连接层
model_with_velocity.add(Dense(64))
model_with_velocity.add(LeakyReLU(alpha=0.1))
model_with_velocity.add(BatchNormalization())

model_with_velocity.add(Dense(32))
model_with_velocity.add(LeakyReLU(alpha=0.1))
model_with_velocity.add(BatchNormalization())

# 输出层
model_with_velocity.add(Dense(len(words), activation='softmax'))

# 编译模型
model_with_velocity.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 查看模型结构
model_with_velocity.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_distributed (TimeDistr  (None, 105, 1584)        0         
 ibuted)                                                         
                                                                 
 lstm_10 (LSTM)              (None, 105, 128)          877056    
                                                                 
 dropout_6 (Dropout)         (None, 105, 128)          0         
                                                                 
 lstm_11 (LSTM)              (None, 105, 64)           49408     
                                                                 
 dropout_7 (Dropout)         (None, 105, 64)           0         
                                                                 
 lstm_12 (LSTM)              (None, 32)                12416     
                                                      

In [57]:
log_dir_with_velocity = os.path.join('Logs', 'with_velocity')
tb_callback_with_velocity = TensorBoard(log_dir=log_dir_with_velocity)

# 定义 EarlyStopping 回调：当验证损失在 5 个 epoch 后没有改善时停止训练
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

# 定义 ReduceLROnPlateau 回调：如果验证损失连续 3 个 epoch 无改善则降低学习率
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# 将回调函数放入列表中
callbacks = [tb_callback_with_velocity , early_stopping, reduce_lr]

In [74]:
from asl_dataset import ASLDataset, compute_mean_std, apply_random_augmentations

# 如果增强函数需要单独使用，也可以这样导入
from asl_dataset import temporal_resample, flip_keypoints, random_keypoint_dropout


In [78]:
import torch
import torch.nn as nn
import numpy as np
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.model_selection import train_test_split


def ensure_sequence_shape(sequence):
    """
    确保输入的 sequence 形状为 (seq_len, n_keypoints, 3)。
    如果输入是 2D，将其重塑为 3D。
    """
    if sequence.dim() == 2 and sequence.shape[1] % 3 == 0:
        n_keypoints = sequence.shape[1] // 3
        sequence = sequence.view(sequence.shape[0], n_keypoints, 3)
    elif sequence.dim() != 3:
        raise ValueError(f"Expected sequence to have shape (seq_len, n_keypoints, 3), but got {sequence.shape}")
    return sequence
def compute_mean_std(X_train):
    """
    计算数据集 X_train 的全局均值和标准差。
    
    Args:
        X_train (list or np.ndarray): 包含所有训练样本的列表或数组。
    
    Returns:
        tuple: global_mean, global_std
    """
    all_keypoints = np.concatenate([sample for sample in X_train], axis=0)  # 将所有样本拼接在一起
    mean = np.nanmean(all_keypoints, axis=0)  # 忽略 NaN 计算均值
    std = np.nanstd(all_keypoints, axis=0)    # 忽略 NaN 计算标准差
    return mean, std
def temporal_resample(sequence, target_length=105, min_scale=0.5, max_scale=1.5):
    sequence = ensure_sequence_shape(sequence)  # 确保序列形状为 (seq_len, n_keypoints, 3)
    scale = np.random.uniform(min_scale, max_scale)
    new_length = int(sequence.shape[0] * scale)

    # 调整为 3D 输入
    n_keypoints = sequence.shape[1]
    sequence = sequence.permute(1, 0, 2).contiguous().view(-1, sequence.shape[0])  # (n_keypoints * 3, seq_len)
    
    # 重采样到 new_length
    resampled_sequence = F.interpolate(sequence.unsqueeze(0), size=new_length, mode='linear', align_corners=False).squeeze(0)
    
    # 确保重塑大小匹配，并恢复 (new_length, n_keypoints, 3) 的形状
    resampled_sequence = resampled_sequence.view(n_keypoints, new_length, 3).permute(1, 0, 2)

    # 最终调整到 target_length
    final_sequence = F.interpolate(resampled_sequence.permute(1, 2, 0).contiguous().view(-1, resampled_sequence.shape[0]).unsqueeze(0),
                                   size=target_length, mode='linear', align_corners=False).squeeze(0)
    
    # 恢复形状 (target_length, n_keypoints, 3)
    final_sequence = final_sequence.view(n_keypoints, target_length, 3).permute(1, 0, 2)
    return final_sequence

def windowed_cutmix(sequence1, sequence2, target_length=105):
    sequence1, sequence2 = ensure_sequence_shape(sequence1), ensure_sequence_shape(sequence2)
    cut_ratio = np.random.rand()
    cut_point1 = int(sequence1.shape[0] * cut_ratio)
    cut_point2 = int(sequence2.shape[0] * cut_ratio)
    mixed_sequence = torch.cat((sequence1[:cut_point1], sequence2[cut_point2:]), dim=0)

    # 调整为 3D 输入，应用 interpolate
    mixed_sequence = mixed_sequence.permute(1, 0, 2).contiguous()  # (n_keypoints, seq_len, 3)
    mixed_sequence = mixed_sequence.view(-1, mixed_sequence.shape[1])  # (n_keypoints * 3, seq_len)
    final_sequence = F.interpolate(mixed_sequence.unsqueeze(0), size=target_length, mode='linear', align_corners=False).squeeze(0)
    final_sequence = final_sequence.view(sequence1.shape[1], target_length, 3).permute(1, 0, 2)  # (target_length, n_keypoints, 3)
    return final_sequence
def temporal_shift(sequence, target_length=105, max_shift=10):
    sequence = ensure_sequence_shape(sequence)  # 确保形状为 (seq_len, n_keypoints, 3)
    shift = np.random.randint(-max_shift, max_shift)
    shifted_sequence = torch.roll(sequence, shifts=shift, dims=0)

    # 最终调整为 target_length
    seq_len = shifted_sequence.shape[0]
    if seq_len > target_length:
        shifted_sequence = F.interpolate(shifted_sequence.permute(1, 2, 0).unsqueeze(0), size=target_length, mode='linear', align_corners=False)
        shifted_sequence = shifted_sequence.squeeze(0).permute(2, 0, 1)
    elif seq_len < target_length:
        padding = torch.zeros((target_length - seq_len, shifted_sequence.shape[1], shifted_sequence.shape[2]), dtype=shifted_sequence.dtype)
        shifted_sequence = torch.cat((shifted_sequence, padding), dim=0)
    
    return shifted_sequence

def random_keypoint_dropout(sequence, num_points_to_drop=6, num_time_windows=3):
    sequence = ensure_sequence_shape(sequence)  # 确保形状为 (seq_len, n_keypoints, 3)
    seq_len, num_keypoints, _ = sequence.shape
    for _ in range(num_time_windows):
        start = np.random.randint(0, seq_len)
        end = min(seq_len, start + np.random.randint(1, seq_len // num_time_windows))
        drop_indices = np.random.choice(num_keypoints, num_points_to_drop, replace=False)
        sequence[start:end, drop_indices, :] = 0  # 将选定关键点置为 0
    return sequence

def spatial_mask(sequence, mask_prob=0.3, max_points=10):
    sequence = ensure_sequence_shape(sequence)  # 确保形状为 (seq_len, n_keypoints, 3)
    if np.random.rand() < mask_prob:
        num_keypoints = sequence.shape[1]
        mask_points = np.random.choice(num_keypoints, max_points, replace=False)
        sequence[:, mask_points, :] = 0  # 空间遮挡，将选定的关键点置为 0
    return sequence

def temporal_mask(sequence, mask_prob=0.3, max_mask_len=10):
    sequence = ensure_sequence_shape(sequence)  # 确保形状为 (seq_len, n_keypoints, 3)
    if np.random.rand() < mask_prob:
        seq_len = sequence.shape[0]
        mask_len = np.random.randint(1, max_mask_len)
        start = np.random.randint(0, seq_len - mask_len)
        sequence[start:start + mask_len, :, :] = 0  # 时间遮挡，将选定时间窗口内的关键点置为 0
    return sequence

def drop_face_or_pose(sequence, drop_face_prob=0.2, drop_pose_prob=0.2, face_indices=None, pose_indices=None):
    sequence = ensure_sequence_shape(sequence)  # 确保形状为 (seq_len, n_keypoints, 3)
    if np.random.rand() < drop_face_prob and face_indices is not None:
        sequence[:, face_indices, :] = 0  # 面部关键点置为 0
    if np.random.rand() < drop_pose_prob and pose_indices is not None:
        sequence[:, pose_indices, :] = 0  # 姿态关键点置为 0
    return sequence

def drop_hand_keypoints(sequence, drop_hand_prob=0.05, left_hand_indices=None, right_hand_indices=None):
    sequence = ensure_sequence_shape(sequence)  # 确保形状为 (seq_len, n_keypoints, 3)
    if np.random.rand() < drop_hand_prob:
        if left_hand_indices is not None:
            sequence[:, left_hand_indices, :] = 0  # 左手关键点置为 0
        if right_hand_indices is not None:
            sequence[:, right_hand_indices, :] = 0  # 右手关键点置为 0
    return sequence

def flip_keypoints(sequence, left_hand_indices, right_hand_indices):
    """
    仅对左右手的关键点进行翻转。

    Args:
        sequence (torch.Tensor): 形状为 (seq_len, n_keypoints, 3) 的关键点序列。
        left_hand_indices (list): 左手关键点索引。
        right_hand_indices (list): 右手关键点索引。
    
    Returns:
        torch.Tensor: 翻转后的关键点序列。
    """
    sequence = ensure_sequence_shape(sequence)  # 确保形状为 (seq_len, n_keypoints, 3)

    # 翻转 X 坐标
    sequence[:, :, 0] = -sequence[:, :, 0]

    # 创建副本，用于交换左右手关键点
    flipped_sequence = sequence.clone()
    for l_idx, r_idx in zip(left_hand_indices, right_hand_indices):
        flipped_sequence[:, r_idx, :] = sequence[:, l_idx, :]  # 左手 → 右手
        flipped_sequence[:, l_idx, :] = sequence[:, r_idx, :]  # 右手 → 左手

    return flipped_sequence
def compute_velocity_and_acceleration(sequence):
    """
    计算给定序列的速度和加速度。
    Args:
        sequence (torch.Tensor): 形状为 (seq_len, n_keypoints, 3) 的位置序列。
    Returns:
        torch.Tensor, torch.Tensor: 分别返回速度和加速度，形状均为 (seq_len, n_keypoints, 3)。
    """
    sequence = ensure_sequence_shape(sequence)  # 确保输入是 (seq_len, n_keypoints, 3)
    
    # 计算速度 (Δx, Δy, Δz)
    velocity = torch.diff(sequence, dim=0, prepend=sequence[:1])
    
    # 计算加速度 (Δv_x, Δv_y, Δv_z)
    acceleration = torch.diff(velocity, dim=0, prepend=velocity[:1])
    
    return velocity, acceleration


def affine_transform(sequence, scale_range=(0.9, 1.1), translation_range=(-0.1, 0.1), rotation_range=(-10, 10)):
    sequence = ensure_sequence_shape(sequence)  # 确保形状为 (seq_len, n_keypoints, 3)

    # 随机缩放
    scale = np.random.uniform(*scale_range)
    sequence = sequence * scale

    # 随机平移
    translation = np.random.uniform(*translation_range, size=(1, sequence.shape[1], sequence.shape[2]))
    sequence = sequence + torch.tensor(translation, dtype=sequence.dtype, device=sequence.device)

    # 随机旋转（围绕 Z 轴旋转）
    angle = np.radians(np.random.uniform(*rotation_range))
    rotation_matrix = torch.tensor([
        [np.cos(angle), -np.sin(angle), 0],
        [np.sin(angle),  np.cos(angle), 0],
        [0, 0, 1]
    ], dtype=sequence.dtype, device=sequence.device)
    sequence = torch.matmul(sequence, rotation_matrix)
    return sequence

# 在3D形状的 (seq_len, n_keypoints, 3) 中，定义关键点索引范围
face_indices = list(range(0, 90))            # 面部关键点在 3D 中为 0~89
pose_indices = list(range(90, 90 + 33))      # 姿态关键点在 3D 中为 90~122
left_hand_indices = list(range(123, 123 + 21))  # 左手关键点在 3D 中为 123~143
right_hand_indices = list(range(144, 144 + 21)) # 右手关键点在 3D 中为 144~164

DATA_PATH = "I:\\Ece496\\custom_data\\preprocess"
target_length = 105  # 设置目标长度

# 加载数据
X = np.load(os.path.join(DATA_PATH, "X.npy"), allow_pickle=True)
y = np.load(os.path.join(DATA_PATH, "y.npy"), allow_pickle=True)

# 划分数据集
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 保存划分后的数据
np.save(os.path.join(DATA_PATH, "X_train.npy"), X_train)
np.save(os.path.join(DATA_PATH, "y_train.npy"), y_train)
np.save(os.path.join(DATA_PATH, "X_val.npy"), X_val)
np.save(os.path.join(DATA_PATH, "y_val.npy"), y_val)
np.save(os.path.join(DATA_PATH, "X_test.npy"), X_test)
np.save(os.path.join(DATA_PATH, "y_test.npy"), y_test)

# 定义自定义数据集类
class ASLDataset(Dataset):
    def __init__(self, X, y, global_mean, global_std, augment=True):
        self.X = X
        self.y = y
        self.global_mean = global_mean
        self.global_std = global_std
        self.augment = augment

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sequence = self.X[idx]
        label = self.y[idx]

        sequence = np.nan_to_num(sequence, nan=0.0)
        
        # 归一化和中心化
        sequence = (sequence - self.global_mean) / self.global_std
        sequence = torch.from_numpy(sequence).float()

        # 确保形状为 (seq_len, n_keypoints, 3)
        sequence = ensure_sequence_shape(sequence)

        # 应用数据增强
        if self.augment:
            sequence = apply_random_augmentations(sequence, target_length=target_length)

        return sequence, label

def apply_random_augmentations(sequence, augment_prob=0.5, resize_prob=0.8, target_length=105):
    """
    对序列数据应用增强操作，支持未展平的 3D 数据。
    sequence 形状: (seq_len, n_keypoints, feature_dim)
    """
    sequence = ensure_sequence_shape(sequence)  # 确保序列为 3D 形状 (seq_len, n_keypoints, 3)
    print(f"Initial sequence shape: {sequence.shape}")

    # 1. 初始计算速度和加速度（不添加到序列中）
    initial_velocity, initial_acceleration = compute_velocity_and_acceleration(sequence)

    # 2. 时间增强
    if np.random.rand() < augment_prob:
        sequence = temporal_resample(sequence, target_length=target_length)
        print(f"Sequence shape after temporal_resample: {sequence.shape}")
        # 重新计算速度和加速度
        velocity, acceleration = compute_velocity_and_acceleration(sequence)

    if np.random.rand() < augment_prob:
        sequence = temporal_shift(sequence)
        print(f"Sequence shape after temporal_shift: {sequence.shape}")
        # 重新计算速度和加速度
        velocity, acceleration = compute_velocity_and_acceleration(sequence)

    # 3. 空间增强
    if np.random.rand() < augment_prob:
        sequence = affine_transform(sequence)
        print(f"Sequence shape after affine_transform: {sequence.shape}")
        # 重新计算速度和加速度
        velocity, acceleration = compute_velocity_and_acceleration(sequence)

    # 4. 遮罩增强（无需重新计算速度和加速度）
    if np.random.rand() < augment_prob:
        sequence = drop_face_or_pose(sequence, face_indices=face_indices, pose_indices=pose_indices)
        print(f"Sequence shape after drop_face_or_pose: {sequence.shape}")
    if np.random.rand() < augment_prob:
        sequence = drop_hand_keypoints(sequence, left_hand_indices=left_hand_indices, right_hand_indices=right_hand_indices)
        print(f"Sequence shape after drop_hand_keypoints: {sequence.shape}")
    if np.random.rand() < augment_prob:
        sequence = spatial_mask(sequence)
        print(f"Sequence shape after spatial_mask: {sequence.shape}")
    if np.random.rand() < augment_prob:
        sequence = temporal_mask(sequence)
        print(f"Sequence shape after temporal_mask: {sequence.shape}")

    # 5. 翻转（需要重新计算速度和加速度）
    if np.random.rand() < augment_prob:
        sequence = flip_keypoints(sequence, left_hand_indices, right_hand_indices)
        print(f"Sequence shape after flip_keypoints: {sequence.shape}")
        # 重新计算速度和加速度
        velocity, acceleration = compute_velocity_and_acceleration(sequence)

    # 6. 最后将完整的速度和加速度加入序列
    if 'velocity' in locals() and 'acceleration' in locals():
        sequence = torch.cat((sequence, velocity, acceleration), dim=2)
    else:
        # 如果没有动态重新计算，使用初始速度和加速度
        sequence = torch.cat((sequence, initial_velocity, initial_acceleration), dim=2)
    
    print(f"Final sequence shape: {sequence.shape}")
    return sequence





# 计算均值和标准差用于归一化
global_mean, global_std = compute_mean_std(X_train)



# 创建数据集和数据加载器
train_dataset = ASLDataset(X_train, y_train, global_mean, global_std, augment=True)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# # 训练循环中应用增强并打印处理信息
# for epoch in range(1):  # 设定单个 epoch 测试
#     print(f"Epoch {epoch + 1}")
#     for batch_idx, (sequences, labels) in enumerate(train_loader):
#         print(f"Batch {batch_idx + 1}:")
#         for sample_idx, sequence in enumerate(sequences):
#             print(f"  Sample {sample_idx + 1} after augmentation has shape: {sequence.shape}")
#         break  # 只打印一个批次，确保代码运行后不输出过多信息
#     break


In [82]:
augmented_X_train = np.array([apply_random_augmentations(torch.from_numpy(x).float()).numpy() for x in X_train])



Initial sequence shape: torch.Size([105, 176, 3])
Sequence shape after temporal_resample: torch.Size([105, 176, 3])
Sequence shape after temporal_shift: torch.Size([105, 176, 3])
Sequence shape after drop_hand_keypoints: torch.Size([105, 176, 3])
Final sequence shape: torch.Size([105, 176, 9])
Initial sequence shape: torch.Size([105, 176, 3])
Sequence shape after temporal_shift: torch.Size([105, 176, 3])
Sequence shape after drop_face_or_pose: torch.Size([105, 176, 3])
Sequence shape after temporal_mask: torch.Size([105, 176, 3])
Final sequence shape: torch.Size([105, 176, 9])
Initial sequence shape: torch.Size([105, 176, 3])
Sequence shape after temporal_resample: torch.Size([105, 176, 3])
Sequence shape after temporal_shift: torch.Size([105, 176, 3])
Sequence shape after affine_transform: torch.Size([105, 176, 3])
Sequence shape after drop_face_or_pose: torch.Size([105, 176, 3])
Sequence shape after drop_hand_keypoints: torch.Size([105, 176, 3])
Sequence shape after flip_keypoints: t

In [64]:
class ASLAugmentationCallback(Callback):
    def __init__(self, X_train, y_train, asl_augment_function, batch_size=32, target_length=105):
        """
        自定义回调，在每个 epoch 进行 ASL 增强。

        Args:
            X_train (np.ndarray): 原始训练数据。
            y_train (np.ndarray): 原始标签数据。
            asl_augment_function (callable): ASL 增强函数。
            batch_size (int): 每个 batch 的大小。
            target_length (int): 目标序列长度。
        """
        super().__init__()
        self.X_train = X_train
        self.y_train = y_train
        self.asl_augment_function = asl_augment_function
        self.batch_size = batch_size
        self.target_length = target_length

    def on_epoch_begin(self, epoch, logs=None):
        print(f"Epoch {epoch + 1}: Applying ASL augmentations...")
        # 对每个样本应用增强
        augmented_X_train = np.array([self.asl_augment_function(torch.from_numpy(x).float()).numpy() for x in self.X_train])
        self.augmented_X_train = augmented_X_train
        self.augmented_y_train = self.y_train

    def on_train_batch_begin(self, batch, logs=None):
        # 动态加载增强后的 batch
        start = batch * self.batch_size
        end = start + self.batch_size
        logs['batch_X'] = self.augmented_X_train[start:end]
        logs['batch_y'] = self.augmented_y_train[start:end]
class TestCallback(Callback):
    def __init__(self, model, X_test, y_test):
        super(TestCallback, self).__init__()
        self.model = model
        self.X_test = X_test
        self.y_test = y_test
    
    def on_epoch_end(self, epoch, logs=None):
        test_loss, test_acc = self.model.evaluate(self.X_test, self.y_test, verbose=0)
        print(f'   Test loss: {test_loss}, Test accuracy: {test_acc}')

In [65]:
asl_augment_callback = ASLAugmentationCallback(
    X_train, 
    y_train, 
    apply_random_augmentations, 
    batch_size=32, 
    target_length=105
)

test_callback = TestCallback(
    model_with_velocity, 
    X_test, 
    y_test
)


In [89]:
asl_augment_callback = ASLAugmentationCallback(X_train, y_train, apply_random_augmentations)

history = model_with_velocity.fit(
    X_train,
    y_train,
    epochs=100,
    validation_data=(X_val, y_val),
    callbacks=[tb_callback_with_velocity, test_callback, asl_augment_callback]
)


Epoch 1: Applying ASL augmentations...
Initial sequence shape: torch.Size([105, 176, 3])
Sequence shape after temporal_resample: torch.Size([105, 176, 3])
Sequence shape after drop_face_or_pose: torch.Size([105, 176, 3])
Sequence shape after drop_hand_keypoints: torch.Size([105, 176, 3])
Sequence shape after flip_keypoints: torch.Size([105, 176, 3])
Final sequence shape: torch.Size([105, 176, 9])
Initial sequence shape: torch.Size([105, 176, 3])
Sequence shape after temporal_resample: torch.Size([105, 176, 3])
Sequence shape after temporal_shift: torch.Size([105, 176, 3])
Sequence shape after temporal_mask: torch.Size([105, 176, 3])
Sequence shape after flip_keypoints: torch.Size([105, 176, 3])
Final sequence shape: torch.Size([105, 176, 9])
Initial sequence shape: torch.Size([105, 176, 3])
Sequence shape after affine_transform: torch.Size([105, 176, 3])
Sequence shape after drop_face_or_pose: torch.Size([105, 176, 3])
Sequence shape after flip_keypoints: torch.Size([105, 176, 3])
Fina

ValueError: in user code:

    File "c:\ProgramData\anaconda3\envs\tf2x_clone\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\ProgramData\anaconda3\envs\tf2x_clone\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\ProgramData\anaconda3\envs\tf2x_clone\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\ProgramData\anaconda3\envs\tf2x_clone\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "c:\ProgramData\anaconda3\envs\tf2x_clone\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\ProgramData\anaconda3\envs\tf2x_clone\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_4" is incompatible with the layer: expected shape=(None, 105, 176, 9), found shape=(None, 105, 528)


In [13]:
model_with_velocity.save_weights(r"I:\Ece496\custom_data\weight1_with_velocity.h5")

In [16]:
TEST_VIDEO_FOLDER = "I:\\Ece496\\custom_data\\realtime_test"
video_files = glob(os.path.join(TEST_VIDEO_FOLDER, "*.mp4"))

model_with_velocity.load_weights(r"I:\Ece496\custom_data\weight1_with_velocity.h5")
mp_holistic = mp.solutions.holistic

# 使用 MediaPipe 处理视频
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    for video_file in video_files:
        video_name = os.path.basename(video_file)
        print(f"Processing video: {video_name}")

        cap = cv2.VideoCapture(video_file)
        sequence = []  # 存储每帧的关键点
        predicted_labels = []  # 存储每帧的预测标签

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # 转换颜色空间并处理帧
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = holistic.process(frame_rgb)  # 使用已转换的RGB帧

            # 创建 custom_results 字典，确保传入正确的参数
            # 创建 custom_results 字典，确保传入正确的参数
            # Create custom results with filtered landmarks
            custom_results = CustomResults(
                face_landmarks=create_filtered_face_landmarks(results.face_landmarks.landmark, selected_indices) if results.face_landmarks else None,
                pose_landmarks=results.pose_landmarks,
                left_hand_landmarks=results.left_hand_landmarks,
                right_hand_landmarks=results.right_hand_landmarks
            )

            # Extract keypoints and ensure correct input shape
            keypoints = extract_keypoints(custom_results)


            if keypoints is not None:
                sequence.append(keypoints)

            # 检查是否达到 105 帧
            if len(sequence) == 105:
                prediction = model_with_velocity.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                label = np.argmax(prediction)
                predicted_labels.append(label)
                sequence = []  # 清空 sequence 以便下一段 105 帧

        cap.release()

        # 统计出现最多的标签作为最终预测结果
        if predicted_labels:
            final_prediction = max(set(predicted_labels), key=predicted_labels.count)
            print(f"Video '{video_name}' processed. Final predicted label: {final_prediction} (Action: {words[final_prediction]})\n")
        else:
            print(f"Video '{video_name}' processed. No prediction available.\n")


Processing video: I or me_1_alice.mp4
Video 'I or me_1_alice.mp4' processed. Final predicted label: 1 (Action: I or me)

Processing video: I or me_2_alice.mp4
Video 'I or me_2_alice.mp4' processed. Final predicted label: 1 (Action: I or me)

Processing video: I or me_3_alice.mp4
Video 'I or me_3_alice.mp4' processed. Final predicted label: 4 (Action: see u later)

Processing video: see u later_1_alice.mp4
Video 'see u later_1_alice.mp4' processed. Final predicted label: 4 (Action: see u later)

Processing video: see u later_2_alice.mp4
Video 'see u later_2_alice.mp4' processed. Final predicted label: 4 (Action: see u later)

Processing video: see u later_3_alice.mp4
Video 'see u later_3_alice.mp4' processed. Final predicted label: 4 (Action: see u later)



In [15]:
import numpy as np
import cv2
import mediapipe as mp

# 初始化
model_with_velocity.load_weights(r"I:\Ece496\custom_data\weight1_with_velocity.h5")
mp_holistic = mp.solutions.holistic

# 设置滑动窗口
window_size = 105
num_keypoints = 1584  # 每帧关键点数量
sequence = np.zeros((window_size, num_keypoints), dtype=float)
required_non_zero_ratio = 0.9  # 设置至少90%的帧要非零
frame_count = 0  # 初始化帧计数器

# 打开摄像头
cap = cv2.VideoCapture(0)
predicted_labels = []
last_predictions = None

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = holistic.process(frame_rgb)

        custom_results = CustomResults(
            face_landmarks=create_filtered_face_landmarks(results.face_landmarks.landmark, selected_indices) if results.face_landmarks else None,
            pose_landmarks=results.pose_landmarks,
            left_hand_landmarks=results.left_hand_landmarks,
            right_hand_landmarks=results.right_hand_landmarks
        )

        keypoints = extract_keypoints(custom_results)

        if keypoints is not None:
            # 更新关键点序列
            sequence[:-1] = sequence[1:]
            sequence[-1] = keypoints
            frame_count += 1  # 更新帧计数器

            # 跳过前105帧
            if frame_count < window_size:
                print(f"Skipping frame {frame_count}")
                continue  # 继续读取下一帧，直到填满105帧

            # 检查滑动窗口中非零帧的比例
            non_zero_frames = np.count_nonzero(np.any(sequence != 0, axis=1))
            if non_zero_frames / window_size >= required_non_zero_ratio:
                print("Sequence filled: True, ready for prediction")
                prediction = model_with_velocity.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                last_predictions = prediction
                print("Prediction:", last_predictions)
            else:
                print("Sequence filled: False")

        # 在帧上显示每个动作的预测概率
        if last_predictions is not None:
            for idx, prob in enumerate(last_predictions):
                text = f"{words[idx]}: {prob:.2f}"
                cv2.putText(frame, text, (10, 30 + idx * 30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2, cv2.LINE_AA)

        cv2.imshow('MediaPipe Holistic', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()


Skipping frame 1
Skipping frame 2
Skipping frame 3
Skipping frame 4
Skipping frame 5
Skipping frame 6
Skipping frame 7
Skipping frame 8
Skipping frame 9
Skipping frame 10
Skipping frame 11
Skipping frame 12
Skipping frame 13
Skipping frame 14
Skipping frame 15
Skipping frame 16
Skipping frame 17
Skipping frame 18
Skipping frame 19
Skipping frame 20
Skipping frame 21
Skipping frame 22
Skipping frame 23
Skipping frame 24
Skipping frame 25
Skipping frame 26
Skipping frame 27
Skipping frame 28
Skipping frame 29
Skipping frame 30
Skipping frame 31
Skipping frame 32
Skipping frame 33
Skipping frame 34
Skipping frame 35
Skipping frame 36
Skipping frame 37
Skipping frame 38
Skipping frame 39
Skipping frame 40
Skipping frame 41
Skipping frame 42
Skipping frame 43
Skipping frame 44
Skipping frame 45
Skipping frame 46
Skipping frame 47
Skipping frame 48
Skipping frame 49
Skipping frame 50
Skipping frame 51
Skipping frame 52
Skipping frame 53
Skipping frame 54
Skipping frame 55
Skipping frame 56
S

In [None]:

model_with_velocity.load_weights(r"I:\Ece496\custom_data\weight1_with_velocity.h5")
mp_holistic = mp.solutions.holistic

# 设置滑动窗口
window_size = 105
num_keypoints = 1584  # 每帧关键点数量
sequence = np.zeros((window_size, num_keypoints), dtype=float)

# 打开摄像头
cap = cv2.VideoCapture(0)
predicted_labels = []
last_predictions = None

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = holistic.process(frame_rgb)

        custom_results = CustomResults(
            face_landmarks=create_filtered_face_landmarks(results.face_landmarks.landmark, selected_indices) if results.face_landmarks else None,
            pose_landmarks=results.pose_landmarks,
            left_hand_landmarks=results.left_hand_landmarks,
            right_hand_landmarks=results.right_hand_landmarks
        )

        keypoints = extract_keypoints(custom_results)
        if keypoints is not None:
            # 更新关键点序列
            sequence[:-1] = sequence[1:]  # 移动数据
            sequence[-1] = keypoints  # 添加新的关键点

            # 当序列填满时进行预测
            if np.all(sequence[0] != 0):
                prediction = model_with_velocity.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                last_predictions = prediction  # 保存最新的预测结果

        # 在帧上显示每个动作的预测概率
        if last_predictions is not None:
            for idx, prob in enumerate(last_predictions):
                text = f"{words[idx]}: {prob:.2f}"
                cv2.putText(frame, text, (10, 30 + idx * 30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2, cv2.LINE_AA)

        cv2.imshow('MediaPipe Holistic', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()
