In [1]:
import sys
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
from mediapipe.framework.formats import landmark_pb2
from dataclasses import dataclass, field
from typing import List, Optional
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from glob import glob
import random
from tensorflow.keras.callbacks import Callback 
import json
from dataclasses import dataclass
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import BatchNormalization, Dropout, LeakyReLU
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau
from concurrent.futures import ProcessPoolExecutor, as_completed
from torch.utils.data import Dataset, DataLoader
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic

In [2]:
def mediapipe_detection(image, model):
    """Run `model` on a BGR frame.

    The frame is converted to RGB, marked read-only while `model.process`
    runs, then made writeable again and converted back to BGR.

    Returns:
        (image, results): the BGR frame after the round trip and whatever
        `model.process` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Lock the buffer for the duration of the inference call.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results
# Indices of the specific face landmarks that are kept and displayed.
# NOTE(review): the list holds 90 entries, and 63, 66 and 65 each appear twice
# (third row and last row). extract_keypoints zero-fills 90 * 3 face values when
# no face is detected, so dropping the repeats would change the feature width.
selected_indices = [
    419, 290, 303, 242, 56, 155, 221, 226, 387, 362, 385, 310, 295, 340, 0, 37, 39, 40, 178, 146, 90, 72, 
    448, 380, 274, 398, 87, 98, 64, 324, 222, 1, 13, 22, 159, 145, 157, 89, 
    312, 462, 259, 63, 66, 112, 461, 463, 348, 62, 308, 119, 269, 78, 16, 65, 144, 163, 
    384, 229, 84, 321, 325, 466, 403, 182, 232, 219, 141, 249, 196, 320, 95, 
    304, 77, 272, 224, 239, 268, 316, 405, 86, 186, 
    63, 296, 334, 53, 195, 66, 107, 52, 65
]

# Data class that stores the coordinates of a single key point.
@dataclass
class Landmark:
    """Plain x / y / z coordinate triple for one key point."""
    x: float
    y: float
    z: float

# CustomResults data class.
@dataclass
class CustomResults:
    """Container for the four landmark lists passed around in this notebook.

    Every field defaults to None; the callers in this notebook pass None for
    the face when the detector returned no face landmarks.
    """
    # Face landmarks (callers in this notebook pass the list filtered by selected_indices).
    face_landmarks: Optional[landmark_pb2.NormalizedLandmarkList] = None
    # Body pose landmarks.
    pose_landmarks: Optional[landmark_pb2.NormalizedLandmarkList] = None
    # Left-hand landmarks.
    left_hand_landmarks: Optional[landmark_pb2.NormalizedLandmarkList] = None
    # Right-hand landmarks.
    right_hand_landmarks: Optional[landmark_pb2.NormalizedLandmarkList] = None

# Pick the requested face landmarks and wrap them in a NormalizedLandmarkList.
def create_filtered_face_landmarks(landmarks, indices):
    """Build a NormalizedLandmarkList holding `landmarks[i]` for each i in `indices`.

    Args:
        landmarks: indexable collection of landmarks; a falsy value yields None.
        indices: positions to pick, in output order (repeats are kept).

    Returns:
        A `landmark_pb2.NormalizedLandmarkList`, or None when `landmarks` is falsy.
    """
    if landmarks:
        picked = []
        for position in indices:
            picked.append(landmarks[position])
        return landmark_pb2.NormalizedLandmarkList(landmark=picked)
    return None
# Draw key points: labelled dots for the face, MediaPipe's own renderer for pose and hands.
def draw_landmarks(image, custom_results):
    """Draw the landmarks held in `custom_results` onto `image` in place.

    Face landmarks are drawn as small green dots, each labelled with the
    matching entry of `selected_indices` (no connection lines). Pose and hand
    landmarks are drawn through `mp_drawing.draw_landmarks` with their
    connection sets. Landmark groups that are falsy are skipped.
    """
    h, w, _ = image.shape

    # Face: one dot plus index label per landmark.
    face = custom_results.face_landmarks
    if face:
        for idx, landmark in zip(selected_indices, face.landmark):
            x, y = int(landmark.x * w), int(landmark.y * h)
            cv2.circle(image, (x, y), 2, (0, 255, 0), -1)  
            cv2.putText(image, str(idx), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 255, 0), 1, cv2.LINE_AA)

    # Pose, left hand, right hand: (attribute, connections, landmark colour, connection colour).
    skeleton_layers = (
        ("pose_landmarks", mp_holistic.POSE_CONNECTIONS, (88, 22, 88), (66, 22, 66)),
        ("left_hand_landmarks", mp_holistic.HAND_CONNECTIONS, (100, 22, 200), (100, 22, 200)),
        ("right_hand_landmarks", mp_holistic.HAND_CONNECTIONS, (0, 90, 0), (0, 90, 0)),
    )
    for attr_name, connections, landmark_color, connection_color in skeleton_layers:
        landmark_list = getattr(custom_results, attr_name)
        if not landmark_list:
            continue
        mp_drawing.draw_landmarks(
            image, landmark_list, connections,
            mp_drawing.DrawingSpec(color=landmark_color, thickness=2, circle_radius=4),
            mp_drawing.DrawingSpec(color=connection_color, thickness=2, circle_radius=2)
        )

def calculate_velocity_acceleration(data):
    """
    Compute first- and second-order differences of `data` along axis 1.

    Parameters:
    - data: 3D array-like; the original author documents it as
      (num_samples, sequence_length, num_keypoints). Differences are taken
      along axis 1.

    Returns:
    - (velocity, acceleration): a tuple of two arrays, each with the same
      shape as `data`. `velocity` is the first difference along axis 1 with
      its first step repeated so the length is preserved; `acceleration` is
      the same operation applied to `velocity`. When axis 1 has fewer than
      two steps no difference exists, so zeros of the input shape are
      returned instead of a zero-length axis.
    """
    def _diff_keep_length(values):
        # Nothing to difference: keep the shape rather than collapsing axis 1.
        if values.shape[1] < 2:
            return np.zeros_like(values)
        step = np.diff(values, axis=1)
        # Repeat the first difference so the output has as many steps as the input.
        return np.concatenate([step[:, :1, :], step], axis=1)

    data = np.asanyarray(data)
    velocity = _diff_keep_length(data)
    acceleration = _diff_keep_length(velocity)

    return velocity, acceleration
def calculate_relative_change(features):
    """
    Compute the change of every step relative to the previous step along axis 1.

    Parameters:
    - features: 3D array-like; the original author documents it as
      (num_samples, sequence_length, num_keypoints).

    Returns:
    - Array with the same shape as `features` holding the first difference
      along axis 1, with the first difference repeated at position 0 so the
      length is preserved. When axis 1 has fewer than two steps, zeros of the
      input shape are returned instead of a zero-length axis.
    """
    features = np.asanyarray(features)

    # Nothing to difference: keep the shape rather than collapsing axis 1.
    if features.shape[1] < 2:
        return np.zeros_like(features)

    # Difference between adjacent steps, padded at the front to keep the shape.
    relative_change = np.diff(features, axis=1)
    relative_change = np.concatenate([relative_change[:, :1, :], relative_change], axis=1)

    return relative_change

def extract_keypoints(custom_results):
    """Flatten one frame's landmarks into a single 1-D feature vector.

    The vector is the concatenation of (a) the centred position values,
    (b) the flattened "relative velocity" array and (c) the flattened
    "relative acceleration" array, so its length is three times the length of
    the position vector.

    Args:
        custom_results: object exposing face_landmarks, pose_landmarks,
            left_hand_landmarks and right_hand_landmarks; each either falsy or
            an object with a `.landmark` iterable.

    Returns:
        1-D numpy array of features.
    """
    # Position key points (face, pose, left_hand, right_hand). A missing group is
    # replaced by zeros: 90*3 for the face, 33*4 for the pose, 21*3 per hand.
    # Pose entries carry four values each (x, y, z, visibility); the others three.
    face = np.array([[lm.x, lm.y, lm.z] for lm in custom_results.face_landmarks.landmark]).flatten() \
        if custom_results.face_landmarks else np.zeros(90 * 3)
    pose = np.array([[lm.x, lm.y, lm.z, lm.visibility] for lm in custom_results.pose_landmarks.landmark]).flatten() \
        if custom_results.pose_landmarks else np.zeros(33 * 4)
    lh = np.array([[lm.x, lm.y, lm.z] for lm in custom_results.left_hand_landmarks.landmark]).flatten() \
        if custom_results.left_hand_landmarks else np.zeros(21 * 3)
    rh = np.array([[lm.x, lm.y, lm.z] for lm in custom_results.right_hand_landmarks.landmark]).flatten() \
        if custom_results.right_hand_landmarks else np.zeros(21 * 3)

    # Concatenate the position data.
    position_features = np.concatenate([face, pose, lh, rh])

    # Centre the position features on triplet number 85 (the original note
    # describes it as e.g. a body-centre point).
    # NOTE(review): with a 90-landmark face block, flat indices 255-257 lie inside
    # the face block, and they are all zero when no face was detected — confirm
    # this is the intended reference point.
    def centralize_keypoints(keypoints, center_idx=85):
        """Subtract the values at triplet `center_idx` from every stride-3 slot, in place.

        NOTE(review): the vector is treated as consecutive triplets, but the pose
        block is built with four values per landmark, so past the face block the
        stride-3 slots do not line up with x / y / z — confirm this is intended.
        """
        center_x = keypoints[center_idx * 3]
        center_y = keypoints[center_idx * 3 + 1]
        center_z = keypoints[center_idx * 3 + 2]
        
        keypoints[::3] -= center_x  # centre every 1st-of-3 slot
        keypoints[1::3] -= center_y  # centre every 2nd-of-3 slot
        keypoints[2::3] -= center_z  # centre every 3rd-of-3 slot
        return keypoints

    # The helper mutates and returns its argument, so this is the same array
    # object as position_features.
    position_features_centered = centralize_keypoints(position_features)

    # Velocity and acceleration (no normalisation).
    # NOTE(review): the input is one frame reshaped to (1, n_triplets, 3), and the
    # helper differences along axis 1, so these are differences between consecutive
    # triplets of the same frame rather than between frames — confirm this is intended.
    velocity, acceleration = calculate_velocity_acceleration(position_features_centered.reshape(1, -1, 3))

    # Relative change of those two arrays (again differenced along axis 1).
    relative_velocity = calculate_relative_change(velocity)
    relative_acceleration = calculate_relative_change(acceleration)

    # Concatenate position, velocity-derived and acceleration-derived features.
    all_features = np.concatenate([
        position_features_centered,
        relative_velocity.flatten(),
        relative_acceleration.flatten()
    ])

    return all_features
# Data-augmentation helpers (placed after extract_keypoints).

def time_resample(sequence, factor):
    """Resample `sequence` along its first axis by nearest-frame selection.

    The output has `int(len(sequence) * factor)` frames (at least one for a
    non-empty input). Frames are picked at evenly spaced positions rounded to
    the nearest index; no values are interpolated.

    Args:
        sequence: array-like whose first axis is time (lists are accepted).
        factor: positive length multiplier (<1 shortens, >1 lengthens).

    Returns:
        numpy array of the selected frames.

    Raises:
        ValueError: if `factor` is not positive.
    """
    if factor <= 0:
        raise ValueError(f"factor must be positive, got {factor!r}")

    frames = np.asarray(sequence)
    n_frames = len(frames)
    if n_frames == 0:
        # Nothing to pick from; hand back an (empty) copy.
        return frames.copy()

    # Clamp to one frame so a small factor cannot silently produce an empty sequence.
    target_length = max(1, int(n_frames * factor))
    indices = np.linspace(0, n_frames - 1, target_length)
    indices = np.round(indices).astype(int)
    return frames[indices]

def spatial_transform(sequence):
    """Placeholder spatial augmentation: returns an unmodified copy of `sequence`.

    A rotation angle in (-15, 15) and a scale factor in (0.9, 1.1) are drawn
    from the `random` module, but neither is applied to the data yet.
    """
    # Drawn but unused; the draws are kept so the random stream advances as before.
    _rotation_angle = random.uniform(-15, 15)
    _scale_factor = random.uniform(0.9, 1.1)

    # NOTE: any real transform must preserve the relative layout of the key points.
    return sequence.copy()

def random_mask(sequence):
    """Return a copy of `sequence` with a random time window zeroed in a few columns.

    A start frame in [0, len(sequence) - 3], a window length in [2, 4] and
    between 2 and 6 distinct column indices are drawn from the `random`
    module; the selected rows x columns are set to 0 in the copy.
    """
    result = sequence.copy()

    # Random time window.
    first_frame = random.randint(0, len(sequence) - 3)
    window_len = random.randint(2, 4)
    rows = slice(first_frame, first_frame + window_len)

    # Random feature columns.
    n_columns = random.randint(2, 6)
    columns = random.sample(range(sequence.shape[1]), k=n_columns)

    # Apply the mask.
    result[rows, columns] = 0
    return result

In [3]:
# Vocabulary of signs and the word -> integer label mapping (label = position in `words`).
words = ['hello',  'I or me', 'father', 'mother','see u later']
label_map = dict(zip(words, range(len(words))))
DATA_PATH = 'I:/Ece496/custom_data'
sequence_length = int(30 * 3.5)  # number of frames per video



In [4]:
# Paths: raw-video root plus the bookkeeping files used for incremental processing.
DATA_PATH = "I:\\Ece496\\custom_data"
PROCESSED_VIDEOS_PATH = os.path.join(DATA_PATH, "processed_videos_velocity.txt")
SEQUENCE_COUNTER_PATH = os.path.join(DATA_PATH, "sequence_counter_velocity.json")
X_CACHE_PATH = os.path.join(DATA_PATH, "X_velocity.npy")
Y_CACHE_PATH = os.path.join(DATA_PATH, "y_velocity.npy")

# Load previously extracted key-point data, if any.
if os.path.exists(X_CACHE_PATH) and os.path.exists(Y_CACHE_PATH):
    X = np.load(X_CACHE_PATH)
    y = np.load(Y_CACHE_PATH)
    sequences = list(X)  # lists, so new samples can be appended
    labels = list(y)
else:
    sequences = []
    labels = []

# Load the set of video paths that were already processed.
if os.path.exists(PROCESSED_VIDEOS_PATH):
    with open(PROCESSED_VIDEOS_PATH, 'r') as f:
        processed_videos = set(line.strip() for line in f)
else:
    processed_videos = set()

# Load the per-word sequence counter.
if os.path.exists(SEQUENCE_COUNTER_PATH):
    with open(SEQUENCE_COUNTER_PATH, 'r') as f:
        sequence_counter = json.load(f)
else:
    sequence_counter = {word: 0 for word in words}  # fresh counter

for word in words:
    word_path = os.path.join(DATA_PATH, word)
    video_files = glob(os.path.join(word_path, f"{word}_*.mp4"))

    for video_file in video_files:
        if video_file in processed_videos:
            print(f"Skipping already processed video: {video_file}")
            continue

        # Tentative sequence number. It is committed to the counter only once the
        # video has produced a full window, so unreadable or too-short videos no
        # longer inflate the numbering. `.get` also covers words that were added
        # after the counter file was first written.
        sequence = sequence_counter.get(word, 0) + 1

        # Report the video being processed, its sequence number and its label.
        print(f"Processing video: {video_file}, sequence: {sequence}, label: {label_map[word]}")

        cap = cv2.VideoCapture(video_file)
        if not cap.isOpened():
            print(f"Cannot open video file: {video_file}")
            cap.release()
            continue

        window = []

        try:
            with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
                frame_num = 0
                while frame_num < sequence_length:
                    ret, frame = cap.read()
                    if not ret:
                        print(f"Failed to read frame from {video_file}")
                        break

                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    results = holistic.process(frame_rgb)

                    custom_results = CustomResults(
                        face_landmarks=create_filtered_face_landmarks(results.face_landmarks.landmark, selected_indices) if results.face_landmarks else None,
                        pose_landmarks=results.pose_landmarks,
                        left_hand_landmarks=results.left_hand_landmarks,
                        right_hand_landmarks=results.right_hand_landmarks
                    )

                    keypoints = extract_keypoints(custom_results)
                    window.append(keypoints)

                    frame_num += 1
        finally:
            # Release the capture even if detection or feature extraction raised.
            cap.release()

        # Only videos that yielded a full window are kept; shorter ones are dropped
        # and will be retried on the next run.
        if len(window) == sequence_length:
            sequences.append(window)
            labels.append(label_map[word])
            processed_videos.add(video_file)
            sequence_counter[word] = sequence
            print(f"Added sequence for word '{word}', sequence {sequence}, label: {label_map[word]}")

            # Persist the counter after every successfully processed video.
            with open(SEQUENCE_COUNTER_PATH, 'w') as f:
                json.dump(sequence_counter, f)

# Save the processed video paths to the txt file.
with open(PROCESSED_VIDEOS_PATH, 'w') as f:
    for video_path in processed_videos:
        f.write(f"{video_path}\n")

# Convert `sequences` and `labels` to NumPy arrays and save them.
X = np.array(sequences)
y = np.array(labels)
np.save(X_CACHE_PATH, X)
np.save(Y_CACHE_PATH, y)

print("Total sequences:", len(sequences))
print("X 的形状:", X.shape)
print("y 的形状:", y.shape)


Skipping already processed video: I:\Ece496\custom_data\hello\hello_1_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_2_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_3_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_4_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_5_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_7_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_8_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_10_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_11_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_12_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_13_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_14_Bill.mp4
Skipping already processed video: I:\Ece496\cus

In [5]:
# label_map is assumed to be a dict shaped like {"hello": 0, "goodbye": 1, ...}.
# Build the inverse mapping: integer label -> word.
reverse_label_map = dict((number, name) for name, number in label_map.items())

# Print the word behind every label.
print("Label 对应的单词如下:")
for label, word in reverse_label_map.items():
    print(f"Label {label}: {word}")


Label 对应的单词如下:
Label 0: hello
Label 1: I or me
Label 2: father
Label 3: mother
Label 4: see u later


In [6]:
# One-hot encode the integer labels (one column per entry in `words`) as ints,
# then display the resulting array as the cell output.
y = to_categorical(labels, num_classes=len(words)).astype(int)
y

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1]])

In [7]:
# Show the shapes of the feature array and the one-hot label array.
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")


X shape: (510, 105, 1584)
y shape: (510, 5)


In [8]:

# One-hot encode the integer labels.
y = to_categorical(labels, num_classes=len(words)).astype(int)

# First hold out 20% of the samples as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Then carve the validation set out of the remaining 80%
# (0.125 * 0.8 = 0.1, i.e. 10% of the full data set).
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state=42, test_size=0.125)

# Confirm the shapes after splitting.
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_test shape: {y_test.shape}")


X_train shape: (357, 105, 1584)
X_val shape: (51, 105, 1584)
X_test shape: (102, 105, 1584)
y_train shape: (357, 5)
y_val shape: (51, 5)
y_test shape: (102, 5)


In [None]:
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.layers import Input, Add, TimeDistributed
from tensorflow.keras.callbacks import Callback, TensorBoard, EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt

# ====================== 1. Data augmentation functions ======================
def time_resample(sequence, factor):
    """Resample `sequence` along its first axis by nearest-frame selection.

    The output has `int(len(sequence) * factor)` frames (at least one for a
    non-empty input). Frames are picked at evenly spaced positions rounded to
    the nearest index; no values are interpolated.

    Args:
        sequence: array-like whose first axis is time (lists are accepted).
        factor: positive length multiplier (<1 shortens, >1 lengthens).

    Returns:
        numpy array of the selected frames.

    Raises:
        ValueError: if `factor` is not positive.
    """
    if factor <= 0:
        raise ValueError(f"factor must be positive, got {factor!r}")

    frames = np.asarray(sequence)
    n_frames = len(frames)
    if n_frames == 0:
        # Nothing to pick from; hand back an (empty) copy.
        return frames.copy()

    # Clamp to one frame so a small factor cannot silently produce an empty sequence.
    target_length = max(1, int(n_frames * factor))
    indices = np.linspace(0, n_frames - 1, target_length)
    indices = np.round(indices).astype(int)
    return frames[indices]

def spatial_transform(sequence):
    """Return a copy of `sequence` with a random uniform scale applied.

    One scale factor is drawn from (0.9, 1.1) and multiplied into the first
    and second column of every group of three columns; every third column is
    left untouched. A trailing incomplete group is handled without error.

    NOTE(review): this assumes the columns are laid out as (x, y, z) triplets.
    extract_keypoints builds pose landmarks with four values each and appends
    derived features, so verify the layout against the feature vector.

    Args:
        sequence: 2-D float array, frames along axis 0 and features along axis 1.

    Returns:
        Scaled copy; the input is not modified.
    """
    seq = sequence.copy()
    scale = random.uniform(0.9, 1.1)
    # Strided slices replace the per-column Python loop and cannot index past the end.
    seq[:, 0::3] *= scale  # first slot of each triplet (x)
    seq[:, 1::3] *= scale  # second slot of each triplet (y)
    return seq

def random_mask(sequence):
    """Return a copy of `sequence` with a random time window zeroed for a few column triplets.

    A window length in [2, 4], a start frame that keeps the window inside the
    sequence, and 2 to 4 distinct triplet indices (out of `shape[1] // 3`) are
    drawn from the `random` module. The three columns of each chosen triplet
    are set to 0 over the window.
    """
    masked = sequence.copy()

    # Time window.
    span = random.randint(2, 4)
    first_frame = random.randint(0, len(masked) - span)
    last_frame = first_frame + span

    # Which triplets to blank out.
    triplet_count = sequence.shape[1] // 3
    how_many = random.randint(2, 4)
    chosen_triplets = random.sample(range(triplet_count), k=how_many)

    for triplet in chosen_triplets:
        first_col = 3 * triplet
        masked[first_frame:last_frame, first_col:first_col + 3] = 0
    return masked

def augment_dataset(sequences, labels):
    """Static (offline) augmentation: expand every sample into four.

    For each (sequence, label) pair the output contains, in this order: the
    original sequence, a time-resampled variant for factor 0.8, one for factor
    1.2 (both brought back to the original frame count by nearest-index
    selection), and one `spatial_transform` variant. Every variant keeps the
    sample's label.

    Returns:
        (augmented_sequences, augmented_labels) as float32 numpy arrays.
    """
    # Make sure the inputs are numpy arrays.
    sequences = np.array(sequences)
    labels = np.array(labels)

    out_sequences = []
    out_labels = []

    def _emit(seq, lab):
        out_sequences.append(seq)
        out_labels.append(lab)

    for raw_sequence, label in zip(sequences, labels):
        base = np.array(raw_sequence, dtype=np.float32)
        n_frames = base.shape[0]

        # Original sample.
        _emit(base, label)

        # Time-resampled variants.
        for factor in (0.8, 1.2):
            warped = time_resample(base.copy(), factor)
            if warped.shape[0] != n_frames:
                # Bring the variant back to the original number of frames.
                picks = np.linspace(0, warped.shape[0] - 1, n_frames)
                picks = np.round(picks).astype(int)
                warped = warped[picks]
            _emit(warped, label)

        # Spatially transformed variant.
        _emit(spatial_transform(base.copy()), label)

    # All variants share one shape, so they stack into regular arrays.
    return (
        np.array(out_sequences, dtype=np.float32),
        np.array(out_labels, dtype=np.float32),
    )

class SignLanguageDataset(tf.keras.utils.Sequence):
    """Batch provider with on-the-fly (dynamic) augmentation.

    Args:
        sequences: array-like of samples; converted to a float32 array.
        labels: array-like of targets; converted to a float32 array.
        batch_size: number of samples per batch.
        augment: when True, each sample in a batch is passed through
            `random_mask` with probability 0.5, and the sample order is
            reshuffled at the end of every epoch.
        **kwargs: forwarded to the Keras base class.
    """
    def __init__(self, sequences, labels, batch_size=32, augment=True, **kwargs):
        # Newer Keras versions expect subclasses to call the base initialiser
        # (and warn when they do not); older versions accept the call as well.
        super().__init__(**kwargs)
        self.batch_size = batch_size
        self.augment = augment

        # Make sure the data types are correct.
        self.sequences = np.array(sequences, dtype=np.float32)
        self.labels = np.array(labels, dtype=np.float32)
        self.indexes = np.arange(len(self.sequences))

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.sequences) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch number `idx` as a (sequences, labels) pair of float32 tensors."""
        # Sample indices belonging to this batch.
        batch_indexes = self.indexes[idx*self.batch_size:(idx+1)*self.batch_size]

        # Copies, so augmentation never touches the stored data.
        batch_sequences = self.sequences[batch_indexes].copy()
        batch_labels = self.labels[batch_indexes].copy()

        # Dynamic augmentation.
        if self.augment:
            for i in range(len(batch_sequences)):
                if random.random() < 0.5:
                    batch_sequences[i] = random_mask(batch_sequences[i])

        # Convert to TensorFlow tensors.
        return tf.convert_to_tensor(batch_sequences, dtype=tf.float32), \
               tf.convert_to_tensor(batch_labels, dtype=tf.float32)

    def on_epoch_end(self):
        """Reshuffle the sample order after each epoch (only when augmenting)."""
        if self.augment:
            np.random.shuffle(self.indexes)

# ====================== 2. Model definitions ======================
def create_original_model(input_shape, num_classes):
    """Build and compile the baseline stacked-LSTM classifier.

    Three LSTM layers (128, 64, 32 units) with dropout after the first two,
    followed by two Dense + LeakyReLU + BatchNormalization stages (64 and 32
    units) and a softmax output with `num_classes` units. Compiled with the
    Adam optimizer and categorical cross-entropy.
    """
    model = Sequential()

    # Recurrent stack.
    model.add(LSTM(128, return_sequences=True, activation='tanh', input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(LSTM(64, return_sequences=True, activation='tanh'))
    model.add(Dropout(0.2))
    model.add(LSTM(32, activation='tanh'))

    # Dense head.
    model.add(Dense(64))
    model.add(LeakyReLU(alpha=0.1))
    model.add(BatchNormalization())
    model.add(Dense(32))
    model.add(LeakyReLU(alpha=0.1))
    model.add(BatchNormalization())
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_residual_model(input_shape, num_classes):
    """Build and compile the LSTM classifier with projection shortcuts.

    Two residual blocks (128 and 64 units): each adds the output of a
    sequence-returning LSTM to a per-timestep Dense projection of the block
    input, then applies BatchNormalization and Dropout(0.2). They are followed
    by an LSTM(32), BatchNormalization, two Dense + LeakyReLU +
    BatchNormalization stages (64 and 32 units) and a softmax output.
    Compiled with the Adam optimizer and categorical cross-entropy.
    """
    def _residual_lstm_block(tensor, units):
        # LSTM branch plus a Dense shortcut that matches its width.
        recurrent = LSTM(units, return_sequences=True, activation='tanh')(tensor)
        projected = TimeDistributed(Dense(units))(tensor)
        merged = Add()([recurrent, projected])
        merged = BatchNormalization()(merged)
        return Dropout(0.2)(merged)

    inputs = Input(shape=input_shape)

    # Two residual LSTM blocks.
    features = _residual_lstm_block(inputs, 128)
    features = _residual_lstm_block(features, 64)

    # Final LSTM and fully connected head.
    features = LSTM(32, activation='tanh')(features)
    features = BatchNormalization()(features)
    for width in (64, 32):
        features = Dense(width)(features)
        features = LeakyReLU(alpha=0.1)(features)
        features = BatchNormalization()(features)
    outputs = Dense(num_classes, activation='softmax')(features)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# ====================== 3. Training utilities ======================
class TestCallback(Callback):
    """Evaluate the model on a held-out set after every epoch and print the result.

    NOTE(review): this reports test-set metrics during training; they are only
    printed here, but they should not be used for model selection.

    Args:
        model: model to evaluate.
        X_test, y_test: evaluation data passed to `model.evaluate`.
        model_name: prefix used in the printed messages.
    """
    def __init__(self, model, X_test, y_test, model_name=""):
        super().__init__()
        # Keep a private reference instead of assigning `self.model`: Keras sets
        # that attribute itself through set_model(), and newer Keras releases
        # expose it as a read-only property, where the assignment fails.
        self._eval_model = model
        self.X_test = X_test
        self.y_test = y_test
        self.best_test_acc = 0
        self.model_name = model_name

    def on_epoch_end(self, epoch, logs=None):
        """Print test loss / accuracy and track the best accuracy seen so far."""
        # Fall back to the model attached by Keras when none was passed in.
        target_model = self._eval_model if self._eval_model is not None else self.model
        test_loss, test_acc = target_model.evaluate(self.X_test, self.y_test, verbose=0)
        print(f'{self.model_name} - Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}')
        if test_acc > self.best_test_acc:
            self.best_test_acc = test_acc
            print(f'{self.model_name} - New best test accuracy: {test_acc:.4f}')

def create_callbacks(model, model_name, X_test, y_test, log_dir):
    """Assemble the callback list used for one training run.

    Returns, in order: TensorBoard logging to `log_dir`, early stopping on
    `val_loss` (patience 15, best weights restored), learning-rate halving on
    a `val_loss` plateau (patience 3, floor 1e-6) and the per-epoch
    TestCallback for `model`.
    """
    tensorboard_cb = TensorBoard(log_dir=log_dir)
    early_stopping_cb = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1)
    reduce_lr_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)
    test_eval_cb = TestCallback(model, X_test, y_test, model_name)
    return [tensorboard_cb, early_stopping_cb, reduce_lr_cb, test_eval_cb]

def plot_models_comparison(original_history, residual_history):
    """Plot training / validation accuracy and loss of both models side by side.

    Both arguments must expose a `.history` dict with the keys 'accuracy',
    'val_accuracy', 'loss' and 'val_loss'. The figure is saved as
    'model_comparison.png' and then shown.
    """
    plt.figure(figsize=(15, 5))

    # (subplot position, history key, title, y-axis label)
    panels = (
        (1, 'accuracy', 'Model Accuracy Comparison', 'Accuracy'),
        (2, 'loss', 'Model Loss Comparison', 'Loss'),
    )
    runs = (('Original', original_history), ('Residual', residual_history))

    for position, metric, title, y_label in panels:
        plt.subplot(1, 2, position)
        for run_name, run_history in runs:
            plt.plot(run_history.history[metric], label=f'{run_name} Train')
            plt.plot(run_history.history['val_' + metric], label=f'{run_name} Val')
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.savefig('model_comparison.png')
    plt.show()

# ====================== 4. Main training function ======================
def train_models_comparison(X_train, y_train, X_val, y_val, X_test, y_test):
    """Train the baseline and the residual LSTM on the same data and compare them.

    Steps: cast all arrays to float32, expand the training set with
    `augment_dataset`, wrap train / validation data in `SignLanguageDataset`
    (dynamic masking on the training set only), train both models for up to
    100 epochs with the callbacks from `create_callbacks`, save their weights
    to 'original_model.h5' / 'residual_model.h5' and plot the comparison.

    The model input shape and the number of classes are derived from
    `X_train` / `y_train`, so the function follows whatever sequence length,
    feature width and label set the data has.

    Args:
        X_train, X_val, X_test: arrays shaped (samples, frames, features).
        y_train, y_val, y_test: one-hot label arrays shaped (samples, classes).

    Returns:
        (original_model, residual_model) after training.

    Raises:
        ValueError: if `X_train` is not 3-D or `y_train` is not 2-D.
    """
    batch_size = 32

    # Make sure the input data has the expected dtype.
    X_train = np.array(X_train, dtype=np.float32)
    y_train = np.array(y_train, dtype=np.float32)
    X_val = np.array(X_val, dtype=np.float32)
    y_val = np.array(y_val, dtype=np.float32)
    X_test = np.array(X_test, dtype=np.float32)
    y_test = np.array(y_test, dtype=np.float32)

    if X_train.ndim != 3:
        raise ValueError(f"X_train must be 3-D (samples, frames, features), got shape {X_train.shape}")
    if y_train.ndim != 2:
        raise ValueError(f"y_train must be 2-D one-hot (samples, classes), got shape {y_train.shape}")

    # Derive the model dimensions from the data instead of hard-coding them.
    input_shape = X_train.shape[1:]
    num_classes = y_train.shape[1]

    # Create the models.
    original_model = create_original_model(input_shape, num_classes)
    residual_model = create_residual_model(input_shape, num_classes)

    print("Initial shapes:")
    print(f"X_train shape: {X_train.shape}")
    print(f"y_train shape: {y_train.shape}")

    # Static data augmentation.
    print("\nApplying data augmentation...")
    X_train_aug, y_train_aug = augment_dataset(X_train, y_train)
    print(f"Dataset size before augmentation: {len(X_train)}")
    print(f"Dataset size after augmentation: {len(X_train_aug)}")
    print("Augmented shapes:")
    print(f"X_train_aug shape: {X_train_aug.shape}")
    print(f"y_train_aug shape: {y_train_aug.shape}")

    # Batch providers.
    train_dataset = SignLanguageDataset(
        sequences=X_train_aug,
        labels=y_train_aug,
        batch_size=batch_size,
        augment=True
    )

    val_dataset = SignLanguageDataset(
        sequences=X_val,
        labels=y_val,
        batch_size=batch_size,
        augment=False
    )

    # Log directories.
    log_dir_original = os.path.join('Logs', 'original_model')
    log_dir_residual = os.path.join('Logs', 'residual_model')
    os.makedirs(log_dir_original, exist_ok=True)
    os.makedirs(log_dir_residual, exist_ok=True)

    # Train the baseline model.
    print("\nTraining Original Model...")
    original_history = original_model.fit(
        train_dataset,
        epochs=100,
        validation_data=val_dataset,
        callbacks=create_callbacks(original_model, "Original Model", X_test, y_test, log_dir_original),
        verbose=1
    )

    # Train the residual model.
    print("\nTraining Residual Model...")
    residual_history = residual_model.fit(
        train_dataset,
        epochs=100,
        validation_data=val_dataset,
        callbacks=create_callbacks(residual_model, "Residual Model", X_test, y_test, log_dir_residual),
        verbose=1
    )

    # Save the weights.
    # NOTE(review): newer Keras releases may require the '.weights.h5' suffix for
    # save_weights — confirm against the installed version before renaming.
    original_model.save_weights("original_model.h5")
    residual_model.save_weights("residual_model.h5")

    # Plot the comparison.
    plot_models_comparison(original_history, residual_history)

    return original_model, residual_model

# ====================== 5. Run the training ======================
if __name__ == "__main__":
    print("Starting model comparison training...")
    # Uses the train / validation / test arrays produced by the split cell.
    original_model, residual_model = train_models_comparison(
        X_train=X_train, y_train=y_train,
        X_val=X_val, y_val=y_val,
        X_test=X_test, y_test=y_test,
    )

In [24]:
# Persist the trained weights to disk.
# NOTE(review): `model_with_velocity` is not defined in any cell visible in this
# export (the training cell returns `original_model` / `residual_model`) —
# presumably it is left over from an earlier kernel session; confirm which model
# this should be, otherwise Restart & Run All fails here.
model_with_velocity.save_weights(r"I:\Ece496\custom_data\weight1_with_velocity.h5")

In [12]:
TEST_VIDEO_FOLDER = "I:\\Ece496\\custom_data\\realtime_test"
video_files = glob(os.path.join(TEST_VIDEO_FOLDER, "*.mp4"))

# NOTE(review): `model_with_velocity` is not defined in any cell visible in this
# export — confirm which architecture these weights belong to.
model_with_velocity.load_weights(r"I:\Ece496\custom_data\weight1_with_velocity.h5")
mp_holistic = mp.solutions.holistic

# Run MediaPipe over every test video and classify it in windows of
# `sequence_length` frames (the window length used when the data was extracted).
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    for video_file in video_files:
        video_name = os.path.basename(video_file)
        print(f"Processing video: {video_name}")

        cap = cv2.VideoCapture(video_file)
        sequence = []  # per-frame feature vectors of the current window
        predicted_labels = []  # one predicted label per completed window

        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # Convert the colour space and run the detector on the RGB frame.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = holistic.process(frame_rgb)

                # Wrap the detections, keeping only the selected face landmarks.
                custom_results = CustomResults(
                    face_landmarks=create_filtered_face_landmarks(results.face_landmarks.landmark, selected_indices) if results.face_landmarks else None,
                    pose_landmarks=results.pose_landmarks,
                    left_hand_landmarks=results.left_hand_landmarks,
                    right_hand_landmarks=results.right_hand_landmarks
                )

                # extract_keypoints always returns a vector (missing groups are
                # zero-filled), so no None check is needed.
                sequence.append(extract_keypoints(custom_results))

                # A full window is classified and then discarded; frames left over
                # at the end of the video that do not fill a window are ignored.
                if len(sequence) == sequence_length:
                    prediction = model_with_velocity.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                    label = np.argmax(prediction)
                    predicted_labels.append(label)
                    sequence = []
        finally:
            # Release the capture even if detection or prediction raised.
            cap.release()

        # The most frequent window label is the final prediction for the video.
        if predicted_labels:
            final_prediction = max(set(predicted_labels), key=predicted_labels.count)
            print(f"Video '{video_name}' processed. Final predicted label: {final_prediction} (Action: {words[final_prediction]})\n")
        else:
            print(f"Video '{video_name}' processed. No prediction available.\n")


Processing video: father_1_Bill.mp4
Video 'father_1_Bill.mp4' processed. Final predicted label: 2 (Action: father)

Processing video: father_2_Bill.mp4
Video 'father_2_Bill.mp4' processed. Final predicted label: 2 (Action: father)

Processing video: father_3_Bill.mp4
Video 'father_3_Bill.mp4' processed. Final predicted label: 2 (Action: father)

Processing video: see you later_4_Bill.mp4
Video 'see you later_4_Bill.mp4' processed. Final predicted label: 4 (Action: see u later)

Processing video: see you later_5_Bill.mp4
Video 'see you later_5_Bill.mp4' processed. Final predicted label: 4 (Action: see u later)

Processing video: father_6_Bill.mp4
Video 'father_6_Bill.mp4' processed. Final predicted label: 2 (Action: father)

Processing video: father_7_Bill.mp4
Video 'father_7_Bill.mp4' processed. Final predicted label: 2 (Action: father)

Processing video: hello_8_Bill.mp4
Video 'hello_8_Bill.mp4' processed. Final predicted label: 0 (Action: hello)

Processing video: mother_9_Bill.mp4
V

In [10]:
class PredictionSmoother:
    """Smooth a stream of prediction vectors with a weighted moving average.

    The most recent `window_size` predictions are kept. Once at least three
    are available, `update` returns their weighted average, with weights
    growing exponentially towards the newest entry. The last average whose
    top value exceeded `confidence_threshold` is remembered as the "stable"
    prediction.
    """
    def __init__(self, window_size=10, confidence_threshold=0.75):
        self.window_size = window_size
        self.confidence_threshold = confidence_threshold
        self.predictions_history = []
        self.last_stable_prediction = None

    def update(self, new_prediction):
        """Add a prediction and return (smoothed, last_stable).

        Returns (None, None) while fewer than three predictions are stored.
        """
        history = self.predictions_history
        history.append(new_prediction)

        # Keep the window at a fixed size by dropping the oldest entry.
        if len(history) > self.window_size:
            del history[0]

        count = len(history)
        if count < 3:
            # Too few predictions to start emitting output.
            return None, None

        # Exponentially increasing weights (oldest -> newest), normalised to sum to 1.
        weights = np.exp(np.linspace(-1, 0, count))
        weights = weights / weights.sum()

        smooth_prediction = np.average(history, axis=0, weights=weights)

        # Promote to "stable" only when the top value clears the threshold.
        if np.max(smooth_prediction) > self.confidence_threshold:
            self.last_stable_prediction = smooth_prediction

        return smooth_prediction, self.last_stable_prediction

    def reset(self):
        """Reset the smoother state."""
        self.predictions_history = []
        self.last_stable_prediction = None

In [11]:
import numpy as np
import cv2
import mediapipe as mp

# Initialization: load the trained weights into the model built in an earlier cell.
model_with_velocity.load_weights(r"I:\Ece496\custom_data\weight1_with_velocity.h5")
mp_holistic = mp.solutions.holistic

# Sliding-window configuration
window_size = 105
num_keypoints = 1584  # number of keypoint values per frame
sequence = np.zeros((window_size, num_keypoints), dtype=float)
required_non_zero_ratio = 0.9  # at least 90% of the frames in the window must be non-zero
frame_count = 0  # number of frames pushed into the window so far

# Prediction smoother
smoother = PredictionSmoother(window_size=10, confidence_threshold=0.75)

# Open the webcam
cap = cv2.VideoCapture(0)
predicted_labels = []
last_predictions = None

# try/finally guarantees the camera and the preview window are released even
# when MediaPipe, the model or the drawing code raises inside the loop.
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = holistic.process(frame_rgb)

            custom_results = CustomResults(
                face_landmarks=create_filtered_face_landmarks(results.face_landmarks.landmark, selected_indices) if results.face_landmarks else None,
                pose_landmarks=results.pose_landmarks,
                left_hand_landmarks=results.left_hand_landmarks,
                right_hand_landmarks=results.right_hand_landmarks
            )

            keypoints = extract_keypoints(custom_results)

            if keypoints is not None:
                # Shift the window by one frame and append the newest keypoints.
                sequence[:-1] = sequence[1:]
                sequence[-1] = keypoints
                frame_count += 1

                if frame_count < window_size:
                    # Warm-up: the window is not full yet, so no prediction is made.
                    # Deliberately no `continue` here: the frame must still be shown
                    # and the 'q' key polled below, otherwise the preview stays
                    # blank and cannot be closed until the window has filled.
                    print(f"Skipping frame {frame_count}")
                else:
                    # Only predict when enough frames in the window carry data.
                    non_zero_frames = np.count_nonzero(np.any(sequence != 0, axis=1))
                    if non_zero_frames / window_size >= required_non_zero_ratio:
                        # Raw model output for the current window
                        raw_prediction = model_with_velocity.predict(
                            np.expand_dims(sequence, axis=0),
                            verbose=0
                        )[0]

                        # Smooth the raw output over recent predictions
                        smooth_pred, stable_pred = smoother.update(raw_prediction)

                        if smooth_pred is not None:
                            # The overlay shows the smoothed probabilities
                            last_predictions = smooth_pred

                            # Report the stable prediction when one exists
                            if stable_pred is not None:
                                predicted_class = np.argmax(stable_pred)
                                confidence = stable_pred[predicted_class]
                                print(f"Stable Prediction: {words[predicted_class]} ({confidence:.2f})")

            # Draw the per-class probabilities on the frame
            if last_predictions is not None:
                for idx, prob in enumerate(last_predictions):
                    # Colour code: green = high confidence, yellow = medium, red = low
                    if prob > 0.75:
                        color = (0, 255, 0)  # green
                    elif prob > 0.5:
                        color = (0, 255, 255)  # yellow
                    else:
                        color = (0, 0, 255)  # red

                    text = f"{words[idx]}: {prob:.2f}"
                    cv2.putText(frame, text, (10, 30 + idx * 30),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2, cv2.LINE_AA)

            cv2.imshow('MediaPipe Holistic', frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    cap.release()
    cv2.destroyAllWindows()


Skipping frame 1
Skipping frame 2
Skipping frame 3
Skipping frame 4
Skipping frame 5
Skipping frame 6
Skipping frame 7
Skipping frame 8
Skipping frame 9
Skipping frame 10
Skipping frame 11
Skipping frame 12
Skipping frame 13
Skipping frame 14
Skipping frame 15
Skipping frame 16
Skipping frame 17
Skipping frame 18
Skipping frame 19
Skipping frame 20
Skipping frame 21
Skipping frame 22
Skipping frame 23
Skipping frame 24
Skipping frame 25
Skipping frame 26
Skipping frame 27
Skipping frame 28
Skipping frame 29
Skipping frame 30
Skipping frame 31
Skipping frame 32
Skipping frame 33
Skipping frame 34
Skipping frame 35
Skipping frame 36
Skipping frame 37
Skipping frame 38
Skipping frame 39
Skipping frame 40
Skipping frame 41
Skipping frame 42
Skipping frame 43
Skipping frame 44
Skipping frame 45
Skipping frame 46
Skipping frame 47
Skipping frame 48
Skipping frame 49
Skipping frame 50
Skipping frame 51
Skipping frame 52
Skipping frame 53
Skipping frame 54
Skipping frame 55
Skipping frame 56
S

In [11]:
import numpy as np
import cv2
import mediapipe as mp
import time

# MediaPipe setup: short aliases for the holistic solution and the drawing utilities
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

class MotionDetector:
    """Frame-to-frame motion estimator over a flat keypoint vector.

    Three slices of the vector are compared with the previous frame:
    ``[0:270]``, ``[411:474]`` and ``[474:537]``. The original author treats
    them as face, left-hand and right-hand coordinates respectively
    (NOTE(review): confirm these offsets against ``extract_keypoints``).
    """

    def __init__(self, window_size=10, motion_threshold=0.003):
        """Store the configuration and start with an empty frame buffer.

        Args:
            window_size: maximum number of weighted frames kept in the buffer.
            motion_threshold: motion level callers compare against; it is
                only stored here, not used by ``calculate_motion``.
        """
        self.window_size = window_size
        self.motion_threshold = motion_threshold
        self.frame_buffer = []

    def calculate_motion(self, keypoints):
        """Return the largest weighted per-coordinate change since the last frame.

        The first call (empty buffer) returns ``0``. Every call appends the
        weighted vector to the buffer, which is trimmed to ``window_size``.
        """
        face_slice = keypoints[:90*3]
        left_slice = keypoints[411:411+63]
        right_slice = keypoints[474:474+63]

        # Hands count four times as much as the face (2.0 vs 0.5).
        current = np.concatenate([
            face_slice * 0.5,
            left_slice * 2.0,
            right_slice * 2.0
        ])

        # Peak displacement (not the mean) relative to the newest stored frame.
        motion = np.max(np.abs(current - self.frame_buffer[-1])) if self.frame_buffer else 0

        self.frame_buffer.append(current)
        if len(self.frame_buffer) > self.window_size:
            del self.frame_buffer[0]

        return motion

class GestureRecognizer:
    """Motion-gated gesture recognizer implemented as a three-state machine.

    States:
        WAITING   - idle; switches to DETECTING once the high-motion counter
                    reaches 2 (the smoother is reset on entry).
        DETECTING - runs the model on every call and smooths its output;
                    switches to HOLDING once the low-motion counter reaches 3
                    and the smoother has a stable prediction.
        HOLDING   - keeps the captured prediction; returns to DETECTING on
                    renewed motion, or to WAITING after ``hold_time`` seconds.

    The two motion counters are leaky rather than strictly consecutive: a
    high-motion frame reduces the low counter by 2, and a low-motion frame
    reduces the high counter by 1 (both floored at 0).
    """

    def __init__(self,
                 motion_threshold=0.003,
                 hold_time=0.3,
                 smooth_window=5,
                 transition_delay=0.1,
                 model=None):
        """Build the recognizer.

        Args:
            motion_threshold: motion level above which a frame counts as moving.
            hold_time: seconds spent in HOLDING before falling back to WAITING.
            smooth_window: window size handed to ``PredictionSmoother``.
            transition_delay: stored on the instance; ``update`` does not
                currently consult it.
            model: optional object exposing ``predict(batch, verbose=0)``.
                When ``None`` the module-level ``model_with_velocity`` is
                looked up at prediction time, as before.
        """
        self.motion_detector = MotionDetector(motion_threshold=motion_threshold)
        self.predictor = PredictionSmoother(window_size=smooth_window)
        self.model = model
        self.state = 'WAITING'
        self.last_stable_gesture = None
        self.hold_start_time = None
        self.hold_threshold = hold_time
        self.transition_delay = transition_delay
        # Monotonic clock: only elapsed time matters, and it must not jump
        # when the system wall clock is adjusted.
        self.last_state_change = time.monotonic()
        self.consecutive_low_motion = 0
        self.consecutive_high_motion = 0

        self.debug_info = {
            'motion_level': 0,
            'state': self.state,
            'time_in_state': 0,
            'consecutive_low': 0,
            'consecutive_high': 0
        }

    def update(self, sequence):
        """Advance the state machine by one frame.

        Args:
            sequence: window of per-frame keypoint vectors; the last row is
                used for motion estimation and the whole window is fed to the
                model while in DETECTING.

        Returns:
            The most recently captured stable prediction, or ``None`` if none
            has been captured yet. The value is never cleared, so it may
            originate from an earlier detection cycle.
        """
        current_time = time.monotonic()
        motion_level = self.motion_detector.calculate_motion(sequence[-1])
        self.debug_info['motion_level'] = motion_level

        # Leaky counters: the low counter decays fast, the high counter slowly.
        if motion_level > self.motion_detector.motion_threshold:
            self.consecutive_high_motion += 1
            self.consecutive_low_motion = max(0, self.consecutive_low_motion - 2)
        else:
            self.consecutive_low_motion += 1
            self.consecutive_high_motion = max(0, self.consecutive_high_motion - 1)

        self.debug_info['consecutive_low'] = self.consecutive_low_motion
        self.debug_info['consecutive_high'] = self.consecutive_high_motion

        if self.state == 'WAITING':
            # Two high-motion counts are enough to start detecting.
            if self.consecutive_high_motion >= 2:
                self.state = 'DETECTING'
                self.last_state_change = current_time
                self.predictor.reset()

        elif self.state == 'DETECTING':
            # Fall back to the notebook-level model when none was injected.
            model = self.model if self.model is not None else model_with_velocity
            raw_prediction = model.predict(
                np.expand_dims(sequence, axis=0), verbose=0)[0]

            _, stable_pred = self.predictor.update(raw_prediction)

            # Three low-motion counts plus a stable prediction end the gesture.
            if self.consecutive_low_motion >= 3:
                if stable_pred is not None:
                    self.last_stable_gesture = stable_pred
                    self.state = 'HOLDING'
                    self.hold_start_time = current_time
                    self.last_state_change = current_time

        elif self.state == 'HOLDING':
            if self.consecutive_high_motion >= 2:
                # Movement resumed: start a fresh detection cycle.
                self.state = 'DETECTING'
                self.last_state_change = current_time
                self.predictor.reset()
            elif current_time - self.hold_start_time > self.hold_threshold:
                self.state = 'WAITING'
                self.last_state_change = current_time

        self.debug_info['state'] = self.state
        self.debug_info['time_in_state'] = current_time - self.last_state_change

        return self.last_stable_gesture

def draw_debug_info(frame, recognizer):
    """Draw the recognizer's debug overlay onto ``frame`` in place.

    The overlay shows a motion bar with a threshold marker, the two motion
    counters, the current state (colour coded) and the time spent in it.

    Args:
        frame: BGR image array to draw on (modified in place).
        recognizer: object exposing ``debug_info`` (with keys
            ``motion_level``, ``consecutive_low``, ``consecutive_high``,
            ``state``, ``time_in_state``) and
            ``motion_detector.motion_threshold``.

    The block occupies roughly 110 rows. It starts at y=400 when the frame
    is tall enough (layout identical to the fixed coordinates used before)
    and is shifted up on shorter frames, e.g. 480-row webcam images, so the
    bottom lines are not drawn outside the image.
    """
    info = recognizer.debug_info
    font = cv2.FONT_HERSHEY_SIMPLEX
    white = (255, 255, 255)

    # Top of the overlay; 110 = offset of the last text baseline (100) + margin.
    base_y = max(0, min(400, frame.shape[0] - 110))

    # Motion level and threshold, scaled by 1000 px per unit (bar capped at 200 px).
    motion_bar_length = min(int(info['motion_level'] * 1000), 200)
    threshold_bar_length = int(recognizer.motion_detector.motion_threshold * 1000)

    # Motion bar
    cv2.rectangle(frame, (10, base_y), (10 + motion_bar_length, base_y + 20),
                  (0, 255, 0), -1)
    # Threshold marker
    cv2.line(frame, (10 + threshold_bar_length, base_y - 5),
             (10 + threshold_bar_length, base_y + 25), (0, 0, 255), 2)

    cv2.putText(frame, f"Motion: {info['motion_level']:.3f}",
                (220, base_y + 15), font, 0.6, white, 1)

    # Motion counters
    cv2.putText(frame, f"Consecutive low: {info['consecutive_low']}",
                (10, base_y + 40), font, 0.6, white, 1)
    cv2.putText(frame, f"Consecutive high: {info['consecutive_high']}",
                (10, base_y + 60), font, 0.6, white, 1)

    # Current state, colour coded
    state_colors = {
        'WAITING': (255, 255, 255),
        'DETECTING': (0, 255, 255),
        'HOLDING': (0, 255, 0)
    }
    cv2.putText(frame, f"State: {info['state']}",
                (10, base_y + 80), font, 0.6,
                state_colors[info['state']], 2)

    cv2.putText(frame, f"Time in state: {info['time_in_state']:.1f}s",
                (10, base_y + 100), font, 0.6, white, 1)
def main():
    """Run the live webcam demo with the motion-gated gesture recognizer.

    Frames are read from camera 0, processed by MediaPipe Holistic and turned
    into keypoint vectors that fill a 105-frame sliding window. Once the
    window is full, every new frame is passed to ``GestureRecognizer.update``.
    The last captured prediction and the debug overlay are drawn on the
    preview. Press 'q' to quit.

    The camera and the preview window are released even if an exception is
    raised inside the loop.
    """
    # Sliding-window parameters
    window_size = 105
    num_keypoints = 1584
    sequence = np.zeros((window_size, num_keypoints), dtype=float)
    frame_count = 0
    last_predictions = None

    # Load the trained weights
    model_with_velocity.load_weights(r"I:\Ece496\custom_data\weight1_with_velocity.h5")

    # Recognizer configuration
    recognizer = GestureRecognizer(
        motion_threshold=0.05,
        hold_time=1.0,
        smooth_window=10,
        transition_delay=0.5
    )

    # Open the webcam
    cap = cv2.VideoCapture(0)

    try:
        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = holistic.process(frame_rgb)

                custom_results = CustomResults(
                    face_landmarks=create_filtered_face_landmarks(results.face_landmarks.landmark, selected_indices)
                        if results.face_landmarks else None,
                    pose_landmarks=results.pose_landmarks,
                    left_hand_landmarks=results.left_hand_landmarks,
                    right_hand_landmarks=results.right_hand_landmarks
                )

                keypoints = extract_keypoints(custom_results)

                if keypoints is not None:
                    # Shift the window by one frame and append the newest keypoints.
                    sequence[:-1] = sequence[1:]
                    sequence[-1] = keypoints
                    frame_count += 1

                    # The recognizer only runs once the window has been filled.
                    if frame_count >= window_size:
                        prediction = recognizer.update(sequence)
                        if prediction is not None:
                            last_predictions = prediction

                if last_predictions is not None:
                    for idx, prob in enumerate(last_predictions):
                        text = f"{words[idx]}: {prob:.2f}"
                        color = (0, 255, 0) if prob > 0.75 else (0, 255, 255)
                        cv2.putText(frame, text, (10, 30 + idx * 30),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

                draw_debug_info(frame, recognizer)
                cv2.imshow('MediaPipe Holistic', frame)

                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        # Always free the capture device and close the preview window.
        cap.release()
        cv2.destroyAllWindows()

# Start the webcam demo when this code is executed as the main module.
if __name__ == '__main__':
    main()