In [1]:
import sys
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
from mediapipe.framework.formats import landmark_pb2
from dataclasses import dataclass, field
from typing import List, Optional
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from glob import glob
import random
import json
from dataclasses import dataclass
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import BatchNormalization, Dropout, LeakyReLU
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau
# Short aliases for the MediaPipe drawing utilities and the Holistic solution module.
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic

In [2]:
def mediapipe_detection(image, model):
    """Run a MediaPipe model on a single BGR frame.

    Args:
        image: Frame in OpenCV's BGR channel order.
        model: Object exposing ``process(rgb_image)``.

    Returns:
        Tuple ``(bgr_image, results)``: the frame after a BGR -> RGB -> BGR
        round trip, and whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Keep the buffer read-only while the model runs, then make it writeable again.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results
def draw_landmarks(image, results):
    """Draw face, pose and hand landmarks on ``image`` with default styling.

    Args:
        image: BGR frame that is drawn on in place.
        results: Holistic result object providing ``face_landmarks``,
            ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    # NOTE(review): newer MediaPipe releases reportedly no longer expose
    # ``FACE_CONNECTIONS`` and ship ``FACEMESH_*`` sets instead. Prefer the old
    # name when it exists, otherwise fall back, so this works on both.
    face_connections = getattr(mp_holistic, 'FACE_CONNECTIONS', None)
    if face_connections is None:
        face_connections = getattr(mp_holistic, 'FACEMESH_CONTOURS', None)

    mp_drawing.draw_landmarks(image, results.face_landmarks, face_connections)  # Face
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)  # Pose
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)  # Left hand
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)  # Right hand
def draw_styled_landmarks(image, results):
    """Draw the detected landmark groups on ``image`` with per-group colours.

    Face landmarks are drawn as points only (no connection lines); pose and
    both hands are drawn with their connection lines. Groups that were not
    detected are skipped.

    Args:
        image: BGR frame that is drawn on in place.
        results: Holistic result object providing ``face_landmarks``,
            ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    spec = mp_drawing.DrawingSpec
    # (landmarks, connections, landmark style, connection style), in draw order.
    layers = (
        # Face: points only. The green channel was 256, which is outside the
        # valid 0-255 range, so it is clamped to 255 here.
        (results.face_landmarks, None,
         spec(color=(80, 110, 10), thickness=1, circle_radius=1),
         spec(color=(80, 255, 121), thickness=1, circle_radius=1)),
        # Pose
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         spec(color=(80, 22, 10), thickness=2, circle_radius=4),
         spec(color=(80, 44, 121), thickness=2, circle_radius=2)),
        # Left hand
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(121, 22, 76), thickness=2, circle_radius=4),
         spec(color=(121, 44, 250), thickness=2, circle_radius=2)),
        # Right hand
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(245, 117, 66), thickness=2, circle_radius=4),
         spec(color=(245, 66, 230), thickness=2, circle_radius=2)),
    )

    for landmarks, connections, point_style, line_style in layers:
        if landmarks:
            mp_drawing.draw_landmarks(image, landmarks, connections, point_style, line_style)

def extract_keypoints(results):
    """Flatten one frame's landmarks into a single 1-D feature vector.

    The vector is the concatenation, in this order, of pose (x, y, z,
    visibility), face (x, y, z), left hand (x, y, z) and right hand (x, y, z).
    A group that is missing is replaced by zeros of length 33*4, 468*3, 21*3
    and 21*3 respectively.

    Args:
        results: Object with ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks`` attributes;
            each is either falsy or has a ``landmark`` iterable.

    Returns:
        1-D ``numpy`` array of keypoint values.
    """
    def _flatten(group, count, fields):
        # Zero padding keeps the vector length fixed when a group is not detected.
        if not group:
            return np.zeros(count * len(fields))
        rows = [[getattr(point, name) for name in fields] for point in group.landmark]
        return np.array(rows).flatten()

    xyz = ('x', 'y', 'z')
    return np.concatenate([
        _flatten(results.pose_landmarks, 33, xyz + ('visibility',)),
        _flatten(results.face_landmarks, 468, xyz),
        _flatten(results.left_hand_landmarks, 21, xyz),
        _flatten(results.right_hand_landmarks, 21, xyz),
    ])

In [3]:
# Action classes and clip parameters
actions = ['hello', 'I or me', 'father', 'mother', 'see u later']
# Number of frames taken from each video (presumably 3.5 s at 30 fps -- confirm).
# Stored as an int because it is used as a frame count: loop bound, length
# check and model input length.
sequence_length = int(30 * 3.5)

In [4]:

# Paths for the dataset and the bookkeeping files
# NOTE(review): absolute local path -- consider making this configurable.
DATA_PATH = "I:\\Ece496\\custom_data"
PROCESSED_VIDEOS_PATH = os.path.join(DATA_PATH, "processed_videos_normal.txt")
SEQUENCE_COUNTER_PATH = os.path.join(DATA_PATH, "sequence_counter_normal.json")


# Load previously saved keypoint data, if both arrays exist, so that new
# clips are appended to it instead of starting over.
x_path = os.path.join(DATA_PATH, "X_normal.npy")
y_path = os.path.join(DATA_PATH, "y_normal.npy")
if os.path.exists(x_path) and os.path.exists(y_path):
    X = np.load(x_path)
    y = np.load(y_path)
    sequences = list(X)  # lists, so new data can be appended
    labels = list(y)
else:
    sequences = []
    labels = []

# Load the set of already processed video paths (one path per line).
# Blank lines are ignored so they do not end up as an empty "path".
if os.path.exists(PROCESSED_VIDEOS_PATH):
    with open(PROCESSED_VIDEOS_PATH, 'r') as f:
        processed_videos = set(line.strip() for line in f if line.strip())
else:
    processed_videos = set()

# Load the per-action sequence counter. Start from zero for every current
# action and overlay the saved values, so an action added to `actions` after
# the file was written does not raise KeyError later.
sequence_counter = {action: 0 for action in actions}
if os.path.exists(SEQUENCE_COUNTER_PATH):
    with open(SEQUENCE_COUNTER_PATH, 'r') as f:
        sequence_counter.update(json.load(f))

# Read every not-yet-processed video and extract per-frame keypoints
target_frames = int(sequence_length)  # frame count used for the loop bound and length check

for action in actions:
    video_files = glob(os.path.join(DATA_PATH, action, "*.mp4"))

    for video_file in video_files:
        if video_file in processed_videos:
            print(f"Skipping already processed video: {video_file}")
            continue

        # Tentative sequence number. It is committed to the counter only when
        # the clip is actually added, so failed or short videos (which are
        # retried on the next run) do not inflate the count.
        sequence = sequence_counter.get(action, 0) + 1

        print(f"Processing video: {video_file}, sequence: {sequence}")

        cap = cv2.VideoCapture(video_file)
        if not cap.isOpened():
            print(f"Cannot open video file: {video_file}")
            cap.release()
            continue

        window = []

        try:
            # A fresh Holistic instance is created for each video.
            with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
                while len(window) < target_frames:
                    ret, frame = cap.read()
                    if not ret:
                        print(f"Failed to read frame from {video_file}")
                        break

                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    results = holistic.process(frame_rgb)

                    # Extract and store this frame's keypoints
                    window.append(extract_keypoints(results))
        finally:
            # Release the capture even if detection raises.
            cap.release()

        if len(window) != target_frames:
            print(f"Discarding {video_file}: read {len(window)} of {target_frames} frames")
            continue

        sequences.append(window)
        labels.append(action)
        processed_videos.add(video_file)
        sequence_counter[action] = sequence
        print(f"Added sequence for action '{action}', sequence {sequence}")

        # Persist the counter after every successfully processed video
        with open(SEQUENCE_COUNTER_PATH, 'w') as f:
            json.dump(sequence_counter, f)

# Convert sequences and labels to NumPy arrays and save them.
# float32 halves the memory and disk footprint compared with NumPy's default
# float64 (training already hit a MemoryError on this data).
X = np.array(sequences, dtype=np.float32)
y = np.array(labels)
np.save(os.path.join(DATA_PATH, "X_normal.npy"), X)
np.save(os.path.join(DATA_PATH, "y_normal.npy"), y)

# Write the processed-video list only after the arrays are safely on disk;
# otherwise a failed save would mark videos as processed whose keypoints
# were never stored.
with open(PROCESSED_VIDEOS_PATH, 'w') as f:
    for video_path in sorted(processed_videos):
        f.write(f"{video_path}\n")

print("Total sequences:", len(sequences))
print("X 的形状:", X.shape)
print("y 的形状:", y.shape)


Skipping already processed video: I:\Ece496\custom_data\hello\hello_1_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_2_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_3_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_4_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_5_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_7_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_8_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_10_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_11_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_12_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_13_Bill.mp4
Skipping already processed video: I:\Ece496\custom_data\hello\hello_14_Bill.mp4
Skipping already processed video: I:\Ece496\cus

In [5]:
# Map each action name to an integer class index
label_map = {action: i for i, action in enumerate(actions)}

# Encode from `labels` (the string labels collected during extraction) rather
# than rewriting `y` in terms of itself, so this cell can be re-run without a
# KeyError on already-encoded values.
label_indices = [label_map[label] for label in labels]

# One-hot encode the class indices
y = to_categorical(label_indices, num_classes=len(actions))

y

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]], dtype=float32)

In [6]:
# Hold out 5% of the clips for testing. The fixed seed makes the split, and
# therefore the reported metrics, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

In [7]:
# TensorBoard callback writing to the relative "Logs" directory
log_dir = 'Logs'
tb_callback = TensorBoard(log_dir=log_dir)

In [8]:
# Stacked-LSTM classifier over keypoint sequences.
# The number of time steps comes from `sequence_length` instead of a repeated
# literal. 1662 features per frame = 33*4 (pose) + 468*3 (face) + 2 * 21*3
# (hands), i.e. the vector produced by extract_keypoints.
model = Sequential([
    LSTM(64, return_sequences=True, activation='tanh', input_shape=(int(sequence_length), 1662)),
    LSTM(128, return_sequences=True, activation='tanh'),
    LSTM(64, return_sequences=False, activation='tanh'),
    Dense(64, activation='tanh'),
    Dense(32, activation='tanh'),
    Dense(len(actions), activation='softmax'),  # one probability per action
])

In [9]:
# Multi-class classification: categorical cross-entropy on the softmax output.
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [10]:
# Train for 200 epochs, logging to TensorBoard.
model.fit(x=X_train, y=y_train, epochs=200, callbacks=[tb_callback])

MemoryError: Unable to allocate 146. MiB for an array with shape (220, 105, 1662) and data type float32

In [None]:
# Class probabilities for every held-out clip
res = model.predict(x=X_test)



In [None]:
# Predicted action for test sample 6
actions[int(np.argmax(res[6]))]

'father'

In [None]:
# Ground-truth action for test sample 6
actions[int(np.argmax(y_test[6]))]

'father'

In [None]:
# Save the trained model to an HDF5 file in the working directory
model.save(filepath='action.h5')

In [None]:
# Restore the weights from the saved file
model.load_weights(filepath='action.h5')

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [None]:
# Class probabilities for the held-out clips, used for the metrics below
yhat = model.predict(x=X_test)



In [None]:
# Collapse one-hot targets and predicted probabilities to class indices.
# Predictions are recomputed here instead of rewriting `yhat` in terms of
# itself, so re-running this cell does not fail on an already-converted list.
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(model.predict(X_test), axis=1).tolist()

In [None]:
# One 2x2 confusion matrix per class
multilabel_confusion_matrix(y_true=ytrue, y_pred=yhat)

array([[[ 7,  0],
        [ 0,  5]],

       [[11,  0],
        [ 0,  1]],

       [[ 8,  0],
        [ 0,  4]],

       [[11,  0],
        [ 0,  1]],

       [[11,  0],
        [ 0,  1]]], dtype=int64)

In [None]:
# Fraction of held-out clips classified correctly
accuracy_score(y_true=ytrue, y_pred=yhat)

1.0

In [None]:
import random
# One random 3-channel colour per action, used for the probability bars
colors = [tuple(random.randint(0, 255) for _channel in range(3)) for _action in actions]

def prob_viz(res, actions, input_frame, colors):
    """Overlay one labelled probability bar per class on a copy of the frame.

    Args:
        res: Iterable of per-class probabilities.
        actions: Class names, indexed like ``res``.
        input_frame: Frame to annotate; it is copied, not modified.
        colors: Bar colours, indexed like ``res``.

    Returns:
        The annotated copy of ``input_frame``.
    """
    canvas = input_frame.copy()
    for idx, probability in enumerate(res):
        top = 60 + idx * 40  # each class gets its own 40-pixel row
        bar_corner = (int(probability * 100), top + 30)  # bar width is probability * 100 px
        cv2.rectangle(canvas, (0, top), bar_corner, colors[idx], -1)
        cv2.putText(canvas, actions[idx], (0, top + 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

    return canvas

In [None]:
# NOTE(review): absolute local path -- consider making this configurable.
TEST_VIDEO_FOLDER = "I:\\Ece496\\custom_data\\realtime_test"
mp_holistic = mp.solutions.holistic
video_files = glob(os.path.join(TEST_VIDEO_FOLDER, "*.mp4"))

# Frames per prediction segment: the same clip length used for training.
segment_length = int(sequence_length)

# Classify each video
for video_file in video_files:
    video_name = os.path.basename(video_file)
    print(f"Processing video: {video_name}")

    cap = cv2.VideoCapture(video_file)
    sequence = []  # keypoints of the current segment
    predicted_labels = []  # one predicted class index per full segment

    try:
        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # Convert to RGB and run detection
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = holistic.process(frame_rgb)

                # extract_keypoints always returns an array, so no None check is needed
                sequence.append(extract_keypoints(results))

                # Predict once a full segment is collected, then start the next segment
                if len(sequence) == segment_length:
                    prediction = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                    predicted_labels.append(int(np.argmax(prediction)))
                    sequence = []
    finally:
        # Release the capture even if detection or prediction raises.
        cap.release()

    # Majority vote over the per-segment predictions
    if predicted_labels:
        final_prediction = max(set(predicted_labels), key=predicted_labels.count)
        print(f"Video '{video_name}' processed. Final predicted label: {final_prediction} (Action: {actions[final_prediction]})\n")
    else:
        print(f"Video '{video_name}' processed. No prediction available.\n")

Processing video: see u later_1_Bill.mp4
Video 'see u later_1_Bill.mp4' processed. Final predicted label: 4 (Action: see u later)

Processing video: see u later_2_Bill.mp4
Video 'see u later_2_Bill.mp4' processed. Final predicted label: 4 (Action: see u later)

Processing video: see u later_3_Bill.mp4
Video 'see u later_3_Bill.mp4' processed. Final predicted label: 4 (Action: see u later)

Processing video: see u later_7_Bill.mp4
Video 'see u later_7_Bill.mp4' processed. Final predicted label: 4 (Action: see u later)

Processing video: see u later_8_Bill.mp4
Video 'see u later_8_Bill.mp4' processed. Final predicted label: 4 (Action: see u later)

Processing video: see u later_9_Bill.mp4
Video 'see u later_9_Bill.mp4' processed. Final predicted label: 4 (Action: see u later)



In [None]:
# 1. New detection variables
sequence = []   # sliding window holding the most recent keypoint vectors
sentence = []   # recently recognised actions, shown in the top banner
threshold = 0.8  # minimum probability for an action to be added to the sentence

cap = cv2.VideoCapture(0)
try:
    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():

            # Read feed; stop when no frame is delivered instead of passing
            # None on to cvtColor.
            ret, frame = cap.read()
            if not ret:
                break

            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks
            draw_styled_landmarks(image, results)

            # 2. Prediction logic: keep only the latest 105 frames
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-105:]

            if len(sequence) == 105:
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                best = int(np.argmax(res))
                print(actions[best])

                # 3. Viz logic: append a confident prediction unless it repeats the last word
                if res[best] > threshold:
                    if not sentence or actions[best] != sentence[-1]:
                        sentence.append(actions[best])

                # Show at most the five most recent words
                if len(sentence) > 5:
                    sentence = sentence[-5:]

                # Viz probabilities
                image = prob_viz(res, actions, image, colors)

            cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3,30),
                           cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Show to screen
            cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()

NameError: name 'cv2' is not defined

In [None]:
import numpy as np
import cv2
import mediapipe as mp

# Initialise state
sequence = []     # sliding window of keypoint vectors
sentence = []     # recently recognised actions, shown in the top banner
threshold = 0.8   # minimum probability for an action to be added to the sentence
window_size = 105  # sliding-window length in frames

cap = cv2.VideoCapture(0)

try:
    # Set up the MediaPipe model
    with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            # Read a frame; stop when none is delivered
            ret, frame = cap.read()
            if not ret:
                break

            # Run MediaPipe detection
            image, results = mediapipe_detection(frame, holistic)

            # Draw the landmarks
            draw_styled_landmarks(image, results)

            # Extract keypoints and update the sliding window
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-window_size:]  # keep only the latest window_size frames

            # Predict once the window is full
            if len(sequence) == window_size:
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                best = int(np.argmax(res))
                max_action = actions[best]
                print("Detected Action:", max_action)

                # Show the probability of every class on the image
                for idx, prob in enumerate(res):
                    text = f"{actions[idx]}: {prob:.2f}"
                    cv2.putText(image, text, (10, 30 + idx * 30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2, cv2.LINE_AA)

                # Append a confident prediction unless it repeats the last word
                if res[best] > threshold:
                    if not sentence or max_action != sentence[-1]:
                        sentence.append(max_action)

                # Show at most the five most recent words
                if len(sentence) > 5:
                    sentence = sentence[-5:]

                # Probability bars
                image = prob_viz(res, actions, image, colors)

            # Draw the sentence banner
            cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Show the frame
            cv2.imshow('OpenCV Feed', image)

            # Quit on 'q'
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()
