In [None]:
from ultralytics import YOLO
import cv2
import math 
# Open the default webcam (device 0) and request a capture size.
# NOTE(review): both width and height are requested as 1280 -- confirm the
# height is intentional.
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1280)

# Detection model loaded from the local weights file.
model = YOLO("yolo-Weights/yolov8n.pt")

# Label lookup: the predicted class index is used directly as an index here.
classNames = [
    "person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck",
    "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
    "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
    "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
    "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
    "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
    "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa",
    "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
    "hair drier", "toothbrush",
]


# Main loop: grab a frame, run the detector, draw every detection, and show
# the annotated frame until 'q' is pressed or the camera stops delivering.
try:
    while True:
        success, img = cap.read()
        if not success or img is None:
            # No frame was delivered (camera missing, unplugged, or stream
            # ended). Stop here instead of handing None to the model.
            break

        results = model(img, stream=True)

        for r in results:
            for box in r.boxes:
                # Bounding box corners as integer pixel coordinates.
                x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])
                cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 3)

                # Confidence rounded up to two decimals for logging.
                confidence = math.ceil((box.conf[0] * 100)) / 100
                print("Confidence --->", confidence)

                # Predicted class index -> label.
                cls = int(box.cls[0])
                print("Class name -->", classNames[cls])

                # Caption anchored at the box's top-left corner.
                cv2.putText(img, classNames[cls], (x1, y1),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)

        cv2.imshow('Webcam', img)
        # Mask to the low byte so the key comparison matches the other cells.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even after an exception
    # or a kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolo-Weights/yolov8n.pt'...


100%|██████████████████████████████████████| 6.25M/6.25M [00:00<00:00, 11.5MB/s]



0: 384x640 1 person, 47.3ms
Confidence ---> 0.92
Class name --> person
Speed: 1.7ms preprocess, 47.3ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 101.8ms
Confidence ---> 0.92
Class name --> person
Speed: 19.7ms preprocess, 101.8ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 59.5ms
Confidence ---> 0.92
Class name --> person
Speed: 1.4ms preprocess, 59.5ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 49.2ms
Confidence ---> 0.92
Class name --> person
Speed: 1.3ms preprocess, 49.2ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 51.8ms
Confidence ---> 0.92
Class name --> person
Speed: 1.3ms preprocess, 51.8ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 50.7ms
Confidence ---> 0.92
Class name --> person
Speed: 1.2ms preprocess, 50.7ms inference, 0.4ms postprocess per image at shape 


0: 480x640 (no detections), 50.2ms
Speed: 2.0ms preprocess, 50.2ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 53.3ms
Speed: 2.0ms preprocess, 53.3ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 57.5ms
Speed: 2.0ms preprocess, 57.5ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 58.9ms
Speed: 2.0ms preprocess, 58.9ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 57.9ms
Speed: 2.3ms preprocess, 57.9ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 56.5ms
Speed: 2.1ms preprocess, 56.5ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 57.1ms
Speed: 2.1ms preprocess, 57.1ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 56.7ms
Speed: 2.1ms preprocess, 56.7ms i


0: 480x640 (no detections), 69.1ms
Speed: 1.9ms preprocess, 69.1ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 59.8ms
Speed: 2.1ms preprocess, 59.8ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 55.7ms
Speed: 2.2ms preprocess, 55.7ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 56.5ms
Speed: 1.9ms preprocess, 56.5ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 57.2ms
Speed: 2.3ms preprocess, 57.2ms inference, 0.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 55.4ms
Speed: 2.1ms preprocess, 55.4ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 51.9ms
Speed: 1.7ms preprocess, 51.9ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 60.6ms
Speed: 2.1ms preprocess, 60.6ms i


0: 480x640 (no detections), 59.1ms
Speed: 2.5ms preprocess, 59.1ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 59.4ms
Speed: 2.3ms preprocess, 59.4ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 62.0ms
Speed: 2.6ms preprocess, 62.0ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 49.3ms
Speed: 1.8ms preprocess, 49.3ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 49.0ms
Speed: 2.3ms preprocess, 49.0ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 53.4ms
Speed: 1.8ms preprocess, 53.4ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 59.7ms
Speed: 2.3ms preprocess, 59.7ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 59.7ms
Speed: 1.8ms preprocess, 59.7ms i


0: 480x640 (no detections), 58.6ms
Speed: 1.8ms preprocess, 58.6ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 55.4ms
Speed: 1.7ms preprocess, 55.4ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 57.2ms
Speed: 2.3ms preprocess, 57.2ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 54.4ms
Speed: 1.8ms preprocess, 54.4ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 51.4ms
Speed: 2.0ms preprocess, 51.4ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 56.0ms
Speed: 1.8ms preprocess, 56.0ms inference, 0.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 52.8ms
Speed: 1.7ms preprocess, 52.8ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 55.6ms
Speed: 1.9ms preprocess, 55.6ms i


0: 480x640 (no detections), 54.2ms
Speed: 2.5ms preprocess, 54.2ms inference, 0.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 53.0ms
Speed: 1.9ms preprocess, 53.0ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 49.7ms
Speed: 1.9ms preprocess, 49.7ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 54.8ms
Speed: 2.4ms preprocess, 54.8ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 58.8ms
Speed: 2.1ms preprocess, 58.8ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 59.1ms
Speed: 2.1ms preprocess, 59.1ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 50.8ms
Speed: 2.0ms preprocess, 50.8ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 55.0ms
Speed: 1.8ms preprocess, 55.0ms i


0: 480x640 (no detections), 51.7ms
Speed: 1.8ms preprocess, 51.7ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 53.4ms
Speed: 1.9ms preprocess, 53.4ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 52.9ms
Speed: 1.7ms preprocess, 52.9ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 56.9ms
Speed: 1.8ms preprocess, 56.9ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 55.3ms
Speed: 1.9ms preprocess, 55.3ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 61.2ms
Speed: 1.7ms preprocess, 61.2ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 53.1ms
Speed: 2.0ms preprocess, 53.1ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 50.7ms
Speed: 2.3ms preprocess, 50.7ms i


0: 480x640 (no detections), 61.1ms
Speed: 2.1ms preprocess, 61.1ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 64.5ms
Speed: 2.0ms preprocess, 64.5ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 65.4ms
Speed: 2.7ms preprocess, 65.4ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 55.4ms
Speed: 2.3ms preprocess, 55.4ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 54.7ms
Speed: 1.8ms preprocess, 54.7ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 53.5ms
Speed: 1.9ms preprocess, 53.5ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 61.0ms
Speed: 1.9ms preprocess, 61.0ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 55.0ms
Speed: 1.8ms preprocess, 55.0ms i


0: 480x640 (no detections), 54.5ms
Speed: 1.7ms preprocess, 54.5ms inference, 0.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 55.9ms
Speed: 1.8ms preprocess, 55.9ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 58.2ms
Speed: 1.9ms preprocess, 58.2ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 55.0ms
Speed: 2.2ms preprocess, 55.0ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 61.8ms
Speed: 2.0ms preprocess, 61.8ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 57.5ms
Speed: 1.8ms preprocess, 57.5ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 62.5ms
Speed: 1.9ms preprocess, 62.5ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 59.0ms
Speed: 2.0ms preprocess, 59.0ms i

In [None]:
import torch
import cv2
import torchvision.transforms as transforms
from PIL import Image as pil
import numpy as np
from collections import deque
from ultralytics import YOLO
import mediapipe as mp
from monodepth2 import monodepth2

# Initialise the MonoDepth2 model wrapper
md = monodepth2()
encoder = md.encoder  # encoder network taken from the wrapper
depth_decoder = md.depth_decoder  # depth decoder network taken from the wrapper

# Mediapipe Pose (constructed with default options)
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()

# Load the YOLOv8 pretrained models
# NOTE(review): the dispenser weights use an absolute, machine-specific path.
model_dispenser = YOLO('/Users/yangzhelun/Downloads/best.pt')  # alcohol-dispenser model
model_people = YOLO("yolov8n.pt")  # official model, used for people

# Webcam device 0
cap = cv2.VideoCapture(0)

# Fixed-length queues holding recent depth readings for the moving average
depth_window_size = 10
person_depths = deque(maxlen=depth_window_size)
dispenser_depths = deque(maxlen=depth_window_size)

def preprocess_image(frame, size=(640, 192)):
    """Convert a BGR frame into a batched tensor for the depth encoder.

    Args:
        frame: Image array in OpenCV BGR channel order.
        size: Target ``(width, height)`` handed to PIL's ``resize``. Defaults
            to ``(640, 192)``, the value that used to be hard-coded, so
            existing single-argument calls behave as before.

    Returns:
        The output of ``transforms.ToTensor()`` applied to the resized RGB
        image, with a leading batch dimension added by ``unsqueeze(0)``.
    """
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    resized = pil.fromarray(rgb_frame).resize(size, pil.Resampling.LANCZOS)
    return transforms.ToTensor()(resized).unsqueeze(0)

def resize_bounding_box(box, original_size, new_size):
    """Map a bounding box from one image resolution onto another.

    Args:
        box: Four coordinates ``(x1, y1, x2, y2)``; each is truncated with
            ``int()`` before scaling.
        original_size: ``(width, height)`` the box currently refers to.
        new_size: ``(width, height)`` of the target image.

    Returns:
        ``[x1, y1, x2, y2]`` as ints, scaled independently per axis and
        truncated toward zero.
    """
    left, top, right, bottom = (int(coord) for coord in box)
    src_w, src_h = original_size
    dst_w, dst_h = new_size
    ratio_x = dst_w / src_w
    ratio_y = dst_h / src_h
    scaled = (left * ratio_x, top * ratio_y, right * ratio_x, bottom * ratio_y)
    return [int(value) for value in scaled]

def draw_box(frame, box, label, confidence, color):
    """Draw one detection on ``frame`` (in place) and return its pixel box.

    Args:
        frame: Image array to draw on; modified in place by the OpenCV calls.
        box: Detection object whose ``box.xyxy[0]`` supplies
            ``(x1, y1, x2, y2)``.
        label: Text shown before the confidence value.
        confidence: Score rendered with two decimals.
        color: Colour used for both the rectangle and the caption.

    Returns:
        ``[x1, y1, x2, y2]`` as ints.
    """
    x1, y1, x2, y2 = map(int, box.xyxy[0])
    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

    # The caption normally sits 10 px above the box. For a box that starts
    # near the top of the frame that baseline would leave the image and the
    # caption would be clipped or invisible, so place it just inside the box.
    caption_y = y1 - 10 if y1 - 10 >= 20 else y1 + 25
    cv2.putText(frame, f'{label}: {confidence:.2f}', (x1, caption_y),
                cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)
    return [x1, y1, x2, y2]

def calculate_average_depth(depth_map, bounding_box):
    """Mean value of ``depth_map`` inside a bounding box.

    Args:
        depth_map: 2-D array indexed as ``[row, column]``.
        bounding_box: ``[x1, y1, x2, y2]`` in depth-map pixel coordinates,
            or ``None`` when nothing was detected.

    Returns:
        The mean of the covered region as a float, or ``None`` when
        ``bounding_box`` is ``None`` or the box covers no pixel of the map
        (zero width/height after rescaling, or entirely outside the map).
    """
    if bounding_box is None:
        return None

    x1, y1, x2, y2 = bounding_box
    map_h, map_w = depth_map.shape[:2]

    # Clamp to the map: negative indices would otherwise wrap around to the
    # opposite edge when slicing.
    x1, x2 = max(0, x1), min(map_w, x2)
    y1, y2 = max(0, y1), min(map_h, y2)

    # An empty slice makes np.mean return NaN (with a RuntimeWarning); report
    # "no reading" instead so callers' ``is not None`` checks filter it out.
    if x2 <= x1 or y2 <= y1:
        return None

    return float(np.mean(depth_map[y1:y2, x1:x2]))

def calculate_moving_average(depth_values):
    """Arithmetic mean of the collected depth readings.

    Args:
        depth_values: Sized iterable of numbers (the notebook passes a deque).

    Returns:
        ``sum / count``, or ``None`` when there are no readings yet.
    """
    count = len(depth_values)
    return sum(depth_values) / count if count != 0 else None

def is_person_near_dispenser(person_depth, dispenser_depth, threshold=3):
    """Decide whether two depth readings are within ``threshold`` of each other.

    Args:
        person_depth: Depth reading for the person, or ``None``.
        dispenser_depth: Depth reading for the dispenser, or ``None``.
        threshold: Exclusive upper bound on the absolute difference.

    Returns:
        ``False`` when either reading is missing; otherwise whether the
        absolute difference is strictly below ``threshold``.
    """
    if person_depth is not None and dispenser_depth is not None:
        gap = abs(person_depth - dispenser_depth)
        return gap < threshold
    # At least one reading is missing, so no comparison is possible.
    return False

# Main loop: per frame, detect people and the dispenser with YOLO, estimate a
# relative depth map with MonoDepth2, smooth the readings with a moving
# average, and annotate the frame.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        height, width, _ = frame.shape  # frame size in pixels

        # Run both detectors on the same frame.
        results_dispenser = model_dispenser(frame)
        results_people = model_people(frame)

        dispenser_box = None
        person_boxes = []  # one entry per accepted person detection

        # Keep only detections carrying the expected label and exceeding that
        # model's confidence threshold.
        detection_passes = [
            (results_people, (0, 0, 255), 'person', 0.8),
            (results_dispenser, (0, 255, 0), 'dispenser', 0.3),
        ]
        for result, color, label_name, conf_threshold in detection_passes:
            for r in result:
                for box in r.boxes:
                    label = r.names[int(box.cls)]
                    confidence = box.conf[0]
                    if label == label_name and confidence > conf_threshold:
                        if label_name == 'person':
                            person_boxes.append(draw_box(frame, box, label, confidence, color))
                        elif label_name == 'dispenser':
                            dispenser_box = draw_box(frame, box, label, confidence, color)

        # Mediapipe pose estimation on the frame.
        pose_results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Depth estimation, forced onto the CPU.
        input_image = preprocess_image(frame).to(torch.device('cpu'))
        with torch.no_grad():
            features = encoder(input_image)
            outputs = depth_decoder(features)

        # Disparity map -> NumPy, normalised to 0..255 and stored as uint8.
        depth_map = outputs[("disp", 0)].squeeze().cpu().numpy()
        depth_map = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX)
        depth_map = np.uint8(depth_map)

        # Keep the dispenser box in frame pixels for the nose test below; the
        # rescaled copy is only valid for indexing the 640x192 depth map.
        dispenser_box_frame = dispenser_box
        if dispenser_box is not None:
            dispenser_box = resize_bounding_box(dispenser_box, (width, height), (640, 192))

        # Dispenser depth feeds its moving-average queue.
        dispenser_depth = calculate_average_depth(depth_map, dispenser_box)
        if dispenser_depth is not None:
            dispenser_depths.append(dispenser_depth)
        avg_dispenser_depth = calculate_moving_average(dispenser_depths)

        # Overlay state for this frame.
        status_text = "No action detected"
        depth_text = ""
        measuring_temperature = False

        # Handle every detected person.
        for person_box in person_boxes:
            person_box = resize_bounding_box(person_box, (width, height), (640, 192))
            person_depth = calculate_average_depth(depth_map, person_box)
            if person_depth is not None:
                person_depths.append(person_depth)
            avg_person_depth = calculate_moving_average(person_depths)

            # Depth read-out; a missing value is shown as N/A.
            person_str = f"{avg_person_depth:.2f}" if avg_person_depth is not None else "N/A"
            dispenser_str = f"{avg_dispenser_depth:.2f}" if avg_dispenser_depth is not None else "N/A"
            depth_text = f"Person Depth: {person_str}, Dispenser Depth: {dispenser_str}"

            # Is this person at a similar depth to the dispenser?
            if is_person_near_dispenser(avg_person_depth, avg_dispenser_depth):
                status_text = "Person near the dispenser"

                # Forehead-temperature check: is the nose inside the dispenser
                # box? Both are compared in frame pixel coordinates.
                if pose_results and pose_results.pose_landmarks and dispenser_box_frame:
                    nose = pose_results.pose_landmarks.landmark[mp_pose.PoseLandmark.NOSE]
                    nose_x, nose_y = int(nose.x * width), int(nose.y * height)
                    if (dispenser_box_frame[0] < nose_x < dispenser_box_frame[2]) and \
                            (dispenser_box_frame[1] < nose_y < dispenser_box_frame[3]):
                        measuring_temperature = True

        # Draw the overlay once per frame so captions from several people do
        # not pile up on the same spot.
        cv2.putText(frame, status_text, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), 3, cv2.LINE_AA)
        cv2.putText(frame, depth_text, (50, 100), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 0), 2, cv2.LINE_AA)
        if measuring_temperature:
            cv2.putText(frame, "Measuring Forehead Temperature", (50, 150), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 0, 255), 3, cv2.LINE_AA)

        # Show the annotated frame; 'q' quits.
        cv2.imshow('YOLOv8 and Mediapipe - Dispenser Interaction Detection', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the window even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()

GPU not visible; CPU mode
-> Loading model from  /Users/yangzhelun/.monodepth2_models/mono+stereo_640x192
   Loading pretrained encoder




   Loading pretrained decoder


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.



0: 384x640 (no detections), 45.8ms
Speed: 1.9ms preprocess, 45.8ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 46.7ms
Speed: 2.0ms preprocess, 46.7ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 59.2ms
Speed: 1.7ms preprocess, 59.2ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 73.6ms
Speed: 1.5ms preprocess, 73.6ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 48.9ms
Speed: 1.4ms preprocess, 48.9ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 54.6ms
Speed: 1.4ms preprocess, 54.6ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 42.7ms
Speed: 1.3ms preprocess, 42.7ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 47.1ms
Speed: 1.5ms preprocess, 47.1ms inference, 0.3ms postprocess 

In [None]:
import torch
import cv2
import torchvision.transforms as transforms
from PIL import Image as pil
import numpy as np
from ultralytics import YOLO
import mediapipe as mp
from monodepth2 import monodepth2

# Initialise the MonoDepth2 model wrapper
md = monodepth2()
encoder = md.encoder  # encoder network taken from the wrapper
depth_decoder = md.depth_decoder  # depth decoder network taken from the wrapper

# Mediapipe Pose (constructed with default options)
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()

# Load the YOLOv8 pretrained models
# NOTE(review): the dispenser weights use an absolute, machine-specific path.
model_dispenser = YOLO('/Users/yangzhelun/Downloads/best.pt')  # alcohol-dispenser model
model_people = YOLO("yolov8n.pt")  # official model, used for people

# Webcam device 0
cap = cv2.VideoCapture(0)

def preprocess_image(frame, size=(1024, 384)):
    """Convert a BGR frame into a batched tensor for the depth encoder.

    Args:
        frame: Image array in OpenCV BGR channel order.
        size: Target ``(width, height)`` handed to PIL's ``resize``. Defaults
            to ``(1024, 384)``, the value that used to be hard-coded, so
            existing single-argument calls behave as before.

    Returns:
        The output of ``transforms.ToTensor()`` applied to the resized RGB
        image, with a leading batch dimension added by ``unsqueeze(0)``.
    """
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    resized = pil.fromarray(rgb_frame).resize(size, pil.Resampling.LANCZOS)
    return transforms.ToTensor()(resized).unsqueeze(0)

def resize_bounding_box(box, original_size, new_size):
    """Rescale a bounding box so it lines up with a resized image.

    Args:
        box: Four coordinates ``(x1, y1, x2, y2)``; each is truncated with
            ``int()`` before scaling.
        original_size: ``(width, height)`` the box currently refers to.
        new_size: ``(width, height)`` of the target image.

    Returns:
        ``[x1, y1, x2, y2]`` as ints, scaled independently per axis and
        truncated toward zero.
    """
    x_min, y_min, x_max, y_max = (int(c) for c in box)
    from_w, from_h = original_size
    to_w, to_h = new_size
    factor_x = to_w / from_w
    factor_y = to_h / from_h
    return [
        int(x_min * factor_x),
        int(y_min * factor_y),
        int(x_max * factor_x),
        int(y_max * factor_y),
    ]

def draw_box(frame, box, label, confidence, color):
    """Draw one detection on ``frame`` (in place) and return its pixel box.

    Args:
        frame: Image array to draw on; modified in place by the OpenCV calls.
        box: Detection object whose ``box.xyxy[0]`` supplies
            ``(x1, y1, x2, y2)``.
        label: Text shown before the confidence value.
        confidence: Score rendered with two decimals.
        color: Colour used for both the rectangle and the caption.

    Returns:
        ``[x1, y1, x2, y2]`` as ints.
    """
    x1, y1, x2, y2 = map(int, box.xyxy[0])
    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

    # The caption normally sits 10 px above the box. For a box that starts
    # near the top of the frame that baseline would leave the image and the
    # caption would be clipped or invisible, so place it just inside the box.
    caption_y = y1 - 10 if y1 - 10 >= 20 else y1 + 25
    cv2.putText(frame, f'{label}: {confidence:.2f}', (x1, caption_y),
                cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)
    return [x1, y1, x2, y2]

def calculate_average_depth(depth_map, bounding_box):
    """Mean value of ``depth_map`` inside a bounding box.

    Args:
        depth_map: 2-D array indexed as ``[row, column]``.
        bounding_box: ``[x1, y1, x2, y2]`` in depth-map pixel coordinates,
            or ``None`` when nothing was detected.

    Returns:
        The mean of the covered region as a float, or ``None`` when
        ``bounding_box`` is ``None`` or the box covers no pixel of the map
        (zero width/height after rescaling, or entirely outside the map).
    """
    if bounding_box is None:
        return None

    x1, y1, x2, y2 = bounding_box
    map_h, map_w = depth_map.shape[:2]

    # Clamp to the map: negative indices would otherwise wrap around to the
    # opposite edge when slicing.
    x1, x2 = max(0, x1), min(map_w, x2)
    y1, y2 = max(0, y1), min(map_h, y2)

    # An empty slice makes np.mean return NaN (with a RuntimeWarning); report
    # "no reading" instead so callers' ``is not None`` checks filter it out.
    if x2 <= x1 or y2 <= y1:
        return None

    return float(np.mean(depth_map[y1:y2, x1:x2]))

def is_person_near_dispenser(person_depth, dispenser_depth, threshold=10):
    """Decide whether two depth readings are within ``threshold`` of each other.

    Args:
        person_depth: Depth reading for the person, or ``None``.
        dispenser_depth: Depth reading for the dispenser, or ``None``.
        threshold: Exclusive upper bound on the absolute difference.

    Returns:
        ``False`` when either reading is missing; otherwise whether the
        absolute difference is strictly below ``threshold``.
    """
    if person_depth is not None and dispenser_depth is not None:
        gap = abs(person_depth - dispenser_depth)
        return gap < threshold
    # At least one reading is missing, so no comparison is possible.
    return False

# Main loop: per frame, detect people and the dispenser with YOLO, estimate a
# relative depth map with MonoDepth2, and annotate the frame.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        height, width, _ = frame.shape  # frame size in pixels

        # Run both detectors on the same frame.
        results_dispenser = model_dispenser(frame)
        results_people = model_people(frame)

        dispenser_box = None
        person_boxes = []  # one entry per accepted person detection

        # Keep only detections carrying the expected label and exceeding that
        # model's confidence threshold.
        detection_passes = [
            (results_people, (0, 0, 255), 'person', 0.8),
            (results_dispenser, (0, 255, 0), 'dispenser', 0.3),
        ]
        for result, color, label_name, conf_threshold in detection_passes:
            for r in result:
                for box in r.boxes:
                    label = r.names[int(box.cls)]
                    confidence = box.conf[0]
                    if label == label_name and confidence > conf_threshold:
                        if label_name == 'person':
                            person_boxes.append(draw_box(frame, box, label, confidence, color))
                        elif label_name == 'dispenser':
                            dispenser_box = draw_box(frame, box, label, confidence, color)

        # Depth estimation, forced onto the CPU.
        input_image = preprocess_image(frame).to(torch.device('cpu'))
        with torch.no_grad():
            features = encoder(input_image)
            outputs = depth_decoder(features)

        # Disparity map -> NumPy, normalised to 0..255 and stored as uint8.
        depth_map = outputs[("disp", 0)].squeeze().cpu().numpy()
        depth_map = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX)
        depth_map = np.uint8(depth_map)

        # Keep the dispenser box in frame pixels for the nose test below; the
        # rescaled copy is only valid for indexing the 1024x384 depth map.
        dispenser_box_frame = dispenser_box
        if dispenser_box is not None:
            dispenser_box = resize_bounding_box(dispenser_box, (width, height), (1024, 384))

        dispenser_depth = calculate_average_depth(depth_map, dispenser_box)

        # Overlay state for this frame.
        status_text = "No action detected"
        depth_text = ""
        measuring_temperature = False
        pose_results = None  # filled lazily, at most once per frame

        # Handle every detected person.
        for person_box in person_boxes:
            person_box = resize_bounding_box(person_box, (width, height), (1024, 384))
            person_depth = calculate_average_depth(depth_map, person_box)

            # Depth read-out; a missing value is shown as N/A.
            person_str = f"{person_depth:.2f}" if person_depth is not None else "N/A"
            dispenser_str = f"{dispenser_depth:.2f}" if dispenser_depth is not None else "N/A"
            depth_text = f"Person Depth: {person_str}, Dispenser Depth: {dispenser_str}"

            # Is this person at a similar depth to the dispenser?
            if is_person_near_dispenser(person_depth, dispenser_depth):
                status_text = "Person near the dispenser"

                # Run pose estimation only when needed and reuse the result
                # for any further nearby person in the same frame.
                if pose_results is None:
                    pose_results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

                # Forehead-temperature check: is the nose inside the dispenser
                # box? Both are compared in frame pixel coordinates.
                if pose_results.pose_landmarks and dispenser_box_frame:
                    nose = pose_results.pose_landmarks.landmark[mp_pose.PoseLandmark.NOSE]
                    nose_x, nose_y = int(nose.x * width), int(nose.y * height)
                    if (dispenser_box_frame[0] < nose_x < dispenser_box_frame[2]) and \
                            (dispenser_box_frame[1] < nose_y < dispenser_box_frame[3]):
                        measuring_temperature = True

        # Draw the overlay once per frame so captions from several people do
        # not pile up on the same spot.
        cv2.putText(frame, status_text, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), 3, cv2.LINE_AA)
        cv2.putText(frame, depth_text, (50, 100), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 0), 2, cv2.LINE_AA)
        if measuring_temperature:
            cv2.putText(frame, "Measuring Forehead Temperature", (50, 150), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 0, 255), 3, cv2.LINE_AA)

        # Show the annotated frame; 'q' quits.
        cv2.imshow('YOLOv8 and Mediapipe - Dispenser Interaction Detection', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the window even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()

In [None]:
import torch
import cv2
import mediapipe as mp
import math
from ultralytics import YOLO

# Initialise Mediapipe Pose and Hands (both constructed with default options)
mp_pose = mp.solutions.pose
mp_hands = mp.solutions.hands
pose = mp_pose.Pose()
hands = mp_hands.Hands()

# Load the YOLOv8 pretrained models
# NOTE(review): the dispenser weights use an absolute, machine-specific path.
model_dispenser = YOLO('/Users/yangzhelun/Downloads/best.pt')  # alcohol-dispenser model
model_people = YOLO("yolov8n.pt")  # official model, used for people

# Capture device index 1 (the other cells open index 0)
cap = cv2.VideoCapture(1)

# Counter initialisation
disinfection_counter = 0

def draw_box(frame, box, label, confidence, color):
    """Draw one detection on ``frame`` (in place) and return its pixel box.

    Args:
        frame: Image array to draw on; modified in place by the OpenCV calls.
        box: Detection object whose ``box.xyxy[0]`` supplies
            ``(x1, y1, x2, y2)``.
        label: Text shown before the confidence value.
        confidence: Score rendered with two decimals.
        color: Colour used for both the rectangle and the caption.

    Returns:
        ``[x1, y1, x2, y2]`` as ints.
    """
    x1, y1, x2, y2 = map(int, box.xyxy[0])
    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

    # The caption normally sits 10 px above the box. For a box that starts
    # near the top of the frame that baseline would leave the image and the
    # caption would be clipped or invisible, so place it just inside the box.
    caption_y = y1 - 10 if y1 - 10 >= 20 else y1 + 25
    cv2.putText(frame, f'{label}: {confidence:.2f}', (x1, caption_y),
                cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)
    return [x1, y1, x2, y2]

def create_head_bounding_box(landmarks, width, height, scale_factor=5):
    """Build a head bounding box from pose landmarks and enlarge it.

    Args:
        landmarks: Sequence indexable by ``mp_pose.PoseLandmark`` members;
            each entry exposes ``.x`` and ``.y``.
            (presumably normalised to 0..1, since they are multiplied by the
            frame size -- TODO confirm)
        width: Frame width in pixels; also the upper clamp for x.
        height: Frame height in pixels; also the upper clamp for y.
        scale_factor: Multiplier applied to the box width and height about
            its centre. Defaults to 5.

    Returns:
        ``[x1, y1, x2, y2]`` as ints, clamped to ``[0, width]`` x ``[0, height]``.
    """
    nose = landmarks[mp_pose.PoseLandmark.NOSE]
    left_ear = landmarks[mp_pose.PoseLandmark.LEFT_EAR]
    right_ear = landmarks[mp_pose.PoseLandmark.RIGHT_EAR]
    mouth_left = landmarks[mp_pose.PoseLandmark.MOUTH_LEFT]
    mouth_right = landmarks[mp_pose.PoseLandmark.MOUTH_RIGHT]
    
    # Left/right bounds of the head in pixels.
    # NOTE(review): this pairs left_ear with min() and right_ear with max();
    # confirm that matches the camera's mirroring.
    x1 = min(left_ear.x, nose.x) * width
    x2 = max(right_ear.x, nose.x) * width
    
    # Top/bottom bounds in pixels, taken from the nose and both mouth corners.
    y1 = min(nose.y, mouth_left.y, mouth_right.y) * height
    y2 = max(nose.y, mouth_left.y, mouth_right.y) * height
    
    # Adjust the box width relative to the head height: each side moves by
    # half the difference, so (before clamping) the width becomes
    # 0.6 * head_height around the same horizontal centre.
    head_height = y2 - y1
    box_width = x2 - x1
    expanded_box_width = head_height * 0.6
    x1 = max(0, x1 - (expanded_box_width - box_width) / 2)
    x2 = min(width, x2 + (expanded_box_width - box_width) / 2)
    
    # Enlarge the box about its centre by scale_factor (default 5).
    center_x = (x1 + x2) / 2
    center_y = (y1 + y2) / 2
    new_width = (x2 - x1) * scale_factor
    new_height = (y2 - y1) * scale_factor

    # Enlarged box, truncated to ints and clamped to the frame.
    x1 = max(0, int(center_x - new_width / 2))
    x2 = min(width, int(center_x + new_width / 2))
    y1 = max(0, int(center_y - new_height / 2))
    y2 = min(height, int(center_y + new_height / 2))

    return [int(x1), int(y1), int(x2), int(y2)]

def calculate_distance(point1, point2):
    """Euclidean distance between two 2-D points.

    Args:
        point1: Indexable with x at ``[0]`` and y at ``[1]``.
        point2: Indexable with x at ``[0]`` and y at ``[1]``.

    Returns:
        The straight-line distance as a float.
    """
    delta_x = point1[0] - point2[0]
    delta_y = point1[1] - point2[1]
    return math.sqrt(delta_x ** 2 + delta_y ** 2)

def is_head_near_dispenser(head_box, dispenser_box, distance_threshold=80):
    """Check whether the centres of two boxes are closer than a threshold.

    Args:
        head_box: ``[x1, y1, x2, y2]`` of the head.
        dispenser_box: ``[x1, y1, x2, y2]`` of the dispenser.
        distance_threshold: Exclusive upper bound on the centre distance.

    Returns:
        True when the centre-to-centre distance is below the threshold.
    """
    def _center(rect):
        # Integer midpoint of a box (floor division on each axis).
        return ((rect[0] + rect[2]) // 2, (rect[1] + rect[3]) // 2)

    gap = calculate_distance(_center(head_box), _center(dispenser_box))
    return gap < distance_threshold


def is_hand_near_dispenser(hand_box, dispenser_box, tolerance=50, below_range=100):
    """Decide whether a hand is positioned just below the dispenser.

    Only the hand box's top-left corner is tested.

    Args:
        hand_box: ``[x1, y1, x2, y2]`` of the hand.
        dispenser_box: ``[x1, y1, x2, y2]`` of the dispenser.
        tolerance: Horizontal slack added on both sides of the dispenser.
        below_range: How far below the dispenser's bottom edge the hand's top
            edge may be. Defaults to 100, the previously hard-coded value.

    Returns:
        True when the corner lies strictly inside the horizontal band
        ``(dispenser_x1 - tolerance, dispenser_x2 + tolerance)`` and strictly
        inside the vertical band ``(dispenser_y2, dispenser_y2 + below_range)``.
    """
    hand_x1, hand_y1, _, _ = hand_box
    dispenser_x1, _, dispenser_x2, dispenser_y2 = dispenser_box

    horizontally_aligned = dispenser_x1 - tolerance < hand_x1 < dispenser_x2 + tolerance
    # The hand must sit underneath the dispenser, not merely beside it.
    just_below = dispenser_y2 < hand_y1 < dispenser_y2 + below_range
    return horizontally_aligned and just_below

# Main loop: detect people and the dispenser with YOLO, estimate the head box
# with MediaPipe Pose, and look for a hand under the dispenser with MediaPipe
# Hands. The capture and window are released even if the loop raises.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        height, width, _ = frame.shape

        # Convert once, before any overlay is drawn, so that MediaPipe sees the
        # clean camera image rather than our rectangles and labels.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Detect the dispenser with the custom model.
        results_dispenser = model_dispenser(frame)

        # Detect people with the people model.
        results_people = model_people(frame)

        dispenser_box = None
        person_boxes = []

        # Filter the YOLO results by label and per-label confidence threshold,
        # drawing every accepted box.
        for result, color, label_name, conf_threshold in [(results_people, (0, 0, 255), 'person', 0.8), (results_dispenser, (0, 255, 0), 'dispenser', 0.3)]:
            for r in result:
                for box in r.boxes:
                    label = r.names[int(box.cls)]
                    confidence = box.conf[0]
                    if label == label_name and confidence > conf_threshold:
                        if label_name == 'person':
                            person_box = draw_box(frame, box, label, confidence, color)
                            person_boxes.append(person_box)
                        elif label_name == 'dispenser':
                            # The last accepted dispenser detection wins.
                            dispenser_box = draw_box(frame, box, label, confidence, color)

        # Pose estimation yields one set of landmarks per frame, so the head
        # box is evaluated once per frame (gated on at least one person being
        # detected) instead of once per person box, which repeated identical
        # work and multiplied the counter by the number of people.
        pose_results = pose.process(rgb_frame)
        if pose_results.pose_landmarks and person_boxes:
            landmarks = pose_results.pose_landmarks.landmark
            head_box = create_head_bounding_box(landmarks, width, height)
            # Draw the head bounding box.
            cv2.rectangle(frame, (head_box[0], head_box[1]), (head_box[2], head_box[3]), (255, 0, 0), 2)

            # Only look for hands once the head is close to the dispenser.
            if dispenser_box and is_head_near_dispenser(head_box, dispenser_box):
                cv2.putText(frame, "Person Near Dispenser", (50, 150), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), 3, cv2.LINE_AA)

                hand_results = hands.process(rgb_frame)
                if hand_results.multi_hand_landmarks:
                    for hand_landmarks in hand_results.multi_hand_landmarks:
                        # Hand bounding box from the extreme landmark coordinates.
                        xs = [lm.x for lm in hand_landmarks.landmark]
                        ys = [lm.y for lm in hand_landmarks.landmark]
                        hand_box = [int(min(xs) * width), int(min(ys) * height), int(max(xs) * width), int(max(ys) * height)]

                        # Draw the hand bounding box.
                        cv2.rectangle(frame, (hand_box[0], hand_box[1]), (hand_box[2], hand_box[3]), (0, 255, 255), 2)

                        if is_hand_near_dispenser(hand_box, dispenser_box):
                            cv2.putText(frame, "Disinfection Action Detected", (50, 200), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (255, 0, 0), 3, cv2.LINE_AA)
                            # NOTE(review): this still counts once per hand per
                            # frame while the action is visible, not once per
                            # disinfection event.
                            disinfection_counter += 1

        # Disinfection count overlay for the bottom-right corner (disabled).
        # cv2.putText(frame, f"Disinfection Count: {disinfection_counter}", (width - 350, height - 30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2, cv2.LINE_AA)

        # Show the annotated frame; press 'q' to quit.
        cv2.imshow('Dispenser Interaction Detection', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()



0: 384x640 (no detections), 66.9ms
Speed: 4.7ms preprocess, 66.9ms inference, 6.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 7 books, 48.5ms
Speed: 1.5ms preprocess, 48.5ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 56.1ms
Speed: 1.4ms preprocess, 56.1ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 7 books, 46.7ms
Speed: 1.1ms preprocess, 46.7ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 49.5ms
Speed: 1.2ms preprocess, 49.5ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 7 books, 46.6ms
Speed: 1.5ms preprocess, 46.6ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 43.5ms
Speed: 1.6ms preprocess, 43.5ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 7 books, 50.0ms
Speed: 1.4ms preprocess, 

Speed: 1.4ms preprocess, 43.0ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 40.9ms
Speed: 1.1ms preprocess, 40.9ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 9 books, 43.2ms
Speed: 1.6ms preprocess, 43.2ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 38.1ms
Speed: 1.3ms preprocess, 38.1ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 9 books, 40.4ms
Speed: 1.1ms preprocess, 40.4ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 41.8ms
Speed: 1.1ms preprocess, 41.8ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 9 books, 37.1ms
Speed: 1.2ms preprocess, 37.1ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 36.9ms
Speed: 1.3ms preprocess, 36.9ms inference, 0.2ms postprocess pe

Speed: 1.2ms preprocess, 37.3ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 7 books, 36.9ms
Speed: 1.3ms preprocess, 36.9ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 37.0ms
Speed: 1.3ms preprocess, 37.0ms inference, 0.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 4 books, 40.4ms
Speed: 1.2ms preprocess, 40.4ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 39.7ms
Speed: 1.2ms preprocess, 39.7ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 3 books, 39.7ms
Speed: 1.2ms preprocess, 39.7ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 42.0ms
Speed: 1.4ms preprocess, 42.0ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 5 books, 38.2ms
Speed: 1.2ms preprocess, 38.2ms inference, 0.4ms postprocess 


0: 384x640 1 person, 1 chair, 2 books, 41.8ms
Speed: 1.3ms preprocess, 41.8ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 47.0ms
Speed: 1.4ms preprocess, 47.0ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 3 books, 38.6ms
Speed: 1.3ms preprocess, 38.6ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 44.7ms
Speed: 1.2ms preprocess, 44.7ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 3 books, 40.6ms
Speed: 1.1ms preprocess, 40.6ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 39.4ms
Speed: 1.2ms preprocess, 39.4ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 3 books, 41.2ms
Speed: 1.2ms preprocess, 41.2ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detection

Speed: 1.5ms preprocess, 38.9ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 5 books, 51.9ms
Speed: 1.4ms preprocess, 51.9ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 38.3ms
Speed: 1.1ms preprocess, 38.3ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 4 books, 40.6ms
Speed: 1.3ms preprocess, 40.6ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 39.3ms
Speed: 1.4ms preprocess, 39.3ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 5 books, 41.2ms
Speed: 1.2ms preprocess, 41.2ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 40.3ms
Speed: 1.5ms preprocess, 40.3ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 5 books, 39.7ms
Speed: 1.3ms preprocess, 


0: 384x640 1 person, 1 chair, 5 books, 43.5ms
Speed: 1.1ms preprocess, 43.5ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 38.6ms
Speed: 1.1ms preprocess, 38.6ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 1 remote, 3 books, 36.1ms
Speed: 1.2ms preprocess, 36.1ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 39.8ms
Speed: 1.2ms preprocess, 39.8ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 1 remote, 2 books, 38.9ms
Speed: 1.3ms preprocess, 38.9ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 36.8ms
Speed: 1.2ms preprocess, 36.8ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 2 remotes, 5 books, 41.5ms
Speed: 1.4ms preprocess, 41.5ms inference, 0.3ms postprocess per image at shape (1, 3, 384,


0: 384x640 1 person, 5 books, 39.5ms
Speed: 1.5ms preprocess, 39.5ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 37.5ms
Speed: 1.1ms preprocess, 37.5ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 5 books, 37.7ms
Speed: 1.2ms preprocess, 37.7ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 36.9ms
Speed: 1.2ms preprocess, 36.9ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 5 books, 37.8ms
Speed: 1.3ms preprocess, 37.8ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 37.2ms
Speed: 1.1ms preprocess, 37.2ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 4 books, 39.2ms
Speed: 1.3ms preprocess, 39.2ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 38.1ms
Speed: 1.3ms preprocess


0: 384x640 (no detections), 35.6ms
Speed: 1.1ms preprocess, 35.6ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 7 books, 43.0ms
Speed: 1.4ms preprocess, 43.0ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 39.1ms
Speed: 1.1ms preprocess, 39.1ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 8 books, 35.9ms
Speed: 1.3ms preprocess, 35.9ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 38.9ms
Speed: 1.3ms preprocess, 38.9ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 cell phone, 7 books, 41.9ms
Speed: 1.4ms preprocess, 41.9ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 40.9ms
Speed: 1.5ms preprocess, 40.9ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 8 books, 39.2ms
Speed: 1.5

Speed: 1.1ms preprocess, 40.7ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 remote, 4 books, 39.8ms
Speed: 1.5ms preprocess, 39.8ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 36.3ms
Speed: 1.4ms preprocess, 36.3ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 remote, 7 books, 44.5ms
Speed: 1.2ms preprocess, 44.5ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 41.8ms
Speed: 1.1ms preprocess, 41.8ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 remote, 4 books, 38.2ms
Speed: 1.5ms preprocess, 38.2ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 41.9ms
Speed: 1.4ms preprocess, 41.9ms inference, 0.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 4 books, 43.4ms
Speed: 1.2ms preprocess, 43.4


0: 384x640 1 person, 11 books, 42.1ms
Speed: 1.4ms preprocess, 42.1ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 39.3ms
Speed: 1.1ms preprocess, 39.3ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 8 books, 38.0ms
Speed: 1.3ms preprocess, 38.0ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 40.9ms
Speed: 1.1ms preprocess, 40.9ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 7 books, 39.1ms
Speed: 1.2ms preprocess, 39.1ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 39.9ms
Speed: 1.0ms preprocess, 39.9ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 10 books, 37.7ms
Speed: 1.3ms preprocess, 37.7ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 40.3ms
Speed: 1.5ms p

0: 384x640 1 person, 4 books, 41.1ms
Speed: 1.3ms preprocess, 41.1ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 35.7ms
Speed: 1.1ms preprocess, 35.7ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 5 books, 36.2ms
Speed: 1.3ms preprocess, 36.2ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 40.7ms
Speed: 1.3ms preprocess, 40.7ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 4 books, 39.6ms
Speed: 1.3ms preprocess, 39.6ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 36.8ms
Speed: 1.0ms preprocess, 36.8ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 5 books, 44.9ms
Speed: 1.1ms preprocess, 44.9ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 38.6ms
Speed: 1.5ms preprocess, 3


0: 384x640 (no detections), 41.6ms
Speed: 1.1ms preprocess, 41.6ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 9 books, 41.6ms
Speed: 1.4ms preprocess, 41.6ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 38.4ms
Speed: 1.3ms preprocess, 38.4ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 9 books, 41.2ms
Speed: 1.2ms preprocess, 41.2ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 41.9ms
Speed: 1.0ms preprocess, 41.9ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 8 books, 43.9ms
Speed: 1.3ms preprocess, 43.9ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 39.8ms
Speed: 1.3ms preprocess, 39.8ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 8 books, 37.8ms
Speed: 1.3ms preprocess, 

Speed: 1.6ms preprocess, 45.6ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 41.4ms
Speed: 1.2ms preprocess, 41.4ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 8 books, 83.0ms
Speed: 1.2ms preprocess, 83.0ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 47.1ms
Speed: 1.4ms preprocess, 47.1ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 8 books, 45.3ms
Speed: 1.7ms preprocess, 45.3ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 40.1ms
Speed: 1.3ms preprocess, 40.1ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 7 books, 47.8ms
Speed: 1.1ms preprocess, 47.8ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 39.7ms
Speed: 1.2ms preprocess, 39.7ms inference, 0.2ms postprocess pe


0: 384x640 1 person, 7 books, 41.2ms
Speed: 1.3ms preprocess, 41.2ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)


# 利用水壺當作酒精機器並劃分額溫區以及噴灑區

In [None]:
import cv2
import mediapipe as mp
from ultralytics import YOLO

# Initialize the YOLOv8 model (pretrained yolov8n.pt weights)
model = YOLO("yolov8n.pt")

# MediaPipe initialization: pose and hand solutions plus the drawing helper
mp_pose = mp.solutions.pose
mp_hands = mp.solutions.hands
pose = mp_pose.Pose()
hands = mp_hands.Hands()
mp_draw = mp.solutions.drawing_utils

# Open the camera (capture index 1 — presumably an external camera; confirm on the target machine)
cap = cv2.VideoCapture(1)

# Running totals updated by the main loop below
temp_check_count = 0
sanitize_count = 0

def detect_water_bottle_and_people(results, frame_width, frame_height,
                                   bottle_cls=39, person_cls=0, conf_threshold=0.5):
    """Extract normalized bottle and person boxes from YOLOv8 results.

    Args:
        results: Iterable of result objects, each exposing ``boxes``; every box
            exposes ``cls``, ``conf`` and ``xyxy`` whose first element holds the
            class id, the confidence and the four pixel coordinates.
        frame_width: Frame width in pixels, used to normalize x coordinates.
        frame_height: Frame height in pixels, used to normalize y coordinates.
        bottle_cls: Class id treated as the bottle. Defaults to 39, the
            ``bottle`` index in the 80-class COCO list used by yolov8n. The
            previous value 84 is outside that list, so no bottle was ever
            returned.
        person_cls: Class id treated as a person (0 in the same list).
        conf_threshold: Detections must have a confidence strictly above this.

    Returns:
        ``(bottle_boxes, person_boxes)``: two lists of ``(x1, y1, x2, y2)``
        tuples of plain floats, each coordinate divided by the frame size.
    """
    bottle_boxes = []
    person_boxes = []
    for result in results:
        for box in result.boxes:
            if float(box.conf[0]) <= conf_threshold:
                continue

            cls = int(box.cls[0])
            if cls == bottle_cls:
                target = bottle_boxes
            elif cls == person_cls:
                target = person_boxes
            else:
                continue

            # Convert to plain floats so callers get ordinary numbers rather
            # than framework tensors.
            x1, y1, x2, y2 = (float(v) for v in box.xyxy[0])
            target.append((x1 / frame_width, y1 / frame_height, x2 / frame_width, y2 / frame_height))
    return bottle_boxes, person_boxes

def is_head_near_bottle(head, bottle_boxes, frame_width, frame_height):
    """Report whether the head point lies in any bottle's temperature zone.

    The zone of a bottle box ``(x1, y1, x2, y2)`` spans its full width and the
    top 30% of its height. ``head`` exposes ``x`` / ``y`` in the same
    coordinate space as the boxes. ``frame_width`` and ``frame_height`` are
    accepted but not used.
    """
    def _inside_temp_zone(bottle):
        left, top, right, bottom = bottle
        zone_bottom = top + 0.3 * (bottom - top)
        return left <= head.x <= right and top <= head.y <= zone_bottom

    return any(_inside_temp_zone(bottle) for bottle in bottle_boxes)

def is_hand_near_bottle(hand_landmarks, bottle_boxes, frame_width, frame_height):
    """Report whether any hand landmark lies in a bottle's sanitizing zone.

    The zone of a bottle box ``(x1, y1, x2, y2)`` spans its full width and the
    lower 50% of its height. Every item of ``hand_landmarks.landmark`` exposes
    ``x`` / ``y`` in the same coordinate space as the boxes. ``frame_width``
    and ``frame_height`` are accepted but not used.
    """
    def _touches_zone(bottle):
        left, top, right, bottom = bottle
        zone_top = top + 0.5 * (bottom - top)
        return any(
            left <= point.x <= right and zone_top <= point.y <= bottom
            for point in hand_landmarks.landmark
        )

    return any(_touches_zone(bottle) for bottle in bottle_boxes)

# Whether each condition held on the previous frame. The counters below are
# bumped only on the False -> True transition, so they count events rather
# than the number of frames during which the condition stays true.
temp_check_active = False
sanitize_active = False

# Main loop; the capture and window are released even if the loop raises.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("無法讀取攝像頭畫面")
            break

        frame = cv2.flip(frame, 1)  # mirror the image horizontally
        h, w, _ = frame.shape

        # Convert before any overlay is drawn so MediaPipe sees the clean image.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # YOLOv8 detection
        results = model(frame)
        bottle_boxes, person_boxes = detect_water_bottle_and_people(results, w, h)

        # Draw the detections (boxes are normalized, so scale back to pixels).
        for box in bottle_boxes:
            x1, y1, x2, y2 = (int(box[0] * w), int(box[1] * h), int(box[2] * w), int(box[3] * h))
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, "Bottle", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        for box in person_boxes:
            x1, y1, x2, y2 = (int(box[0] * w), int(box[1] * h), int(box[2] * w), int(box[3] * h))
            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)
            cv2.putText(frame, "Person", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

        # Temperature check: is the nose (used as the head position) inside
        # the bottle's temperature zone?
        pose_results = pose.process(rgb_frame)
        head_in_zone = False
        if pose_results.pose_landmarks:
            landmarks = pose_results.pose_landmarks.landmark
            head = landmarks[mp_pose.PoseLandmark.NOSE.value]
            head_in_zone = bool(is_head_near_bottle(head, bottle_boxes, w, h))
        if head_in_zone:
            if not temp_check_active:
                temp_check_count += 1
            cv2.putText(frame, "Temperature Check Detected!", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        temp_check_active = head_in_zone

        # Sanitizing: is any detected hand inside the bottle's sanitizing zone?
        hand_results = hands.process(rgb_frame)
        hand_in_zone = False
        if hand_results.multi_hand_landmarks:
            for hand_landmarks in hand_results.multi_hand_landmarks:
                mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                if is_hand_near_bottle(hand_landmarks, bottle_boxes, w, h):
                    hand_in_zone = True
        if hand_in_zone:
            if not sanitize_active:
                sanitize_count += 1
            cv2.putText(frame, "Sanitize Detected!", (10, 100), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
        sanitize_active = hand_in_zone

        # Show the running totals.
        cv2.putText(frame, f"Temperature Checks: {temp_check_count}", (10, 150), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(frame, f"Sanitizations: {sanitize_count}", (10, 200), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)

        # Show the annotated frame; press 'q' to quit.
        cv2.imshow("YOLOv8 + Mediapipe Integration", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()



INFO: Created TensorFlow Lite XNNPACK delegate for CPU.



0: 384x640 1 person, 9 books, 48.9ms
Speed: 1.8ms preprocess, 48.9ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 9 books, 55.1ms
Speed: 1.8ms preprocess, 55.1ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 9 books, 56.7ms
Speed: 2.3ms preprocess, 56.7ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 9 books, 69.6ms
Speed: 7.9ms preprocess, 69.6ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 9 books, 59.3ms
Speed: 1.6ms preprocess, 59.3ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 9 books, 55.4ms
Speed: 1.2ms preprocess, 55.4ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 10 books, 55.7ms
Speed: 1.4ms preprocess, 55.7ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 9 books, 60.7ms
Speed: 2.3ms pre


0: 384x640 2 persons, 11 books, 43.0ms
Speed: 1.8ms preprocess, 43.0ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 suitcase, 3 books, 41.4ms
Speed: 1.1ms preprocess, 41.4ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 43.9ms
Speed: 1.2ms preprocess, 43.9ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 6 books, 39.8ms
Speed: 1.3ms preprocess, 39.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 4 books, 41.8ms
Speed: 1.4ms preprocess, 41.8ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 4 books, 43.3ms
Speed: 1.6ms preprocess, 43.3ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 2 books, 43.4ms
Speed: 1.2ms preprocess, 43.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 2 books, 60.6ms
Speed:


0: 384x640 1 person, 1 refrigerator, 10 books, 43.5ms
Speed: 1.2ms preprocess, 43.5ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 11 books, 48.3ms
Speed: 1.1ms preprocess, 48.3ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 11 books, 46.2ms
Speed: 1.3ms preprocess, 46.2ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 11 books, 43.4ms
Speed: 1.1ms preprocess, 43.4ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 10 books, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 refrigerator, 11 books, 42.4ms
Speed: 1.2ms preprocess, 42.4ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 12 books, 45.6ms
Speed: 1.3ms preprocess, 45.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 pe


0: 384x640 1 person, 12 books, 38.6ms
Speed: 1.2ms preprocess, 38.6ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 refrigerator, 12 books, 46.9ms
Speed: 1.4ms preprocess, 46.9ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 12 books, 44.4ms
Speed: 1.0ms preprocess, 44.4ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 12 books, 43.4ms
Speed: 1.3ms preprocess, 43.4ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 12 books, 41.2ms
Speed: 1.0ms preprocess, 41.2ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 12 books, 43.9ms
Speed: 1.3ms preprocess, 43.9ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 11 books, 45.1ms
Speed: 1.1ms preprocess, 45.1ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 11 books, 

0: 384x640 1 person, 5 books, 47.5ms
Speed: 1.6ms preprocess, 47.5ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 bottle, 5 books, 41.2ms
Speed: 1.0ms preprocess, 41.2ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 2 books, 47.4ms
Speed: 1.9ms preprocess, 47.4ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 book, 45.4ms
Speed: 1.2ms preprocess, 45.4ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 book, 51.9ms
Speed: 1.2ms preprocess, 51.9ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 book, 47.9ms
Speed: 1.2ms preprocess, 47.9ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 book, 42.6ms
Speed: 1.0ms preprocess, 42.6ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 book, 40.4ms
Speed: 1.1ms 


0: 384x640 1 person, 6 books, 50.0ms
Speed: 1.1ms preprocess, 50.0ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 6 books, 45.2ms
Speed: 1.1ms preprocess, 45.2ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 7 books, 52.6ms
Speed: 1.1ms preprocess, 52.6ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 7 books, 50.7ms
Speed: 1.1ms preprocess, 50.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 6 books, 49.0ms
Speed: 1.2ms preprocess, 49.0ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 6 books, 44.7ms
Speed: 1.1ms preprocess, 44.7ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 6 books, 44.6ms
Speed: 1.2ms preprocess, 44.6ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 6 books, 45.5ms
Speed: 1.5ms prep

In [1]:
import cv2
from ultralytics import YOLO
import mediapipe as mp
import time

# MediaPipe initialization: hand solution (up to two hands, low confidence thresholds)
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.3, min_tracking_confidence=0.3)
mp_draw = mp.solutions.drawing_utils

# Initialize the YOLO models
# NOTE(review): the paths below are absolute and machine-specific; adjust before running elsewhere.
yolo_dispenser_model_path = "/Users/yangzhelun/Desktop/國衛院專案/dispensor weight/best.pt"
model_people = YOLO("yolov8n.pt")  # official pretrained model, used for people
model_dispenser = YOLO(yolo_dispenser_model_path)

# Open the input video file
video_path = "/Users/yangzhelun/Downloads/IMG_7129.mov"
cap = cv2.VideoCapture(video_path)

# Video properties read from the capture
fps = int(cap.get(cv2.CAP_PROP_FPS))
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Initialize the VideoWriter with the same frame rate and size as the input
output_path = "/Users/yangzhelun/Downloads/processed_video.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

dispenser_roi = None  # (x1, y1, x2, y2) of the dispenser once detected
sanitized_count = 0
sanitized_ids = set()  # track_ids that have already sanitized
track_state = {}  # per-track_id state

# Detection parameters
INTERSECTION_THRESHOLD = 0.1  # overlap ratio above which a person counts as near the dispenser
DELAY_TIME = 3  # how long the on-screen message is shown (presumably seconds; compared against time.time() differences)

# 面积计算函数
def calculate_intersection_area(box1, box2):
    """Return the overlap area of two ``(x1, y1, x2, y2)`` boxes (0 if disjoint)."""
    left = max(box1[0], box2[0])
    right = min(box1[2], box2[2])
    top = max(box1[1], box2[1])
    bottom = min(box1[3], box2[3])

    # A negative extent on either axis means the boxes do not overlap there.
    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    return overlap_w * overlap_h

def calculate_area(box):
    """Return the area of a box given as (x1, y1, x2, y2).

    A box whose corners are inverted on either axis has area 0.
    """
    width = max(0, box[2] - box[0])
    height = max(0, box[3] - box[1])
    return width * height

# Class index of "person" in the stock YOLOv8 (COCO) weights. Tracking is
# restricted to this class so that other detected objects (books, chairs, ...)
# overlapping the dispenser are never counted as sanitization events.
PERSON_CLASS_ID = 0


def _hand_in_zone(hand_landmarks, zone, frame_w, frame_h):
    """Return True if any landmark of the hand lies inside ``zone``.

    ``zone`` is (x_min, y_min, x_max, y_max) in pixels; each landmark's x/y is
    scaled by the frame width/height before the comparison.
    """
    x_min, y_min, x_max, y_max = zone
    return any(
        x_min <= int(lm.x * frame_w) <= x_max and y_min <= int(lm.y * frame_h) <= y_max
        for lm in hand_landmarks.landmark
    )


# Main processing loop: locate the dispenser once, then track people, detect
# hands under the dispenser, count sanitization events, and write/show every
# annotated frame. The try/finally guarantees the capture, writer and windows
# are released even if a frame raises or the kernel is interrupted.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("处理完成")
            break

        h, w, _ = frame.shape
        # NOTE(review): wall-clock time is used for the message delay, so the
        # delay is measured in processing time rather than video time.
        current_time = time.time()

        # Keep trying to locate the dispenser until it has been found.
        if dispenser_roi is None:
            results_dispenser = model_dispenser(frame)
            for r in results_dispenser:
                for box in r.boxes:
                    label = r.names[int(box.cls)]
                    confidence = box.conf[0]
                    if label == 'dispenser' and confidence > 0.3:
                        x1, y1, x2, y2 = map(int, box.xyxy[0])
                        dispenser_roi = (x1, y1, x2, y2)
                        break
                if dispenser_roi is not None:
                    break
            if dispenser_roi is None:
                continue  # still not found: skip to the next frame

        # Per-frame values that do not depend on the individual person.
        dispenser_area = calculate_area(dispenser_roi)
        # The "dispensing zone" is the bottom 20% of the dispenser box.
        lower_edge_start = dispenser_roi[1] + int((dispenser_roi[3] - dispenser_roi[1]) * 0.8)
        dispensing_zone = (dispenser_roi[0], lower_edge_start, dispenser_roi[2], dispenser_roi[3])

        # Track people with YOLOv8 + BoT-SORT.
        results_people = model_people.track(
            frame, persist=True, tracker="botsort.yaml", classes=[PERSON_CLASS_ID]
        )

        # Take the RGB copy before any overlay is drawn so hand detection sees
        # the clean image; the detection itself runs lazily, at most once per
        # frame (None means "not evaluated yet").
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        hand_in_zone = None

        for r in results_people:
            # `boxes.id` is None when the tracker has no confirmed tracks.
            if r.boxes.id is None:
                continue

            for box, track_id_tensor in zip(r.boxes, r.boxes.id):
                track_id = int(track_id_tensor.item())  # make sure track_id is a plain int
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                person_box = (x1, y1, x2, y2)

                # Initialise the per-track state on first sight.
                if track_id not in track_state:
                    track_state[track_id] = {'last_detected': 0, 'show_text': False}
                    print(f"[LOG] Initialized Track ID: {track_id}")

                print(f"[LOG] Processing Track ID: {track_id}, Box: {person_box}")

                # Overlap with the dispenser, relative to the smaller box.
                intersection_area = calculate_intersection_area(person_box, dispenser_roi)
                smaller_area = min(calculate_area(person_box), dispenser_area)
                # A degenerate (zero-area) box cannot overlap anything.
                overlap_ratio = intersection_area / smaller_area if smaller_area > 0 else 0.0

                print(f"[LOG] Overlap Ratio for Track ID {track_id}: {overlap_ratio}")

                if overlap_ratio > INTERSECTION_THRESHOLD:
                    print(f"[LOG] Track ID {track_id} is near the dispenser.")
                    # Only re-check once the previous detection's delay has elapsed.
                    if (current_time - track_state[track_id]['last_detected']) > DELAY_TIME:
                        if hand_in_zone is None:
                            hand_in_zone = False
                            hand_results = hands.process(rgb_frame)
                            for hand_landmarks in hand_results.multi_hand_landmarks or []:
                                mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                                if _hand_in_zone(hand_landmarks, dispensing_zone, w, h):
                                    hand_in_zone = True

                        if hand_in_zone:
                            # Count each track_id at most once.
                            if track_id not in sanitized_ids:
                                sanitized_ids.add(track_id)
                                sanitized_count += 1
                                print(f"[LOG] Sanitization Detected for Track ID: {track_id}, Total Count: {sanitized_count}")
                            # Restart the message timer for this track.
                            track_state[track_id] = {
                                'last_detected': current_time,
                                'show_text': True
                            }

                # Success message, shown for DELAY_TIME seconds after a detection.
                if track_state[track_id]['show_text'] and current_time - track_state[track_id]['last_detected'] <= DELAY_TIME:
                    cv2.putText(frame, "Sanitization Success!!!", (w // 4, h // 2),
                                cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 255), 5, cv2.LINE_AA)
                elif current_time - track_state[track_id]['last_detected'] > DELAY_TIME:
                    track_state[track_id]['show_text'] = False

                # Draw the tracking box and its ID.
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, f"ID: {track_id}", (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        # Always overlay the running count on the frame.
        cv2.putText(frame, f"Sanitized Count: {sanitized_count}", (10, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 0), 5)

        # Write the annotated frame and show it.
        out.write(frame)
        cv2.imshow("YOLOv8 + BoT-SORT + Mediapipe", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    out.release()
    cv2.destroyAllWindows()

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
100%|███████████████████████████████████████| 6.25M/6.25M [00:49<00:00, 133kB/s]


FileNotFoundError: [Errno 2] No such file or directory: '/Users/yangzhelun/Desktop/國衛院專案/dispensor weight/best.pt'

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
2024-11-20 14:23:17.817 Python[45277:3559692] _TIPropertyValueIsValid called with 4 on nil context!
2024-11-20 14:23:17.817 Python[45277:3559692] imkxpc_getApplicationProperty:reply: called with incorrect property value 4, bailing.
2024-11-20 14:23:17.817 Python[45277:3559692] Text input context does not respond to _valueForTIProperty:
2024-11-20 14:23:17.818 Python[45277:3559692] _TIPropertyValueIsValid called with 4 on nil context!
2024-11-20 14:23:17.818 Python[45277:3559692] imkxpc_getApplicationProperty:reply: called with incorrect property value 4, bailing.
2024-11-20 14:23:17.818 Python[45277:3559692] Text input context does not respond to _valueForTIProperty:
2024-11-20 14:23:17.829 Python[45277:3559692] _TIPropertyValueIsValid called with 4 on nil context!
2024-11-20 14:23:17.829 Python[45277:3559692] imkxpc_getApplicationProperty:reply: called with incorrect property value 4, bailing.
2024-11-20 14:23:17.829 Python[45277