# 检测潜水运动员位置 

通过仿射变换得出运动员真实位移速度

In [1]:
from ultralytics import YOLO

# Load the pretrained model from the "yolo11n.pt" weights file.
model = YOLO("yolo11n.pt")

In [None]:
from ultralytics import YOLO

# Load the pretrained model that training starts from.
model = YOLO("yolov8n.pt")

# Train on the dataset described by new_dataset/data.yaml, using a reduced
# number of data-loading workers.
model.train(
    data="new_dataset/data.yaml",
    epochs=60,
    imgsz=416,
    batch=8,
    device=0,
    workers=2,  # fewer worker threads
    name="swimmer_train",
    project="runs/train",
    exist_ok=True,
)

### 图片预测

In [None]:
from ultralytics import YOLO
import cv2

# Load the weights produced by the training run.
model = YOLO("runs/train/swimmer_train/weights/best.pt")

# Run inference on every image in the test-set directory.
# Ultralytics builds the output folder from `project`/`name`; a `save_dir`
# keyword is dropped from the prediction overrides, so it cannot be used to
# redirect the saved images. `exist_ok=True` keeps writing into
# new_dataset/predict instead of creating predict2, predict3, ...
results = model.predict(
    source="new_dataset/test/images",  # test-set directory
    device=0,
    iou=0.5,
    conf=0.25,
    save=True,
    project="new_dataset",
    name="predict",
    exist_ok=True,
)

# Show every annotated result in turn; each window waits for a key press.
try:
    for result in results:
        img = result.plot()  # image with the bounding boxes drawn on it
        cv2.imshow("Result", img)
        cv2.waitKey(0)
finally:
    # Close the preview window even if the loop is interrupted.
    cv2.destroyAllWindows()

### 视频预测

In [3]:
from ultralytics import YOLO

# Load the weights produced by the training run.
model = YOLO("runs/train/swimmer_train/weights/best.pt")

# Path of the video file to analyse.
video_path = "free_dive_video/007_clipped.mp4"

# Run prediction on the video.
# Ultralytics builds the output folder from `project`/`name`; a `save_dir`
# keyword is dropped from the prediction overrides, so the destination is
# spelled out with `project`/`name` and pinned with `exist_ok=True`.
# NOTE: without `stream=True` the per-frame results accumulate in RAM (see the
# warning Ultralytics prints for this call); for long videos, pass
# `stream=True` and iterate over the returned generator instead.
results = model.predict(
    source=video_path,
    device=0,  # run on the GPU
    conf=0.25,  # confidence threshold
    iou=0.5,  # IoU threshold, reduces overlapping boxes
    save=True,  # save the annotated video
    project="runs/detect",
    name="predict",  # output goes to runs/detect/predict
    exist_ok=True,
)

# # 可选：实时显示视频结果
# cap = cv2.VideoCapture(video_path)
# while cap.isOpened():
#     ret, frame = cap.read()
#     if not ret:
#         break
#     # 预测单帧
#     result = model(frame, device=0, conf=0.25, iou=0.5)
#     annotated_frame = result[0].plot()  # 绘制边界框
#     cv2.imshow("YOLOv8 Prediction", annotated_frame)
#     if cv2.waitKey(1) & 0xFF == ord("q"):
#         break
# cap.release()
# cv2.destroyAllWindows()


inference results will accumulate in RAM unless `stream=True` is passed, causing potential out-of-memory
errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

video 1/1 (frame 1/3660) e:\\\SMT\free_swim\free_dive_video\007_clipped.mp4: 416x416 (no detections), 5.2ms
video 1/1 (frame 2/3660) e:\\\SMT\free_swim\free_dive_video\007_clipped.mp4: 416x416 (no detections), 4.9ms
video 1/1 (frame 3/3660) e:\\\SMT\free_swim\free_dive_video\007_clipped.mp4: 416x416 (no detections), 6.5ms
video 1/1 (frame 4/3660) e:\\\SMT\free_swim\free_dive_video\007_clipped.mp4: 416x416 (no detections), 6.0ms
video 1/1 (frame 5/3660) e:\\\SMT\fr

### pt 转 onnx

In [2]:
from ultralytics import YOLO

# Load the trained model checkpoint.
model = YOLO("epoch_200.pt")


# Export the model to ONNX format
model.export(format="onnx")  # creates 'epoch_200.onnx'

Ultralytics 8.3.169  Python-3.10.11 torch-2.0.0+cu118 CPU (12th Gen Intel Core(TM) i7-12700H)
YOLO11n summary (fused): 100 layers, 2,582,542 parameters, 0 gradients, 6.3 GFLOPs

[34m[1mPyTorch:[0m starting from 'epoch_200.pt' with input shape (1, 3, 640, 640) BCHW and output shape(s) (1, 6, 8400) (5.2 MB)

[34m[1mONNX:[0m starting export with onnx 1.17.0 opset 17...
verbose: False, log level: Level.ERROR

[34m[1mONNX:[0m slimming with onnxslim 0.1.61...
[34m[1mONNX:[0m export success  3.2s, saved as 'epoch_200.onnx' (10.0 MB)

Export complete (3.6s)
Results saved to [1mE:\\\SMT\free_swim[0m
Predict:         yolo predict task=detect model=epoch_200.onnx imgsz=640  
Validate:        yolo val task=detect model=epoch_200.onnx imgsz=640 data=other_dataset/data.yaml  
Visualize:       https://netron.app


'epoch_200.onnx'

In [4]:
import onnxruntime
import numpy as np
import cv2
import os
from pathlib import Path


def preprocess_image(image_path, input_size=(416, 416)):
    """Load an image from disk and turn it into a model-ready input array.

    The image is read with OpenCV, converted from BGR to RGB, letterboxed to
    ``input_size``, scaled by 1/255 as float32 and rearranged from HWC into a
    single-item batch laid out as (1, C, H, W).

    Args:
        image_path: Path of the image file to read.
        input_size: Target shape forwarded to ``letterbox`` as ``new_shape``.

    Returns:
        A tuple ``(array, ratio, (dw, dh))`` where ``ratio`` and ``(dw, dh)``
        are the scale and padding values reported by ``letterbox``.

    Raises:
        ValueError: If OpenCV cannot read ``image_path``.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise ValueError(f"无法读取图像: {image_path}")
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    # Resize while keeping the aspect ratio, padding up to the target shape.
    boxed, ratio, (dw, dh) = letterbox(rgb, new_shape=input_size)
    print(f"预处理后图像尺寸: {boxed.shape}")  # debug: shape after letterboxing

    # Normalise, then move channels first and prepend the batch axis.
    scaled = boxed.astype(np.float32) / 255.0
    batch = scaled.transpose(2, 0, 1)[np.newaxis, ...]
    print(f"模型输入尺寸: {batch.shape}")  # debug: final input shape

    return batch, ratio, (dw, dh)


def letterbox(
    img,
    new_shape=(416, 416),
    color=(114, 114, 114),
    auto=False,
    scaleFill=False,
    scaleup=True,
):
    """Resize an image with its aspect ratio preserved and pad it to ``new_shape``.

    Args:
        img: Image array whose first two axes are (height, width).
        new_shape: Target (height, width); a single int means a square target.
        color: Border colour used for the padding.
        auto: If True, the padding is reduced modulo 32 before it is applied.
            NOTE: the final size check still forces the output to
            ``new_shape``, so in this mode the returned padding describes the
            image before that last resize.
        scaleFill: Accepted for signature compatibility; not used.
        scaleup: If False, the image is only ever shrunk, never enlarged.

    Returns:
        A tuple ``(img, (r, r), (dw, dh))``: the padded image, the scale
        factor applied to both axes, and the (unrounded) padding added on
        each side along the width and the height.
    """
    height, width = img.shape[:2]  # current shape is (height, width)
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    # Normalise to a tuple so the comparison with img.shape[:2] further down
    # also works when the caller passes a list.
    new_shape = (int(new_shape[0]), int(new_shape[1]))
    target_h, target_w = new_shape

    # Scale factor that fits the image inside the target on both axes.
    r = min(target_h / height, target_w / width)
    if not scaleup:  # never enlarge
        r = min(r, 1.0)

    # Size after scaling, in OpenCV's (width, height) order.
    new_unpad = int(round(width * r)), int(round(height * r))
    dw, dh = target_w - new_unpad[0], target_h - new_unpad[1]  # total padding

    if auto:  # minimum rectangle: keep only the padding modulo 32
        dw, dh = np.mod(dw, 32), np.mod(dh, 32)

    # Split the padding between the two sides.
    dw /= 2
    dh /= 2

    if (width, height) != new_unpad:
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

    # The -0.1 / +0.1 offsets send an odd padding pixel to the bottom/right.
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    img = cv2.copyMakeBorder(
        img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color
    )

    # Guarantee the promised output size. cv2.resize takes dsize as
    # (width, height) while new_shape is (height, width), so swap the order.
    if img.shape[:2] != new_shape:
        img = cv2.resize(img, (target_w, target_h), interpolation=cv2.INTER_LINEAR)

    return img, (r, r), (dw, dh)


def non_max_suppression(
    prediction, conf_thres=0.01, iou_thres=0.45, max_det=300, single_class=True
):
    """Filter raw predictions by confidence and apply NMS per image.

    Uses ``cv2.dnn.NMSBoxes`` for the suppression step.

    Args:
        prediction: Array indexed as [batch, num_boxes, channels] whose first
            five channels are (x_center, y_center, w, h, conf). When
            ``single_class`` is False, channels from index 5 onwards are read
            as per-class scores.
        conf_thres: Boxes with ``conf <= conf_thres`` are discarded.
        iou_thres: IoU threshold passed to the NMS step.
        max_det: Maximum number of boxes kept per image.
        single_class: If True every detection gets class id 0.

    Returns:
        A list with one entry per image: ``None`` when no box survives,
        otherwise an array with rows ``[x1, y1, x2, y2, conf, cls]``.
    """
    max_wh = 4096  # upper limit on box width/height

    output = [None] * prediction.shape[0]
    for xi, x in enumerate(prediction):  # one image at a time
        print(f"处理批次 {xi}, 初始框数量: {x.shape[0]}")

        # Drop low-confidence boxes.
        scores = x[:, 4]
        confident = scores > conf_thres
        x = x[confident]
        scores = scores[confident]
        if not x.shape[0]:
            print(f"警告: 图像 {xi} 无检测框（置信度 > {conf_thres}）")
            continue

        print(f"置信度过滤后框数量: {x.shape[0]}")
        print(f"前5个框置信度: {scores[:5]}")  # debug: confidences

        boxes = x[:, :4]  # x_center, y_center, w, h

        if single_class:
            classes = np.zeros(x.shape[0], dtype=np.int32)  # class id is always 0
        else:
            if x.shape[1] <= 5:
                print(f"错误: 模型输出缺少类别信息，形状为 {x.shape}")
                continue
            classes = np.argmax(x[:, 5:], axis=1)

        # Centre format -> corner format (x1, y1, x2, y2).
        half_w = boxes[:, 2] / 2
        half_h = boxes[:, 3] / 2
        boxes_xyxy = np.zeros_like(boxes)
        boxes_xyxy[:, 0] = boxes[:, 0] - half_w
        boxes_xyxy[:, 1] = boxes[:, 1] - half_h
        boxes_xyxy[:, 2] = boxes[:, 0] + half_w
        boxes_xyxy[:, 3] = boxes[:, 1] + half_h

        # Drop degenerate or implausibly large boxes.
        widths = boxes_xyxy[:, 2] - boxes_xyxy[:, 0]
        heights = boxes_xyxy[:, 3] - boxes_xyxy[:, 1]
        valid = (
            (boxes_xyxy[:, 2] > boxes_xyxy[:, 0])
            & (boxes_xyxy[:, 3] > boxes_xyxy[:, 1])
            & (widths < max_wh)
            & (heights < max_wh)
        )
        boxes_xyxy = boxes_xyxy[valid]
        scores = scores[valid]
        classes = classes[valid]

        if not boxes_xyxy.shape[0]:
            print(f"警告: 图像 {xi} 无有效框")
            continue

        print(f"有效框过滤后数量: {boxes_xyxy.shape[0]}")
        print(f"前5个框坐标: {boxes_xyxy[:5]}")  # debug: box coordinates

        # cv2.dnn.NMSBoxes expects every box as (x, y, width, height) with
        # (x, y) the top-left corner. Passing corner-format boxes would make
        # it compute IoU on the wrong rectangles, so convert first.
        boxes_xywh = np.column_stack(
            (
                boxes_xyxy[:, 0],
                boxes_xyxy[:, 1],
                boxes_xyxy[:, 2] - boxes_xyxy[:, 0],
                boxes_xyxy[:, 3] - boxes_xyxy[:, 1],
            )
        )
        boxes_list = boxes_xywh.astype(np.float32).tolist()
        scores_list = scores.tolist()
        indices = cv2.dnn.NMSBoxes(boxes_list, scores_list, conf_thres, iou_thres)

        # Depending on the OpenCV version the result is an empty tuple, a
        # flat array or an (N, 1) array; normalise it to a flat index array.
        indices = np.asarray(indices, dtype=np.int64).reshape(-1)
        if indices.size == 0:
            print(f"警告: 图像 {xi} NMS 后无框")
            continue

        # Cap the number of detections.
        indices = indices[:max_det]

        print(f"NMS 后框数量: {len(indices)}")

        # Output rows: [x1, y1, x2, y2, conf, cls]
        det = np.concatenate(
            [boxes_xyxy[indices], scores[indices][:, None], classes[indices][:, None]],
            axis=1,
        )
        output[xi] = det

    return output


def postprocess(outputs, conf_thres=0.01, iou_thres=0.45, is_yolov8=True):
    """Convert the raw model output into per-image detections via NMS.

    Args:
        outputs: Raw 3-D model output array.
        conf_thres: Confidence threshold forwarded to ``non_max_suppression``.
        iou_thres: IoU threshold forwarded to ``non_max_suppression``.
        is_yolov8: If True, the last two axes are swapped first so the array
            is laid out as [batch, num_boxes, channels].

    Returns:
        The list returned by ``non_max_suppression`` (always called with
        ``single_class=True``).
    """
    print(f"原始输出形状: {outputs.shape}")
    boxes_last = np.transpose(outputs, (0, 2, 1)) if is_yolov8 else outputs
    print(f"后处理后形状: {boxes_last.shape}")
    return non_max_suppression(
        boxes_last,
        conf_thres=conf_thres,
        iou_thres=iou_thres,
        single_class=True,
    )


def draw_boxes(image, detections, class_names, ratio, dwdh):
    """Draw detection boxes and labels on a copy of ``image``.

    Args:
        image: Image array to annotate; it is copied, not modified.
        detections: Iterable of per-image detection arrays with rows
            ``[x1, y1, x2, y2, conf, cls]``; ``None`` or empty entries are
            skipped.
        class_names: Sequence indexed by the integer class id.
        ratio: Pair of scale factors; box x/y values are divided by
            ``ratio[0]`` / ``ratio[1]``.
        dwdh: Pair of paddings; ``dwdh[0]`` / ``dwdh[1]`` are subtracted from
            the box x/y values before scaling.

    Returns:
        The annotated copy of ``image``.
    """
    canvas = image.copy()
    green = (0, 255, 0)

    for det in detections:
        if det is None or len(det) == 0:
            print("无检测框可绘制")
            continue

        print(f"绘制框数量: {len(det)}")
        for x1, y1, x2, y2, conf, cls in det:
            # Undo the padding and scaling to get back to the coordinates of
            # the image being drawn on.
            left = int((x1 - dwdh[0]) / ratio[0])
            top = int((y1 - dwdh[1]) / ratio[1])
            right = int((x2 - dwdh[0]) / ratio[0])
            bottom = int((y2 - dwdh[1]) / ratio[1])

            cv2.rectangle(canvas, (left, top), (right, bottom), green, 2)
            caption = f"{class_names[int(cls)]}: {conf:.2f}"
            cv2.putText(
                canvas,
                caption,
                (left, top - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                green,
                2,
            )

    return canvas


def main():
    """Run the ONNX detector over the test images and save annotated copies.

    Loads the ONNX model, prints its input description, then for every
    ``*.jpg`` / ``*.png`` file in the test directory: preprocesses the image,
    runs inference, applies post-processing (NMS), draws the detections on
    the original image and writes it to the output directory as
    ``result_<original file name>``.
    """
    # Configuration
    onnx_model_path = "runs/train/swimmer_train/weights/best.onnx"  # ONNX model path
    test_dir = "new_dataset/test/images"  # test-set directory
    input_size = (416, 416)  # model input size
    class_names = ["swimmer"]  # class names
    conf_thres = 0.01  # confidence threshold (kept low to retain more boxes)
    iou_thres = 0.45  # NMS IoU threshold
    output_dir = "output"  # directory for the annotated results
    is_yolov8 = True  # model output uses the YOLOv8 layout

    os.makedirs(output_dir, exist_ok=True)

    # Load the ONNX model, preferring CUDA and falling back to the CPU.
    session = onnxruntime.InferenceSession(
        onnx_model_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
    )

    # Print the model's input description.
    print("模型输入信息:")
    model_inputs = session.get_inputs()
    for model_input in model_inputs:  # avoid shadowing the builtin `input`
        print(
            f"Input name: {model_input.name}, Shape: {model_input.shape}, "
            f"Type: {model_input.type}"
        )
    # The input name does not change between images, so look it up once.
    input_name = model_inputs[0].name

    # Collect the test images.
    test_root = Path(test_dir)
    image_paths = list(test_root.glob("*.jpg")) + list(test_root.glob("*.png"))

    for image_path in image_paths:
        print(f"处理图像: {image_path}")
        # Preprocess (raises ValueError if the file cannot be read).
        img_input, ratio, dwdh = preprocess_image(str(image_path), input_size)

        # Inference
        outputs = session.run(None, {input_name: img_input})[0]
        print(f"模型输出形状: {outputs.shape}")

        # Post-processing
        detections = postprocess(outputs, conf_thres, iou_thres, is_yolov8=is_yolov8)

        # Re-read the original image to draw on. It stays in OpenCV's native
        # BGR order: the boxes are drawn in pure green, which is the same
        # tuple in RGB and BGR, so the former BGR->RGB->BGR round trip only
        # cost time without changing the saved file.
        img = cv2.imread(str(image_path))
        img_result = draw_boxes(img, detections, class_names, ratio, dwdh)

        # Save the annotated image.
        output_path = os.path.join(output_dir, f"result_{image_path.name}")
        cv2.imwrite(output_path, img_result)
        print(f"结果已保存至: {output_path}")


if __name__ == "__main__":
    main()

模型输入信息:
Input name: images, Shape: [1, 3, 416, 416], Type: tensor(float)
处理图像: new_dataset\test\images\004_jpg.rf.9e4f53342583d3fe266f01ac72d252c2.jpg
预处理后图像尺寸: (416, 416, 3)
模型输入尺寸: (1, 3, 416, 416)
模型输出形状: (1, 5, 3549)
原始输出形状: (1, 5, 3549)
后处理后形状: (1, 3549, 5)
处理批次 0, 初始框数量: 3549
置信度过滤后框数量: 88
前5个框置信度: [0.02231184 0.09183019 0.17360815 0.06721133 0.01284987]
有效框过滤后数量: 88
前5个框坐标: [[313.7717  154.03264 349.4476  159.68196]
 [325.55164 154.3086  349.2279  159.78552]
 [328.14764 154.2583  349.388   159.78693]
 [327.48035 154.31747 349.63702 159.53506]
 [356.2622  187.35866 391.39795 198.04848]]
NMS 后框数量: 9
绘制框数量: 9
结果已保存至: output\result_004_jpg.rf.9e4f53342583d3fe266f01ac72d252c2.jpg
处理图像: new_dataset\test\images\016_jpg.rf.d766dd4b0bc8dc293f50903198a61155.jpg
预处理后图像尺寸: (416, 416, 3)
模型输入尺寸: (1, 3, 416, 416)
模型输出形状: (1, 5, 3549)
原始输出形状: (1, 5, 3549)
后处理后形状: (1, 3549, 5)
处理批次 0, 初始框数量: 3549
置信度过滤后框数量: 96
前5个框置信度: [0.01186833 0.09125158 0.1599336  0.16424131 0.02233437]
有效框过滤后数量: 96
前5个框坐标