In [None]:
import json
from copy import deepcopy
from pathlib import Path

# ===== User configuration =====
# Directory that contains every exported dataset folder.
DATASETS_DIR = Path(
    "/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/03_16mp_fiftyone_dataset"
)

# Dataset folders known to live under DATASETS_DIR (kept for quick switching).
DATASET_NAMES = (
    "ms1_0605-0621_40_ok",
    "ms1_0710-0726_36_ok",
    "ms1_0726-0809_11_ok",
    "ms1_0809-0823_34_ok",
    "ms2_0726-0809_13_ok",
    "ms2_0809-0823_10_ok",
    "sw1_0605-0613_07_ok",
)

# The one dataset processed by this run. Previously ROOT was assigned once per
# dataset and only the last assignment took effect; selecting it explicitly
# makes the active dataset obvious. Change this value to switch datasets.
ACTIVE_DATASET = "sw1_0605-0613_07_ok"

ROOT = DATASETS_DIR / ACTIVE_DATASET  # dataset root directory (change to yours)

INPUT_JSON = ROOT / "labels.json"  # original COCO annotations
OUTPUT_JSON = ROOT / "org_label_no_overlap.json"  # de-duplicated COCO annotations
IOU_THRESH = 0.5  # overlap threshold; an IoU above this marks a duplicate box


def bbox_iou(b1, b2):
    """
    Return the intersection-over-union of two COCO bounding boxes.

    Each box is a 4-item sequence ``[x, y, w, h]`` (top-left corner plus
    width and height). The result is 0.0 when the union area is not
    positive (e.g. both boxes are degenerate).
    """
    left_a, top_a, width_a, height_a = b1
    left_b, top_b, width_b, height_b = b2

    # Extent of the overlap rectangle along each axis, clamped at zero so
    # that disjoint boxes contribute no intersection.
    overlap_w = max(
        0.0,
        min(left_a + width_a, left_b + width_b) - max(left_a, left_b),
    )
    overlap_h = max(
        0.0,
        min(top_a + height_a, top_b + height_b) - max(top_a, top_b),
    )
    overlap = overlap_w * overlap_h

    union = width_a * height_a + width_b * height_b - overlap

    return 0.0 if union <= 0 else overlap / union


def nms_simple(anns, iou_thresh):
    """
    De-duplicate a group of annotations with a simple greedy NMS.

    anns: list of annotation dicts, each with a COCO ``"bbox"`` entry
          (intended to share one image_id and one category_id).
    iou_thresh: an annotation is dropped when its IoU with any already
                kept annotation is strictly greater than this value.

    Strategy: visit boxes from largest to smallest area and keep a box
    only if it does not overlap a previously kept box beyond the threshold.
    Returns the kept annotation dicts (the same objects, not copies).
    """
    def _box_area(item):
        _, _, width, height = item["bbox"]
        return width * height

    # Largest boxes first, so the bigger box wins among duplicates.
    by_area_desc = sorted(anns, key=_box_area, reverse=True)

    survivors = []
    for candidate in by_area_desc:
        is_duplicate = any(
            bbox_iou(candidate["bbox"], chosen["bbox"]) > iou_thresh
            for chosen in survivors
        )
        if not is_duplicate:
            survivors.append(candidate)

    return survivors


def main():
    """
    Remove overlapping duplicate boxes from a COCO annotation file.

    Reads INPUT_JSON, groups its annotations by (image_id, category_id),
    runs nms_simple with IOU_THRESH on every group, and writes a COCO
    dict with the surviving annotations to OUTPUT_JSON. All other
    top-level keys of the input are written back unchanged. A summary
    with the annotation counts before and after is printed.

    Raises KeyError if an annotation lacks "image_id" or "category_id".
    """
    input_path = Path(INPUT_JSON)
    output_path = Path(OUTPUT_JSON)

    with input_path.open("r", encoding="utf-8") as f:
        coco = json.load(f)

    annotations = coco.get("annotations", [])

    # Bucket annotations as (image_id, category_id) -> [anns...], so that
    # only boxes of the same class on the same image can suppress each other.
    groups = {}
    for ann in annotations:
        key = (ann["image_id"], ann["category_id"])
        groups.setdefault(key, []).append(ann)

    # De-duplicate every bucket independently.
    new_annotations = []
    for anns in groups.values():
        new_annotations.extend(nms_simple(anns, IOU_THRESH))

    # Build the output COCO dict. A shallow copy is enough: only the
    # "annotations" entry is replaced and nothing else is mutated, so there
    # is no need to deep-copy the (potentially large) original annotations.
    new_coco = dict(coco)
    new_coco["annotations"] = new_annotations

    with output_path.open("w", encoding="utf-8") as f:
        json.dump(new_coco, f, indent=2, ensure_ascii=False)

    print(f"Done. 原始标注数: {len(annotations)}, 去重后: {len(new_annotations)}")
    print(f"保存到: {output_path}")


# Run the de-duplication only when this file is executed as a script.
if __name__ == "__main__":
    main()


Done. 原始标注数: 11, 去重后: 6
保存到: /home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/03_16mp_fiftyone_dataset/sw1_0605-0613_07_ok/org_label_no_overlap.json
