In [15]:
import json
from pathlib import Path
import cv2
from tqdm import tqdm


def crop_bbox_to_tile(bbox, tile_x, tile_y, tile_size):
    x, y, w, h = bbox
    x2, y2 = x + w, y + h

    tx1, ty1 = tile_x, tile_y
    tx2, ty2 = tile_x + tile_size, ty1 + tile_size

    ix1 = max(x, tx1)
    iy1 = max(y, ty1)
    ix2 = min(x2, tx2)
    iy2 = min(y2, ty2)

    inter_w = max(0.0, ix2 - ix1)
    inter_h = max(0.0, iy2 - iy1)

    if inter_w <= 0 or inter_h <= 0:
        return None, 0.0

    inter_area = inter_w * inter_h
    orig_area = max(w * h, 1e-6)
    keep_ratio = inter_area / orig_area

    return [ix1 - tx1, iy1 - ty1, inter_w, inter_h], keep_ratio


def make_positions(length, crop_size, stride):
    if length <= crop_size:
        return [0]

    pos_list, pos = [], 0
    while pos + crop_size < length:
        pos_list.append(pos)
        pos += stride

    last = length - crop_size
    if not pos_list or pos_list[-1] != last:
        pos_list.append(last)

    return pos_list


def export_null_tiles_from_coco(
    data_root: Path,
    img_dir: Path,
    coco_json: Path,
    out_img_dir: Path,
    out_label_dir: Path,
    crop_size: int = 640,
    overlap_ratio: float = 0.2,
    keep_ratio: float = 0.9,
):
    """
    从 COCO 标注中导出「完全没有有效标注覆盖」的 null tiles。
    每个 tile 对应一个空的 YOLO txt 标签文件。

    返回：
        kept_null_tiles (int)
    """

    stride = int(crop_size * (1 - overlap_ratio))

    out_img_dir.mkdir(parents=True, exist_ok=True)
    out_label_dir.mkdir(parents=True, exist_ok=True)

    with coco_json.open("r", encoding="utf-8") as f:
        coco = json.load(f)

    images = coco["images"]
    annotations = coco["annotations"]

    # image_id -> annotations
    anns_by_img = {}
    for ann in annotations:
        anns_by_img.setdefault(ann["image_id"], []).append(ann)

    kept_null_tiles = 0

    for img_info in tqdm(images, desc="Exporting null tiles"):
        img_id = img_info["id"]
        file_name = img_info["file_name"]
        img_path = img_dir / file_name

        img = cv2.imread(str(img_path))
        if img is None:
            print(f"[WARN] Cannot read image: {img_path}")
            continue

        h, w = img.shape[:2]
        xs = make_positions(w, crop_size, stride)
        ys = make_positions(h, crop_size, stride)

        orig_anns = anns_by_img.get(img_id, [])

        for ty in ys:
            for tx in xs:
                tile = img[ty:ty + crop_size, tx:tx + crop_size]

                # 判断该 tile 是否有「有效标注」
                has_label = False
                for ann in orig_anns:
                    _, ratio = crop_bbox_to_tile(
                        ann["bbox"], tx, ty, crop_size
                    )
                    if ratio >= keep_ratio:
                        has_label = True
                        break

                # 只保留 null tiles
                if has_label:
                    continue

                new_file = f"{Path(file_name).stem}_{tx}_{ty}{Path(file_name).suffix}"
                out_img_path = out_img_dir / new_file
                cv2.imwrite(str(out_img_path), tile)

                out_label_path = out_label_dir / f"{Path(new_file).stem}.txt"
                out_label_path.write_text("")

                kept_null_tiles += 1

    return kept_null_tiles


In [6]:
from os import path


root = Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone")
path_list = [path for path in root.iterdir() if path.is_dir()]
path_list

[PosixPath('/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms2_0726-0809_13_ok'),
 PosixPath('/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/sw1_0605-0613_07_ok'),
 PosixPath('/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms1_0809-0823_34_ok'),
 PosixPath('/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms1_0710-0726_36_ok'),
 PosixPath('/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms1_0605-0621_40_ok'),
 PosixPath('/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms2_0809-0823_10_ok'),
 PosixPath('/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms1_0726

In [7]:
if __name__ == "__main__":
    # DATA_ROOT = Path(
    #     "/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/00_test/00_try/del"
    # )

    for DATA_ROOT in path_list:
        kept = export_null_tiles_from_coco(
            data_root=DATA_ROOT,
            img_dir=DATA_ROOT / "data",
            coco_json=DATA_ROOT / "labels.json",
            out_img_dir=DATA_ROOT / "null_images_640_ov20",
            out_label_dir=DATA_ROOT / "null_labels_640_ov20",
            crop_size=640,
            overlap_ratio=0.2,
            keep_ratio=0.9,
        )

        print("\n==== 完成：只导出 null tiles ====")
        print(f"null tiles 数量: {kept}")


Exporting null tiles: 100%|██████████| 279/279 [00:33<00:00,  8.35it/s]



==== 完成：只导出 null tiles ====
null tiles 数量: 16887


Exporting null tiles: 100%|██████████| 6/6 [00:00<00:00,  9.56it/s]



==== 完成：只导出 null tiles ====
null tiles 数量: 367


Exporting null tiles: 100%|██████████| 323/323 [00:41<00:00,  7.75it/s]



==== 完成：只导出 null tiles ====
null tiles 数量: 19590


Exporting null tiles: 100%|██████████| 471/471 [01:13<00:00,  6.41it/s]



==== 完成：只导出 null tiles ====
null tiles 数量: 28036


Exporting null tiles: 100%|██████████| 243/243 [00:28<00:00,  8.51it/s]



==== 完成：只导出 null tiles ====
null tiles 数量: 13711


Exporting null tiles: 100%|██████████| 578/578 [01:04<00:00,  8.95it/s]



==== 完成：只导出 null tiles ====
null tiles 数量: 35182


Exporting null tiles: 100%|██████████| 464/464 [00:51<00:00,  8.99it/s]


==== 完成：只导出 null tiles ====
null tiles 数量: 27649





### 抽取数据

In [28]:
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Tuple
import random
import shutil


@dataclass(frozen=True)
class YoloPair:
    img: Path
    lbl: Optional[Path]  # null 样本可能没有 label；但你现在有 null_labels_640_ov20，所以一般不为 None


def _list_images(images_dir: Path) -> List[Path]:
    exts = {".jpg", ".jpeg", ".png", ".bmp", ".webp", ".tif", ".tiff"}
    files = [p for p in images_dir.iterdir() if p.is_file() and p.suffix.lower() in exts]
    files.sort()
    return files


def _match_label_for_image(img_path: Path, labels_dir: Path) -> Optional[Path]:
    # YOLO labels 通常是同名 .txt
    lbl = labels_dir / f"{img_path.stem}.txt"
    return lbl if lbl.exists() else None


def sample_yolo_pairs_random(
    *,
    src_images_dir: Path,
    src_labels_dir: Path,
    out_root_dir: Path,
    ratio: Optional[float] = None,
    k: Optional[int] = None,
    seed: int = 42,
    strict_pair: bool = True,
    strict_count: bool = True,
    copy_labels: bool = True,
) -> int:
    """
    Randomly sample YOLO (image, label) pairs from src_images_dir/src_labels_dir into out_root_dir.

    Choose exactly ONE of:
      - ratio: float in (0,1], samples floor(n * ratio)
      - k: int >= 0, samples exactly k (or min(k, n) if strict_count=False)

    strict_pair:
      - True  -> any sampled image without a label (txt) raises error
      - False -> allow missing labels (lbl=None), still copies images; labels copy skipped

    strict_count:
      - True  -> if k > n_total, raise error
      - False -> if k > n_total, use k = n_total

    Returns:
      - number of samples actually kept (int)
    """
    # --- validate mode ---
    if (ratio is None) == (k is None):
        raise ValueError("Specify exactly one of ratio=... or k=... (not both, not neither).")

    if ratio is not None:
        if not (0 < ratio <= 1):
            raise ValueError(f"ratio must be in (0,1], got {ratio}")

    if k is not None:
        if k < 0:
            raise ValueError(f"k must be >= 0, got {k}")

    src_images_dir = Path(src_images_dir)
    src_labels_dir = Path(src_labels_dir)
    out_root_dir = Path(out_root_dir)

    if not src_images_dir.exists():
        raise FileNotFoundError(f"src_images_dir not found: {src_images_dir}")
    if not src_labels_dir.exists():
        raise FileNotFoundError(f"src_labels_dir not found: {src_labels_dir}")

    all_imgs = _list_images(src_images_dir)
    n_total = len(all_imgs)
    if n_total == 0:
        raise ValueError(f"No images found in {src_images_dir}")

    # --- decide k_final ---
    if ratio is not None:
        k_final = int(n_total * ratio)
        # 防止 ratio 很小导致 0
        k_final = max(1, k_final) if n_total > 0 and ratio > 0 else 0
    else:
        k_final = k

    if k_final is None:
        raise RuntimeError("Internal error: k_final is None")

    if k_final > n_total:
        if strict_count:
            raise ValueError(f"Requested k={k_final}, but only {n_total} images available in {src_images_dir}")
        k_final = n_total

    # --- sample ---
    rng = random.Random(seed)
    chosen_imgs = rng.sample(all_imgs, k_final)

    # --- build pairs + validate pair ---
    pairs: List[YoloPair] = []
    for img in chosen_imgs:
        lbl = _match_label_for_image(img, src_labels_dir)
        if strict_pair and lbl is None:
            raise ValueError(f"Missing label for image: {img.name} (expected {src_labels_dir}/{img.stem}.txt)")
        pairs.append(YoloPair(img=img, lbl=lbl))

    # --- prepare output dirs ---
    out_images = out_root_dir / "images"
    out_labels = out_root_dir / "labels"
    out_images.mkdir(parents=True, exist_ok=True)
    if copy_labels:
        out_labels.mkdir(parents=True, exist_ok=True)

    # --- copy ---
    for p in pairs:
        shutil.copy2(p.img, out_images / p.img.name)
        if copy_labels and p.lbl is not None:
            shutil.copy2(p.lbl, out_labels / p.lbl.name)

    return len(pairs)


In [30]:
from pathlib import Path

SEED = 42


DATASETS_WITH_SPEC = {
    Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms1_0710-0726_36_ok"): {"k": 1637},
    Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms1_0809-0823_34_ok"): {"k": 759},
    Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms2_0726-0809_13_ok"): {"k": 690},
    Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/sw1_0605-0613_07_ok"): {"k": 11},
    Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms1_0605-0621_40_ok"): {"k": 1598},
    Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms1_0726-0809_11_ok"): {"k": 1583},
    Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms2_0809-0823_10_ok"): {"k": 1232},
}

DATASETS_WITH_SPEC = {
    Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms1_0710-0726_36_ok"): {"k": 327},
    Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms1_0809-0823_34_ok"): {"k": 152},
    Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms2_0726-0809_13_ok"): {"k": 138},
    Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/sw1_0605-0613_07_ok"): {"k": 2},
    Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms1_0605-0621_40_ok"): {"k": 320},
    Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms1_0726-0809_11_ok"): {"k": 317},
    Path("/home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms2_0809-0823_10_ok"): {"k": 246},
}

if __name__ == "__main__":
    print("\n==== Random sampling null tiles (ratio/k dual mode, reproducible) ====\n")

    for dataset_dir, spec in DATASETS_WITH_SPEC.items():
        name = dataset_dir.name

        src_images = dataset_dir / "null_images_640_ov20"
        src_labels = dataset_dir / "null_labels_640_ov20"

        # 输出目录名：把 ratio 或 k 体现在名字里，防止覆盖
        if "ratio" in spec:
            ratio = float(spec["ratio"])
            tag = f"r{ratio:.4f}"
            print(f"[{name}] mode=ratio ratio={ratio:.6f}")
            out_dir = dataset_dir / f"{name}_null_samples_{tag}"
            kept = sample_yolo_pairs_random(
                src_images_dir=src_images,
                src_labels_dir=src_labels,
                out_root_dir=out_dir,
                ratio=ratio,
                seed=SEED,
                strict_pair=True,
                strict_count=True,
            )
        elif "k" in spec:
            k = int(spec["k"])
            tag = f"k{k}"
            print(f"[{name}] mode=k k={k}")
            out_dir = dataset_dir / f"{name}_null_samples_{tag}"
            kept = sample_yolo_pairs_random(
                src_images_dir=src_images,
                src_labels_dir=src_labels,
                out_root_dir=out_dir,
                k=k,
                seed=SEED,
                strict_pair=True,
                strict_count=False,  # 不够就自动取 min(k, available)。你想强制报错就改 True
            )
        else:
            raise ValueError(f"Spec must contain 'ratio' or 'k': {spec}")

        print(f"  -> kept {kept} samples")
        print(f"  -> out: {out_dir}\n")

    print("==== Done ====")



==== Random sampling null tiles (ratio/k dual mode, reproducible) ====

[ms1_0710-0726_36_ok] mode=k k=327
  -> kept 327 samples
  -> out: /home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms1_0710-0726_36_ok/ms1_0710-0726_36_ok_null_samples_k327

[ms1_0809-0823_34_ok] mode=k k=152
  -> kept 152 samples
  -> out: /home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms1_0809-0823_34_ok/ms1_0809-0823_34_ok_null_samples_k152

[ms2_0726-0809_13_ok] mode=k k=138
  -> kept 138 samples
  -> out: /home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/ms2_0726-0809_13_ok/ms2_0726-0809_13_ok_null_samples_k138

[sw1_0605-0613_07_ok] mode=k k=2
  -> kept 2 samples
  -> out: /home/tianqi/D/01_Projects/01_swd/02_code/pipeline/ultralytics_ty/_ty/01_data/a02_16mp_2024_datasets_fiftyone/sw1_0605-0613_07_ok/sw1_0605-0613_07_ok_null_s