# 模型自训练伪标签迭代生成和优化 pipeline
# 目标

用已训练好的**虫子检测模型**批量跑多个原始图片目录，导出**裁切小图**与**分类训练数据**（供后续 YOLO-CLS 训练）。
当前检测**只有 1 个类：`insect`**；后续细分类将在分类阶段完成。

# 输入

* 多个原始图片目录（数组）：
  * `/workspace/models/SAHI/run_v9/ElderFarm1_0625_autofocus_rpicam_still_sliced`
  * `/workspace/models/SAHI/run_v9/ElderFarm2_0625_autofocus_rpicam_still_sliced`
  * （可继续追加更多路径）
* 检测模型路径：`/workspace/models/SAHI/run_v9/det_insect_best.pt`
* 检测类别集合：仅 `["insect"]`

# 输出（两种格式，任选其一或同时产出）

**A. YOLO-CLS / ImageNet 目录结构**（最省心）

```
dataset_cls/
  train/
    insect/              # 置信度≥阈值的裁切图
    other/               # （可选）低置信度、可作负样本/待复核
  val/
    insect/
    other/
  test/
    insect/
    other/

```

* 后续直接：`yolo task=classify mode=train data=dataset_cls ...`（分类任务的 `data` 指向数据集根目录，而不是 YAML 文件）

**B. 扁平裁切 + 清单 CSV**（便于溯源/筛选）

```
dataset_cls_flat/
  crops/
    <uuid>_x1y1x2y2_conf0.91_insect_<srcBase>.jpg
  manifests/
    train.csv   # path,label,det_conf,src_image,bbox,split
    val.csv
    test.csv

```

* **label** 默认为 `insect`；置信度低于阈值的样本可标为 `other`，或直接跳过
* 可按 CSV 过滤/重采样，再转成 A 格式训练

# 流程

1. **批量预测（多目录）**
   * 递归遍历输入目录数组中的所有图片
   * 使用模型 `det_insect_best.pt`
   * 仅产出类别 `insect` 的检测框
2. **裁切与命名**
   * 对每个框从原图裁切，四周按比例留边（`pad_ratio`）
   * 命名：`<uuid>_x1y1x2y2_conf{:.2f}_insect_<srcBase>.jpg`
3. **标签生成**
   * A 格式：高于阈值→放 `insect/`；低于阈值→放 `other/`（或丢弃）
   * B 格式：写 CSV（含 `det_conf`、`src_image`、`bbox` 便于追踪）
4. **分层划分（避免数据泄漏）**
   * 先按**原图名/来源目录**分组，再 8:1:1 划分到 train/val/test
   * 同一原图的多个裁切不得跨 split
5. **去重与清理（可选）**
   * NMS 或 IoU 合并、质量过滤（尺寸/亮度/模糊度）
   * pHash 感知去重，减少重复裁切

# 建议默认参数

* `conf_thres = 0.40`（正负样本质量平衡；需时可调）
* `iou_thres = 0.60`（NMS）
* `min_box_size = 8` 像素（按分辨率调整）
* `pad_ratio = 0.08`（四周各 8%）
* `keep_lowconf = True`（低置信度进 `other/`；若不需要可设 False）
* `splits = (0.8, 0.1, 0.1)`（基于**原图组**的分层划分）

# 成功标准

* 产出可直接用于 YOLO-CLS 的数据：
  * **A 格式**：`dataset_cls/{train,val,test}/{insect,other}/...`
  * **或 B 格式**：`dataset_cls_flat/crops/*.jpg` + `manifests/*.csv`（字段完整）
* 每个裁切图的来源、置信度、bbox 可追溯；不同 split 无原图级泄漏。

# 示例调用（思路）

* 传入：`--input-dirs "dir1,dir2,dir3" --model det_insect_best.pt --out dataset_cls --fmt A --conf 0.4 --pad 0.08 --splits 0.8,0.1,0.1`
* 或同时导出 A+B：加 `--fmt A,B`

# 批量预测（多目录）

In [None]:
import os
import cv2
import torch
from pathlib import Path
from tqdm import tqdm
import glob
import json

class InsectDetector:
    """Run a single-class insect detector over batches of image files.

    The wrapped model is an Ultralytics ``YOLO`` object when the
    ``ultralytics`` package can be imported; otherwise a YOLOv5 model loaded
    through ``torch.hub`` is used. Both back-ends are supported by
    ``detect_batch``.
    """

    # Lower-case file extensions that ``find_images`` accepts.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

    def __init__(self, model_path, conf_thres=0.4):
        """Load the detection model.

        Args:
            model_path: Path to the detector weights file.
            conf_thres: Minimum confidence for a detection to be reported.
        """
        self.conf_thres = conf_thres
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.load_model(model_path)

    def load_model(self, model_path):
        """Return the detection model for ``model_path``.

        Tries the ``ultralytics`` package first and falls back to the YOLOv5
        ``torch.hub`` entry point when an ``ImportError`` is raised.
        """
        try:
            from ultralytics import YOLO
            return YOLO(model_path)
        except ImportError:
            model = torch.hub.load('ultralytics/yolov5', 'custom', path=model_path)
            model.conf = self.conf_thres
            return model

    def find_images(self, input_dirs):
        """Recursively collect image paths under every directory in ``input_dirs``.

        Extensions are matched case-insensitively (``.JPG`` counts as an
        image). Paths are returned sorted within each input directory and
        without duplicates, so slicing the result is reproducible between
        runs.

        Args:
            input_dirs: Iterable of directory paths to scan.

        Returns:
            List of image file paths.
        """
        image_files = []
        seen = set()
        for input_dir in input_dirs:
            # Escape the directory so characters such as "[" in its name are
            # not interpreted as glob wildcards.
            pattern = os.path.join(glob.escape(input_dir), '**', '*')
            matches = [
                path for path in glob.glob(pattern, recursive=True)
                if os.path.splitext(path)[1].lower() in self.IMAGE_EXTENSIONS
            ]
            for path in sorted(matches):
                if path not in seen:
                    seen.add(path)
                    image_files.append(path)
        return image_files

    def _detect_ultralytics(self, images):
        """Run an Ultralytics ``YOLO`` model; return one detection list per image."""
        det_results = self.model(images, conf=self.conf_thres, verbose=False)
        per_image = []
        for det in det_results:
            if det.boxes is None:
                per_image.append([])
                continue
            boxes = det.boxes.xyxy.cpu().numpy().tolist()
            confs = det.boxes.conf.cpu().numpy().tolist()
            per_image.append(
                [{"bbox": box, "conf": conf, "cls": 0} for box, conf in zip(boxes, confs)]
            )
        return per_image

    def _detect_yolov5(self, images):
        """Run a ``torch.hub`` YOLOv5 model; return one detection list per image."""
        # The YOLOv5 hub wrapper documents RGB channel order for numpy input,
        # while cv2.imread yields BGR, so the channel axis is reversed here.
        rgb_images = [img[..., ::-1] for img in images]
        predictions = self.model(rgb_images)
        per_image = []
        # Each entry of ``xyxy`` holds rows of (x1, y1, x2, y2, conf, cls).
        for pred in predictions.xyxy:
            rows = pred.cpu().numpy().tolist()
            per_image.append(
                [
                    {"bbox": row[:4], "conf": row[4], "cls": 0}
                    for row in rows
                    if row[4] >= self.conf_thres
                ]
            )
        return per_image

    def detect_batch(self, image_paths, batch_size=8):
        """Detect insects in ``image_paths``, processing ``batch_size`` images at a time.

        Images that OpenCV cannot read are skipped and counted; the count is
        printed at the end when it is non-zero.

        Args:
            image_paths: Sequence of image file paths.
            batch_size: Number of images sent to the model per call (>= 1).

        Returns:
            List of ``{"image_path": str, "detections": [...]}`` dicts, one per
            readable image. Each detection is
            ``{"bbox": [x1, y1, x2, y2], "conf": float, "cls": 0}``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        results = []
        skipped = 0
        print(f"检测 {len(image_paths)} 张图片...")

        for start in tqdm(range(0, len(image_paths), batch_size)):
            batch_paths = image_paths[start:start + batch_size]
            loaded = [(path, cv2.imread(path)) for path in batch_paths]
            readable = [(path, img) for path, img in loaded if img is not None]
            skipped += len(loaded) - len(readable)

            if not readable:
                continue

            valid_paths = [path for path, _ in readable]
            valid_images = [img for _, img in readable]

            # Ultralytics YOLO objects expose ``predict``; anything else is
            # treated as the torch.hub YOLOv5 fallback from ``load_model``.
            if hasattr(self.model, 'predict'):
                per_image = self._detect_ultralytics(valid_images)
            else:
                per_image = self._detect_yolov5(valid_images)

            for path, detections in zip(valid_paths, per_image):
                results.append({"image_path": path, "detections": detections})

        if skipped:
            print(f"跳过 {skipped} 张无法读取的图片")

        return results

def main_step1(input_dirs=None, model_path=None, output_path=None, limit=20):
    """Step 1: detect insects in the source images and dump the results as JSONL.

    Args:
        input_dirs: Directories to scan for images. Defaults to the two
            ElderFarm sliced-image directories.
        model_path: Detector weights path. Defaults to ``det_insect_best.pt``
            under ``/workspace/models/SAHI/run_v9``.
        output_path: Destination JSONL file. Defaults to the absolute path
            that step 2 reads, so both steps agree regardless of the current
            working directory.
        limit: Maximum number of images to process (trial run); ``None``
            processes every image found.
    """
    if input_dirs is None:
        input_dirs = [
            '/workspace/models/SAHI/run_v9/ElderFarm1_0625_autofocus_rpicam_still_sliced',
            '/workspace/models/SAHI/run_v9/ElderFarm2_0625_autofocus_rpicam_still_sliced',
        ]
    if model_path is None:
        model_path = '/workspace/models/SAHI/run_v9/det_insect_best.pt'
    if output_path is None:
        output_path = '/workspace/models/SAHI/run_v9/temp/step1_detection_results.jsonl'

    detector = InsectDetector(model_path, conf_thres=0.4)

    # 1. Locate the images.
    print("查找图片...")
    image_files = detector.find_images(input_dirs)
    print(f"找到 {len(image_files)} 张图片")

    # 2. Batch detection, optionally on only the first ``limit`` images.
    selected = image_files if limit is None else image_files[:limit]
    results = detector.detect_batch(selected)

    # Summary statistics.
    total_detections = sum(len(item["detections"]) for item in results)
    print(f"检测完成: {len(results)} 张图片, {total_detections} 个虫子检测")

    # Persist one JSON object per line.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in results)
    print(f"结果已保存到: {output_path}")

# Entry point: run step 1 when this code is executed directly rather than imported.
if __name__ == "__main__":
    main_step1()

查找图片...
找到 226320 张图片
检测 200 张图片...


100%|██████████| 25/25 [00:00<00:00, 48.67it/s]

检测完成: 200 张图片, 19 个虫子检测
结果已保存到: /workspace/models/SAHI/run_v9/temp/step1_detection_results.jsonl





# 裁切与命名

In [13]:
import cv2
import os
from pathlib import Path
import uuid
import json
import pandas as pd
import numpy as np

class CropProcessor:
    """Cut padded, square crops around detection boxes and write them to disk."""

    def __init__(self, pad_ratio=0.08, target_size=224):
        """Store the crop settings.

        Args:
            pad_ratio: Margin added on each side of the box, as a fraction of
                the box's longer edge.
            target_size: Side length in pixels of every saved crop.
        """
        self.pad_ratio = pad_ratio
        self.target_size = target_size

    def crop_square_with_padding(self, img, bbox):
        """Return a ``target_size`` x ``target_size`` crop centred on ``bbox``.

        The crop window is a square whose side is the longer box edge plus
        ``pad_ratio`` on each side. Parts of the window that fall outside the
        image are filled with zeros, and the visible pixels keep their
        position inside the window, so the box stays centred and its scale
        does not depend on how close it is to the image border.

        Args:
            img: Image array indexed as ``[row, column, ...]``.
            bbox: ``(x1, y1, x2, y2)`` box in pixel coordinates.

        Returns:
            The resized crop, or an empty array (``size == 0``) when the box
            is degenerate or the window does not overlap the image.
        """
        x1, y1, x2, y2 = bbox
        h, w = img.shape[:2]

        center_x = (x1 + x2) / 2
        center_y = (y1 + y2) / 2

        # One integer side length for both axes, so independent truncation of
        # the four edges can no longer yield a crop that is off by one pixel.
        side = int(round(max(x2 - x1, y2 - y1) * (1 + 2 * self.pad_ratio)))
        if side < 1:
            return img[0:0, 0:0]

        left = int(round(center_x - side / 2))
        top = int(round(center_y - side / 2))

        # Portion of the window that lies inside the image.
        src_x1, src_y1 = max(left, 0), max(top, 0)
        src_x2, src_y2 = min(left + side, w), min(top + side, h)
        if src_x2 <= src_x1 or src_y2 <= src_y1:
            return img[0:0, 0:0]

        patch = img[src_y1:src_y2, src_x1:src_x2]
        if patch.shape[0] == side and patch.shape[1] == side:
            square = patch
        else:
            # Zero-filled canvas with the same channel layout and dtype as the
            # image; the patch is pasted where it sits inside the window.
            square = np.zeros((side, side) + img.shape[2:], dtype=img.dtype)
            square[src_y1 - top:src_y2 - top, src_x1 - left:src_x2 - left] = patch

        return cv2.resize(square, (self.target_size, self.target_size))

    def crop_and_save(self, detection_results, output_dir):
        """Crop every detection and save it as a square JPEG in ``output_dir``.

        File names follow
        ``<uuid8>_<x1>-<y1>-<x2>-<y2>_conf<c>_insect_<srcStem>.jpg``; the
        coordinates are dash-separated so they can be read back
        unambiguously. Images without detections are not decoded. Unreadable
        images, empty crops and failed writes are skipped.

        Args:
            detection_results: Iterable of
                ``{"image_path": str, "detections": [{"bbox", "conf"}, ...]}``.
            output_dir: Directory that receives the crops (created if absent).

        Returns:
            List of dicts, one per saved crop, with keys ``crop_path``,
            ``src_image``, ``x1``, ``y1``, ``x2``, ``y2``, ``conf``, ``label``
            and ``crop_size``.
        """
        os.makedirs(output_dir, exist_ok=True)
        crop_data = []

        for item in detection_results:
            img_path = item["image_path"]
            detections = item["detections"]

            # Skip decoding images that have nothing to crop.
            if not detections:
                continue

            img = cv2.imread(img_path)
            if img is None:
                continue

            src_base = Path(img_path).stem

            for det in detections:
                bbox = det["bbox"]
                conf = det["conf"]

                crop = self.crop_square_with_padding(img, bbox)
                if crop.size == 0:
                    continue

                x1, y1, x2, y2 = bbox
                crop_name = (
                    f"{uuid.uuid4().hex[:8]}_{x1:.0f}-{y1:.0f}-{x2:.0f}-{y2:.0f}"
                    f"_conf{conf:.2f}_insect_{src_base}.jpg"
                )
                crop_path = os.path.join(output_dir, crop_name)

                # cv2.imwrite reports failure through its return value; do not
                # list a crop that was never written.
                if not cv2.imwrite(crop_path, crop):
                    continue

                crop_data.append({
                    'crop_path': crop_path,
                    'src_image': img_path,
                    'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2,
                    'conf': conf,
                    'label': 'insect',
                    'crop_size': self.target_size,
                })

        return crop_data

def main_step2(
    detections_path='/workspace/models/SAHI/run_v9/temp/step1_detection_results.jsonl',
    crops_dir='/workspace/models/SAHI/run_v9/temp/crops',
    csv_path='/workspace/models/SAHI/run_v9/temp/step2_crop_info.csv',
):
    """Step 2: crop every step-1 detection to a square image and write a manifest.

    Args:
        detections_path: JSONL file produced by step 1.
        crops_dir: Directory that receives the cropped images.
        csv_path: Destination of the per-crop CSV manifest.
    """
    # Load the step-1 detections, ignoring blank lines (e.g. a trailing
    # newline), which json.loads would reject.
    detection_results = []
    with open(detections_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                detection_results.append(json.loads(line))

    # Crop to squares.
    processor = CropProcessor(pad_ratio=0.08, target_size=224)
    crop_data = processor.crop_and_save(detection_results, crops_dir)

    # Fixed column list so the CSV keeps its header even when no crop was
    # produced; a header-less file cannot be read back with pandas.read_csv.
    columns = ['crop_path', 'src_image', 'x1', 'y1', 'x2', 'y2', 'conf', 'label', 'crop_size']
    df = pd.DataFrame(crop_data, columns=columns)
    csv_dir = os.path.dirname(csv_path)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)
    df.to_csv(csv_path, index=False, encoding='utf-8')

    print(f"裁切完成: {len(crop_data)} 个正方形小图")
    print(f"所有图片尺寸: {processor.target_size}x{processor.target_size}")
    print(f"裁切信息已保存: {csv_path}")

# Entry point: run step 2 when this code is executed directly rather than imported.
if __name__ == "__main__":
    main_step2()

裁切完成: 19 个正方形小图
所有图片尺寸: 224x224
裁切信息已保存: /workspace/models/SAHI/run_v9/temp/step2_crop_info.csv


# 标签生成

# 分层划分（避免数据泄漏）

# 去重与清理（可选）