In [17]:
# ========= Cell 1: 填写任务信息 =========
from datetime import datetime
task_name = "filter_south"
dataset = "2016south_Mask2Former_with_geobarriokilling"
code_version = "v1.3(third part of cleaning)"
input_dir = "E:/Dissertation/XGBoost_cleaning/2016south_Mask2Former_with_geobarriokilling"     # 输入文件夹
output_dir = "E:/Dissertation/XGBoost_cleaning/2016south_Mask2Former_with_geobarriokilling_SelectedSegClass"   # 输出文件夹
note = "保持原有batch结构,只保留fence,wall,Road,Sidewalk,Building,Person,Bicyclist,Motorcyclist,Other Rider,Sky,Vegetation,Street Light相关字段"
config_file = r"E:\Dissertation\CASA0004\config_v1.2.json"

In [18]:
import os
import json
from tqdm import tqdm

# 目标类别（用 readable 名称指定）
target_classes = [
    "Fence", "Wall", "Road", "Sidewalk", "Building",
    "Person", "Bicyclist", "Motorcyclist", "Other Rider",
    "Sky", "Vegetation", "Street Light"
]

# ========= 读取 config，建立 readable → name 的映射 =========
with open(config_file, "r", encoding="utf-8") as f:
    config = json.load(f)

readable_to_name = {label["readable"]: label["name"] for label in config["labels"]}

# ========= Dryrun 检查 =========
print("=== Dryrun 检查 ===")
target_names = []
for cls in target_classes:
    if cls in readable_to_name:
        print(f"[OK] {cls} -> {readable_to_name[cls]}")
        target_names.append(readable_to_name[cls])
    else:
        print(f"[MISSING] {cls} 在 config 中未找到对应 'name'!")

if len(target_names) < len(target_classes):
    print("⚠️ 有目标类别未找到，请检查 config")
else:
    print("✅ 所有类别都已匹配成功，可以继续执行筛选。")

# ========= 真正的处理（递归子文件夹） =========
if len(target_names) == len(target_classes):
    for root, dirs, files in os.walk(input_dir):
        rel_root = os.path.relpath(root, input_dir)  # 保留目录层级
        out_root = os.path.join(output_dir, rel_root)
        os.makedirs(out_root, exist_ok=True)

        for fname in tqdm(files, desc=f"处理 {rel_root}"):
            if not fname.endswith(".json"):
                continue
            in_path = os.path.join(root, fname)
            out_path = os.path.join(out_root, fname)

            with open(in_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            # 只保留目标类别
            filtered_segments = [
                seg for seg in data.get("segments", [])
                if seg.get("label_name") in target_names
            ]
            data["segments"] = filtered_segments

            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)


=== Dryrun 检查 ===
[OK] Fence -> construction--barrier--fence
[OK] Wall -> construction--barrier--wall
[OK] Road -> construction--flat--road
[OK] Sidewalk -> construction--flat--sidewalk
[OK] Building -> construction--structure--building
[OK] Person -> human--person
[OK] Bicyclist -> human--rider--bicyclist
[OK] Motorcyclist -> human--rider--motorcyclist
[OK] Other Rider -> human--rider--other-rider
[OK] Sky -> nature--sky
[OK] Vegetation -> nature--vegetation
[OK] Street Light -> object--street-light
✅ 所有类别都已匹配成功，可以继续执行筛选。


处理 .: 0it [00:00, ?it/s]


处理 batch_001: 100%|██████████| 200/200 [00:00<00:00, 496.37it/s]
处理 batch_002: 100%|██████████| 200/200 [00:00<00:00, 484.37it/s]
处理 batch_003: 100%|██████████| 200/200 [00:00<00:00, 457.84it/s]
处理 batch_004: 100%|██████████| 200/200 [00:00<00:00, 428.49it/s]
处理 batch_005: 100%|██████████| 200/200 [00:00<00:00, 453.70it/s]
处理 batch_006: 100%|██████████| 200/200 [00:00<00:00, 493.93it/s]
处理 batch_007: 100%|██████████| 200/200 [00:00<00:00, 510.26it/s]
处理 batch_008: 100%|██████████| 200/200 [00:00<00:00, 429.41it/s]
处理 batch_009: 100%|██████████| 200/200 [00:00<00:00, 422.94it/s]
处理 batch_010: 100%|██████████| 200/200 [00:00<00:00, 490.30it/s]
处理 batch_011: 100%|██████████| 200/200 [00:00<00:00, 501.34it/s]
处理 batch_012: 100%|██████████| 200/200 [00:00<00:00, 477.46it/s]
处理 batch_013: 100%|██████████| 200/200 [00:00<00:00, 530.52it/s]
处理 batch_014: 100%|██████████| 200/200 [00:00<00:00, 477.46it/s]
处理 batch_015: 100%|██████████| 200/200 [00:00<00:00, 492.71it/s]
处理 batch_016: 100%|█████

In [19]:
# ===== 记录日志 =====
status="南部街景语义类别删选完成" 
duration = "1m57s"
def append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note):
    repo_dir = 'E:\Dissertation\CASA0004'
    log_path = f"{repo_dir}/operation_log.md"

    # 写入日志
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(f"**任务名称**: {task_name}\n")
        f.write(f"**数据集**: {dataset}\n")
        f.write(f"**代码版本**: {code_version}\n")
        f.write(f"**输入目录**: {input_dir}\n")
        f.write(f"**输出目录**: {output_dir}\n")
        f.write(f"**状态**: {status}\n")
        f.write(f"**耗时**: {duration}\n")
        f.write(f"**备注**: {note}\n\n")

    print("✅ 日志写入完成")
append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note)

✅ 日志写入完成


  repo_dir = 'E:\Dissertation\CASA0004'


In [16]:
# ===== 记录日志 =====
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note):
    repo_dir = 'E:\Dissertation\CASA0004'
    log_path = f"{repo_dir}/operation_log.md"

    # 写入日志
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(f"**任务名称**: {task_name}\n")
        f.write(f"**数据集**: {dataset}\n")
        f.write(f"**代码版本**: {code_version}\n")
        f.write(f"**输入目录**: {input_dir}\n")
        f.write(f"**输出目录**: {output_dir}\n")
        f.write(f"**状态**: {status}\n")
        f.write(f"**耗时**: {duration}\n")
        f.write(f"**备注**: {note}\n")
        f.write(f"**记录时间**: {current_time}\n\n")
        f.write("================分割线================\n\n")

    print("✅ 日志写入完成")
status="finished"
append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note)

✅ 日志写入完成


  repo_dir = 'E:\Dissertation\CASA0004'
