In [1]:
# ========= Cell 1: 填写任务信息 =========
from datetime import datetime
import ipynbname
import os

# --- Task metadata ---
task_name = "合并json"
code_version = "v1.0_X, (1 part of xgboost)"
# NOTE(review): this note describes NA filling / XGBoost training rather than
# the JSON merge named in task_name — confirm it is the intended text.
note = "把csv里面的na全都填上0，然后训练xgboost预测谋杀率"

# --- Notebook identity ---
# NOTE(review): the original comment said "without extension", but the value
# includes ".ipynb" — confirm which is intended.
notebook_name = "XGBoost.ipynb"
notebook_path = r"CASA0004\XGBoost.ipynb"  # notebook path

# --- Inputs / outputs ---
dataset = r"E:\Dissertation\CASA0004\central_barrio_features.csv"
# NOTE(review): despite its name, input_dir points at a single JSON file
# (the original comment called it a CSV).
input_dir = r"E:\Dissertation\XGBoost_cleaning\label_studio_config_central_new.json"
input_json = r"E:\Dissertation\XGBoost_cleaning\label_studio_config_central.json"
output_dir = r"E:\Dissertation\XGBoost_cleaning"  # output folder

# Create the output folder up front; a no-op when it already exists.
os.makedirs(output_dir, exist_ok=True)

# Echo the key settings so the run is self-describing in the cell output.
print(
    f"任务: {task_name}",
    f"输入文件: {input_dir}",
    f"输出目录: {output_dir}",
    sep="\n",
)


任务: 合并json
输入文件: E:\Dissertation\XGBoost_cleaning\label_studio_config_central_new.json
输出目录: E:\Dissertation\XGBoost_cleaning


In [2]:
import json

def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Load both inputs; each is expected to hold a JSON list so the two can be
# concatenated with ``+``.
data1 = _read_json(input_dir)
data2 = _read_json(input_json)

# Entries of data1 come first, followed by those of data2 (no de-duplication).
merged = data1 + data2

# The output path is relative, so the file lands in the current working directory.
with open("label_studio_config_central_merged.json", "w", encoding="utf-8") as f:
    json.dump(merged, f, ensure_ascii=False, indent=2)


In [4]:
import json
from collections import defaultdict

# Merged JSON file to analyse.
# Raw string: the previous non-raw literal contained "\D", "\C" and "\l", which
# are invalid escape sequences (SyntaxWarning now, an error in a future Python).
# The resulting path value is unchanged.
input_json = r"E:\Dissertation\CASA0004\label_studio_config_central_merged.json"

# Expected number of sampling points; IDs are taken to run from 1 to total_points.
total_points = 34600


def extract_point_numbers(point_ids):
    """Return the set of integer point numbers found in ``point_ids``.

    Only IDs starting with ``"point_"`` are considered; the number is the second
    ``_``-separated field (``"point_42"`` -> ``42``). Any other ID, such as the
    ``"UNKNOWN"`` placeholder, is ignored.

    Raises:
        ValueError: if a ``point_`` ID does not carry an integer in that field.
    """
    return {int(pid.split("_")[1]) for pid in point_ids if pid.startswith("point_")}


def group_missing_intervals(missing_points):
    """Collapse an ascending list of integers into runs of consecutive values.

    Args:
        missing_points: ascending list of distinct integers (may be empty).

    Returns:
        A list of ``(start, end, length)`` tuples, one per run, in ascending order.
    """
    intervals = []
    if not missing_points:
        return intervals
    start = prev = missing_points[0]
    for p in missing_points[1:]:
        if p != prev + 1:
            # Gap found: close the current run and open a new one at p.
            intervals.append((start, prev, prev - start + 1))
            start = p
        prev = p
    # Close the final run.
    intervals.append((start, prev, prev - start + 1))
    return intervals


# Notebook cells execute as __main__, so this section runs normally in Jupyter;
# the guard only keeps the helpers above importable without reading the data file.
if __name__ == "__main__":
    with open(input_json, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Number of items (images) per sampling point ID.
    point_counts = defaultdict(int)
    for item in data:
        pid = item["data"].get("sampling_point_id", "UNKNOWN")
        point_counts[pid] += 1

    # Points that appear at least once, as integers for sorting / interval analysis.
    present_points = extract_point_numbers(point_counts)

    # Every point ID that should exist, and the ones that never appear.
    all_points = set(range(1, total_points + 1))
    missing_points = sorted(all_points - present_points)

    # Runs of consecutive missing IDs, and the ten longest of them.
    missing_intervals = group_missing_intervals(missing_points)
    missing_intervals_sorted = sorted(missing_intervals, key=lambda x: x[2], reverse=True)[:10]

    print(f"理论总采样点数: {total_points}")
    print(f"缺失点总数: {len(missing_points)}")
    print(f"缺失区间总数: {len(missing_intervals)}")
    print("\n缺失最多的前 10 个区间:")
    for start, end, length in missing_intervals_sorted:
        print(f"point_{start} ~ point_{end}  (缺失 {length} 个)")


  input_json = "E:\Dissertation\CASA0004\label_studio_config_central_merged.json"


理论总采样点数: 34600
缺失点总数: 11974
缺失区间总数: 1673

缺失最多的前 10 个区间:
point_1 ~ point_6383  (缺失 6383 个)
point_12032 ~ point_12068  (缺失 37 个)
point_20585 ~ point_20618  (缺失 34 个)
point_12555 ~ point_12585  (缺失 31 个)
point_20443 ~ point_20471  (缺失 29 个)
point_12133 ~ point_12158  (缺失 26 个)
point_16269 ~ point_16294  (缺失 26 个)
point_10183 ~ point_10207  (缺失 25 个)
point_20153 ~ point_20177  (缺失 25 个)
point_20377 ~ point_20400  (缺失 24 个)


In [None]:
# ===== 记录日志 =====
def append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note,
               repo_dir=r"E:\Dissertation\CASA0004"):
    """Append one run record to ``operation_log.md`` inside ``repo_dir``.

    The record is a block of ``**label**: value`` lines followed by a blank
    line. The file is opened in append mode, so earlier records are kept.

    Args:
        task_name: value written on the task-name line.
        dataset: value written on the dataset line.
        code_version: value written on the code-version line.
        input_dir: value written on the input line.
        output_dir: value written on the output line.
        status: value written on the status line.
        duration: value written on the duration line.
        note: value written on the remarks line.
        repo_dir: directory holding the log file. The default is written as a
            raw string; the previous non-raw literal contained the invalid
            escape sequences "\\D" and "\\C". The path value is unchanged.

    Raises:
        OSError: if the log file cannot be opened for appending
            (for example when ``repo_dir`` does not exist).
    """
    log_path = os.path.join(repo_dir, "operation_log.md")

    record = [
        f"**任务名称**: {task_name}\n",
        f"**数据集**: {dataset}\n",
        f"**代码版本**: {code_version}\n",
        f"**输入目录**: {input_dir}\n",
        f"**输出目录**: {output_dir}\n",
        f"**状态**: {status}\n",
        f"**耗时**: {duration}\n",
        f"**备注**: {note}\n\n",
    ]

    # Append mode: each call adds a new record after the existing ones.
    with open(log_path, "a", encoding="utf-8") as f:
        f.writelines(record)

    print("✅ 日志写入完成")
status = "finished"
# `duration` is not assigned in any cell visible in this notebook, so on a fresh
# kernel this call would raise NameError. Use a placeholder unless an earlier
# cell has already measured the run time.
if "duration" not in globals():
    duration = "N/A"
append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note)