In [8]:
# ========= Cell 1: 填写任务信息 ========= # 
task_name = "join kill count" 
dataset = "Bogota_points" 
notebook_name = "03JSON_match_homicide_count copy.ipynb"
notebook_path = "CASA0004\03JSON_match_homicide_count copy.ipynb"
code_version = "v1.1(based on json_interselect.ipynb)" 
input_dir = "E:/Dissertation/XGBoost_cleaning/2016north_Mask2Former_with_geobarrio" # 输入文件夹 
output_dir = "E:/Dissertation/XGBoost_cleaning/2016north_Mask2Former_with_geobarriokilling" # 输出文件夹 
note = "南部区块-保持原有batch结构，增加了杀人数量字段(Cantidad)，用Matched_Barrio和json里面的barrio字段匹配，匹配前要把json里面的都变小写" 
kill_file = r"E:\Dissertation\CASA0004\2016homicide_barrio_upz_barriocount_final.csv"

In [2]:
import os
import json
import pandas as pd
from tqdm import tqdm

# === 读入杀人数据 ===
kill_df = pd.read_csv(kill_file, encoding="utf-8-sig")

# 建立映射：Matched_Barrio → Cantidad
kill_map = dict(zip(kill_df["Matched_Barrio"].astype(str).str.lower(), kill_df["Cantidad"]))

# === 遍历输入文件夹（递归） ===
os.makedirs(output_dir, exist_ok=True)

matched_count = 0
total_count = 0

for root, _, files in os.walk(input_dir):
    for fname in files:
        if not fname.endswith(".json"):
            continue

        in_path = os.path.join(root, fname)
        # 保持原有目录层级
        rel_path = os.path.relpath(in_path, input_dir)
        out_path = os.path.join(output_dir, rel_path)
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

        with open(in_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        if "barrio" in data:
            total_count += 1
            barrio_norm = str(data["barrio"]).lower().strip()
            if barrio_norm in kill_map:
                data["Cantidad"] = float(kill_map[barrio_norm])
                matched_count += 1
            else:
                data["Cantidad"] = 0.0  # 没匹配到 → 填 0

        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

# === 输出统计 ===
print(f"总数: {total_count}")
print(f"匹配上: {matched_count}")
if total_count > 0:
    print(f"占比: {matched_count/total_count:.2%}")
else:
    print("⚠️ 没有 JSON 文件被处理到，检查 input_dir 是否正确。")


总数: 50764
匹配上: 6944
占比: 13.68%


In [10]:
# ===== 记录日志 =====
from datetime import datetime
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note):
    repo_dir = 'E:\Dissertation\CASA0004'
    log_path = f"{repo_dir}/operation_log.md"

    # 写入日志
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(f"**任务名称**: {task_name}\n")
        f.write(f"**任务文件**: {notebook_name}\n")
        f.write(f"**文件路径**: {notebook_path}\n")
        f.write(f"**数据集**: {dataset}\n")
        f.write(f"**代码版本**: {code_version}\n")
        f.write(f"**输入目录**: {input_dir}\n")
        f.write(f"**输出目录**: {output_dir}\n")
        f.write(f"**状态**: {status}\n")
        f.write(f"**耗时**: {duration}\n")
        f.write(f"**备注**: {note}\n")
        f.write(f"**记录时间**: {current_time}\n\n")
        f.write("================分割线================\n\n")

    print("✅ 日志写入完成")
status="完成，north匹配完成，投影正确"
duration = "2min"
append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note)

✅ 日志写入完成


  repo_dir = 'E:\Dissertation\CASA0004'
