In [3]:
# ========= Cell 1: 填写任务信息 =========
from datetime import datetime
import ipynbname
# Run metadata that ends up in the operation log.
# NOTE(review): task_name says 南部 (south) while input_dir points at "2016north" — confirm which is intended.
task_name = "南部转为CSV"
notebook_name = "barrio_population_calculate2.0.ipynb"  # file name, including the .ipynb extension
# Raw string: in a normal literal the "\b" in "CASA0004\barrio..." is a backspace character.
notebook_path = r"CASA0004\barrio_population_calculate2.0.ipynb"  # path relative to the repository
dataset = "2016 JSON & 2016 barrio with pop density"
code_version = "v1.2_G, (2 part of geo cleaning)"
input_dir = r"E:\Dissertation\XGBoost_cleaning\2016north_Mask2Former_with_geobarriokilling_SelectedSegClass"  # folder holding the Mask2Former recognition results (JSON)
geojson_dir = r"E:\Dissertation\CASA0004\barrio_pop_density_clenaed.geojson"  # despite the name, this is the GeoJSON file itself
output_dir = r"E:\Dissertation\XGBoost_cleaning\2016north_Mask2Former_with_geobarriokilling_SelectedSegClass_Transtocsv"  # output folder
# Free-text description of the task; written verbatim into the log.
note = "1.把每张图片的fence,wall,Road,Sidewalk,Building,Person,Bicyclist,Motorcyclist,Other Rider,Sky,Vegetation,Street Light这些离散的segement转为比率与计数变量,比率变量：fence,wall,Road,Sidewalk,Building,Sky,Vegetation（在一张图片里面的占比，字段形如\"area_ratio\": 0.310933），计数变量: Person,Bicyclist,Motorcyclist,Other Rider,Street light提取它在一张图片，（一个json里面的出现次数）,json里面的经纬度，upz码等变量请保留，在这一步完成后输出一个csv,一行是一个json的信息2.以barrio名称为group的依据，计算各项在某barrio下的平均值,一个barrio为一行，上述的各个变量为列；按照barrio名匹配geojson（这个json里面有人口数和密度数据）用图片识别输出json里面我整合过的谋杀量去除barrio json里面的population算谋杀率出来，这个是因变量3.在以上分组匹配完成后输出一个能进XGBoost的csv"

In [4]:
# ========= Cell 2: convert to CSV and aggregate =========
import os
import json
import pandas as pd
import geopandas as gpd
from pathlib import Path
# Make sure the output folder exists before anything is written; exist_ok=True keeps re-runs from raising.
os.makedirs(output_dir, exist_ok= True)
# Target class mappings: segment label name -> output column name.
# The column name is the last "--"-separated part of the label, with "-" turned
# into "_", plus a "_ratio" / "_count" suffix. Tuple order fixes the column order.
_RATIO_CLASSES = (
    "construction--barrier--fence",
    "construction--barrier--wall",
    "construction--flat--road",
    "construction--flat--sidewalk",
    "construction--structure--building",
    "nature--sky",
    "nature--vegetation",
)
_COUNT_CLASSES = (
    "human--person",
    "human--rider--bicyclist",
    "human--rider--motorcyclist",
    "human--rider--other-rider",
    "object--street-light",
)

# Classes summarised by the share of the image they cover.
ratio_labels = {
    label_name: label_name.rsplit("--", 1)[-1].replace("-", "_") + "_ratio"
    for label_name in _RATIO_CLASSES
}

# Classes summarised by how many segments of them appear in the image.
count_labels = {
    label_name: label_name.rsplit("--", 1)[-1].replace("-", "_") + "_count"
    for label_name in _COUNT_CLASSES
}

all_csvs = []  # path of every per-subfolder CSV written below

# ========= Walk the sub-folders =========
# Each sub-folder of input_dir is turned into one CSV with one row per JSON file.
# Both listings are sorted: iterdir()/glob() promise no particular order, and a
# fixed order makes the row order of the output CSVs reproducible across runs.
for sub_dir in sorted(Path(input_dir).iterdir()):
    if not sub_dir.is_dir():
        continue

    records = []
    for file in sorted(sub_dir.glob("*.json")):
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Identification / location fields are copied through unchanged.
        record = {
            "image_name": data.get("image_name"),
            "longitude": data.get("longitude"),
            "latitude": data.get("latitude"),
            "barrio": data.get("barrio"),
            "upz_code": data.get("upz_code"),
            "Cantidad": data.get("Cantidad", 0.0)
        }

        # Every feature column starts at zero so classes absent from an image
        # still produce a value (ratios as float, counts as int).
        record.update(dict.fromkeys(ratio_labels.values(), 0.0))
        record.update(dict.fromkeys(count_labels.values(), 0))

        # `or []` / `or 0.0` also cover an explicit JSON null, which .get()'s
        # default alone would let through and crash the accumulation.
        for seg in data.get("segments") or []:
            label = seg.get("label_name")
            if label in ratio_labels:
                # Several segments of one class add up to the class's total share.
                record[ratio_labels[label]] += seg.get("area_ratio") or 0.0
            if label in count_labels:
                record[count_labels[label]] += 1

        records.append(record)

    # A sub-folder without any JSON file produces no CSV.
    if not records:
        continue

    df_sub = pd.DataFrame(records)

    # One CSV per sub-folder; utf-8-sig writes a BOM at the start of the file.
    sub_csv_path = os.path.join(output_dir, f"{sub_dir.name}_features.csv")
    df_sub.to_csv(sub_csv_path, index=False, encoding="utf-8-sig")
    all_csvs.append(sub_csv_path)
    print(f"完成 {sub_dir.name} -> {sub_csv_path}, 共 {len(df_sub)} 行")

# ========= Merge all per-subfolder CSVs =========
if not all_csvs:
    # pd.concat([]) would fail with the opaque "No objects to concatenate";
    # same exception type, but say what actually went wrong.
    raise ValueError(f"没有可合并的子文件夹 CSV: 在 {input_dir} 的子文件夹中未找到任何 JSON 文件")

dfs = [pd.read_csv(csv) for csv in all_csvs]
df_all = pd.concat(dfs, ignore_index=True)
csv_path = os.path.join(output_dir, "images_features.csv")
df_all.to_csv(csv_path, index=False, encoding="utf-8-sig")
print(f"已输出逐图像特征总表: {csv_path}")

# ========= Aggregate per barrio =========
# Mean of every numeric column per barrio; rows whose barrio is missing are
# dropped by groupby, and non-numeric columns are left out by numeric_only.
df_grouped = df_all.groupby("barrio").mean(numeric_only=True).reset_index()

gdf = gpd.read_file(geojson_dir)
geo_df = gdf[["barriocomu", "population", "pop_density"]].rename(columns={"barriocomu": "barrio"})

# Left join keeps every barrio that has images, matched or not.
merged = df_grouped.merge(geo_df, on="barrio", how="left")

# A barrio name that occurs more than once in the GeoJSON multiplies its row here.
if len(merged) > len(df_grouped):
    print(f"警告: geojson 中存在重复的 barrio 名称, 合并后多出 {len(merged) - len(df_grouped)} 行")

# Barrios without a population value end up with an empty murder_rate.
missing_population = int(merged["population"].isna().sum())
if missing_population:
    print(f"警告: {missing_population} 个 barrio 没有可用的人口数据, murder_rate 为空")

# Mask non-positive populations to NaN so the rate is NaN instead of inf.
valid_population = merged["population"].where(merged["population"] > 0)
merged["murder_rate"] = merged["Cantidad"] / valid_population

final_csv = os.path.join(output_dir, "barrio_features.csv")
merged.to_csv(final_csv, index=False, encoding="utf-8-sig")
print(f"已输出聚合特征: {final_csv}")

完成 batch_001 -> E:\Dissertation\XGBoost_cleaning\2016north_Mask2Former_with_geobarriokilling_SelectedSegClass_Transtocsv\batch_001_features.csv, 共 200 行
完成 batch_002 -> E:\Dissertation\XGBoost_cleaning\2016north_Mask2Former_with_geobarriokilling_SelectedSegClass_Transtocsv\batch_002_features.csv, 共 200 行
完成 batch_003 -> E:\Dissertation\XGBoost_cleaning\2016north_Mask2Former_with_geobarriokilling_SelectedSegClass_Transtocsv\batch_003_features.csv, 共 200 行
完成 batch_004 -> E:\Dissertation\XGBoost_cleaning\2016north_Mask2Former_with_geobarriokilling_SelectedSegClass_Transtocsv\batch_004_features.csv, 共 200 行
完成 batch_005 -> E:\Dissertation\XGBoost_cleaning\2016north_Mask2Former_with_geobarriokilling_SelectedSegClass_Transtocsv\batch_005_features.csv, 共 200 行
完成 batch_006 -> E:\Dissertation\XGBoost_cleaning\2016north_Mask2Former_with_geobarriokilling_SelectedSegClass_Transtocsv\batch_006_features.csv, 共 200 行
完成 batch_007 -> E:\Dissertation\XGBoost_cleaning\2016north_Mask2Former_with_geobar

In [7]:
# ===== Logging =====
# Wall-clock time at which this cell runs, rendered as "YYYY-MM-DD HH:MM:SS".
current_time = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
def append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note,
               repo_dir=r"E:\Dissertation\CASA0004"):
    """Append one run record to ``operation_log.md`` inside *repo_dir*.

    The record is a block of ``**label**: value`` lines followed by a
    separator line. The file is opened in append mode, so earlier records are
    kept. Besides its arguments the function reads the module-level names
    ``notebook_name`` and ``notebook_path``.

    Args:
        task_name: Short name of the task.
        dataset: Description of the data that was processed.
        code_version: Version tag of the code.
        input_dir: Input location, written as-is.
        output_dir: Output location, written as-is.
        status: Outcome of the run.
        duration: How long the run took, as free text.
        note: Free-text remarks.
        repo_dir: Folder that holds ``operation_log.md``. Defaults to the
            repository folder that used to be hard-coded.

    Raises:
        OSError: If the log file cannot be opened for appending (for example
            when *repo_dir* does not exist).
    """
    log_path = f"{repo_dir}/operation_log.md"
    # Stamp the record when it is written rather than relying on a global
    # computed elsewhere.
    recorded_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    entry = [
        f"**任务名称**: {task_name}\n",
        f"**任务文件**: {notebook_name}\n",
        f"**文件路径**: {notebook_path}\n",
        f"**数据集**: {dataset}\n",
        f"**代码版本**: {code_version}\n",
        f"**输入目录**: {input_dir}\n",
        f"**输出目录**: {output_dir}\n",
        f"**状态**: {status}\n",
        f"**耗时**: {duration}\n",
        f"**备注**: {note}\n",
        f"**记录时间**: {recorded_at}\n\n",
        "================分割线================\n\n",
    ]

    # Append the whole record in one call.
    with open(log_path, "a", encoding="utf-8") as f:
        f.writelines(entry)

    print("✅ 日志写入完成")
# Outcome and duration of this run, typed in by hand, then written to the log.
status = "完成，只输出了当前区块的brrio"
duration = "50s"
append_log(
    task_name=task_name,
    dataset=dataset,
    code_version=code_version,
    input_dir=input_dir,
    output_dir=output_dir,
    status=status,
    duration=duration,
    note=note,
)

✅ 日志写入完成


  repo_dir = 'E:\Dissertation\CASA0004'
