In [5]:
# ========= Cell 1: 填写任务信息 =========
from datetime import datetime
import ipynbname
task_name = "calculate population on barrio"
notebook_name = "barrio_population_calculate.ipynb"  # 不带扩展名
notebook_path = "CASA0004\barrio_population_calculate.ipynb"  # 完整路径
dataset = "2016 Barrio & 2018 cencus on MANZANA"
code_version = "v1.0_G, (first part of geo cleaning)"
input_dir = r"E:\Dissertation\CASA0004\Barrios_will_UPZ.geojson" 
manzana_dir = r"E:\Dissertation\XGBoost_cleaning\MANZANA2018CENSO.geojson"    # 输入文件夹
output_dir = "E:/Dissertation/XGBoost_cleaning/barrio_with_population"   # 输出文件夹
note = "1.用barrio边界与manzana，按地理位置聚合，计算barrio的人口并且记录在\"population\"里面。2.如果街区完全包含了manzana，那就把全部的人口都算给它，如果没完全包含，就算[manzana总人口*(重叠面积/manzana的面积(这个数据记录在manzana的shape_area里面))]，四舍五入；3.完成后计算barrio的人口密度，为population/shape_area(这个是barrio数据集的shape_area)"

In [19]:
import geopandas as gpd
import pandas as pd
import os
from shapely.geometry import Polygon

In [23]:
# 输入文件
barrio_file = input_dir   # 你的 barrio 边界数据
manzana_file = manzana_dir  # manzana 数据（2018 人口普查）

# 输出文件
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "barrio_with_population.geojson")

# 读取数据
barrio = gpd.read_file(barrio_file)
manzana = gpd.read_file(manzana_file)

# 投影对齐（保证面积计算一致）
if barrio.crs != manzana.crs:
    manzana = manzana.to_crs(barrio.crs)

# 默认人口字段
pop_field = 'tp27_perso'

# 叠加分析：计算每个 manzana 与 barrio 的相交部分
intersections = gpd.overlay(manzana, barrio, how="intersection")

# 计算面积占比
intersections["area_intersect"] = intersections.geometry.area
intersections["area_ratio"] = intersections["area_intersect"] / intersections["shape_area_2"]
intersections["tp27_perso"] = pd.to_numeric(intersections["tp27_perso"], errors="coerce").fillna(0)
# 计算分配人口（四舍五入）
intersections["pop_allocated"] = (intersections[pop_field] * intersections["area_ratio"]).round()

# 聚合到 barrio 层级：求和
barrio_pop = intersections.groupby("barriocomu")["pop_allocated"].sum().reset_index()

# 合并回原始 barrio
barrio = barrio.merge(barrio_pop, on="barriocomu", how="left")

# 空值（没分配到人口的街区）设为 0
barrio["pop_allocated"] = barrio["pop_allocated"].fillna(0)

# 计算人口密度
barrio["pop_density"] = barrio["pop_allocated"] / barrio["shape_area"]

# 保存结果
barrio.to_file(output_file, driver="GeoJSON", encoding="utf-8")

print("✅ 已完成：人口和人口密度计算")
print(f"输出文件：{output_file}")



  intersections["area_intersect"] = intersections.geometry.area


✅ 已完成：人口和人口密度计算
输出文件：E:/Dissertation/XGBoost_cleaning/barrio_with_population\barrio_with_population.geojson


In [8]:
print(intersections.columns)

Index(['cod_dane_a', 'dpto_ccdgo', 'mpio_ccdgo', 'mpio_cdpmp', 'clas_ccdgo',
       'setr_ccdgo', 'setr_ccnct', 'secr_ccdgo', 'secr_ccnct', 'zu_ccdgo',
       ...
       'cod_loc', 'localidad', 'estado', 'barriocomu', 'cod_polbar',
       'shape_area_2', 'shape_len', 'codigo_upz', 'geometry',
       'area_intersect'],
      dtype='object', length=119)


In [12]:

intersections.to_csv("check_data.csv")

In [21]:
print(list(intersections.columns))


['cod_dane_a', 'dpto_ccdgo', 'mpio_ccdgo', 'mpio_cdpmp', 'clas_ccdgo', 'setr_ccdgo', 'setr_ccnct', 'secr_ccdgo', 'secr_ccnct', 'zu_ccdgo', 'zu_cdivi', 'setu_ccdgo', 'setu_ccnct', 'secu_ccdgo', 'secu_ccnct', 'manz_ccdgo', 'ag_ccdgo', 'dato_anm', 'version', 'area', 'latitud', 'longitud', 'densidad', 'ctnencuest', 'tp3_1_si', 'tp3_2_no', 'tp3a_ri', 'tp3b_tcn', 'tp4_1_si', 'tp4_2_no', 'tp9_1_uso', 'tp9_2_uso', 'tp9_3_uso', 'tp9_4_uso', 'tp9_2_1_mi', 'tp9_2_2_mi', 'tp9_2_3_mi', 'tp9_2_4_mi', 'tp9_2_9_mi', 'tp9_3_1_no', 'tp9_3_2_no', 'tp9_3_3_no', 'tp9_3_4_no', 'tp9_3_5_no', 'tp9_3_6_no', 'tp9_3_7_no', 'tp9_3_8_no', 'tp9_3_9_no', 'tp9_3_10_n', 'tp9_3_99_n', 'tvivienda', 'tp14_1_tip', 'tp14_2_tip', 'tp14_3_tip', 'tp14_4_tip', 'tp14_5_tip', 'tp14_6_tip', 'tp15_1_ocu', 'tp15_2_ocu', 'tp15_3_ocu', 'tp15_4_ocu', 'tp16_hog', 'tp19_ee_1', 'tp19_ee_2', 'tp19_ee_e1', 'tp19_ee_e2', 'tp19_ee_e3', 'tp19_ee_e4', 'tp19_ee_e5', 'tp19_ee_e6', 'tp19_ee_e9', 'tp19_acu_1', 'tp19_acu_2', 'tp19_alc_1', 'tp19_alc

In [10]:
# ===== 记录日志 =====
status="完成" 
duration = "1m57s"
def append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note):
    repo_dir = 'E:\Dissertation\CASA0004'
    log_path = f"{repo_dir}/operation_log.md"

    # 写入日志
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(f"**任务名称**: {task_name}\n")
        f.write(f"**数据集**: {dataset}\n")
        f.write(f"**代码版本**: {code_version}\n")
        f.write(f"**输入目录**: {input_dir}\n")
        f.write(f"**输出目录**: {output_dir}\n")
        f.write(f"**状态**: {status}\n")
        f.write(f"**耗时**: {duration}\n")
        f.write(f"**备注**: {note}\n\n")

    print("✅ 日志写入完成")
append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note)

✅ 日志写入完成


  repo_dir = 'E:\Dissertation\CASA0004'


In [None]:
# ===== 记录日志 =====
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note):
    repo_dir = 'E:\Dissertation\CASA0004'
    log_path = f"{repo_dir}/operation_log.md"

    # 写入日志
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(f"**任务名称**: {task_name}\n")
        f.write(f"**任务文件**: {notebook_name}\n")
        f.write(f"**文件路径**: {notebook_path}\n")
        f.write(f"**数据集**: {dataset}\n")
        f.write(f"**代码版本**: {code_version}\n")
        f.write(f"**输入目录**: {input_dir}\n")
        f.write(f"**输出目录**: {output_dir}\n")
        f.write(f"**状态**: {status}\n")
        f.write(f"**耗时**: {duration}\n")
        f.write(f"**备注**: {note}\n")
        f.write(f"**记录时间**: {current_time}\n\n")
        f.write("================分割线================\n\n")

    print("✅ 日志写入完成")
status="finished"
append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note)

✅ 日志写入完成


  repo_dir = 'E:\Dissertation\CASA0004'
