In [None]:
# ========= Cell 1: 填写任务信息 =========
from datetime import datetime
import ipynbname

# Task metadata for this run.
task_name = "聚合垃圾与涂鸦文件"
# NOTE(review): the original comment said "without extension", but the value
# includes ".ipynb" -- confirm which form the consumer of this name expects.
notebook_name = "07_gg_cleaning.ipynb"
# Full notebook path. Must be a raw string: in a normal string literal "\07"
# is an octal escape and would silently become a BEL control character.
notebook_path = r"CASA0004\07_gg_cleaning.ipynb"
dataset = "gg& map_name_geo& barrio_with_geo"
code_version = "v1.0_GG, (first part of GG cleaning)"

# ===== Input / output locations =====
# The original comment describes using {} as a placeholder to switch between
# areas (e.g. central, north, south).
# NOTE(review): the paths below hard-code "central" and contain no {} -- confirm.
input_dir = r"E:\Dissertation\XGBoost_cleaning\2016central_image_final_GG"  # input folder
map_name_geo_dir = r"E:\Dissertation\XGBoost_cleaning\2016central_label_studio_config.json"  # image -> coordinates JSON
barrio_dir = r"E:/Dissertation/XGBoost_cleaning/Barrios_will_UPZ.geojson"  # barrio polygons
output_dir = r"E:\Dissertation\XGBoost_cleaning\2016central_GG_match_barrio_output"  # output folder

# Free-text description of the task. The main cell appends " | Error: ..." to
# it when the run fails.
note = (
    "0.gg文件夹里面的json，分别新建graffiti_exist 和garbage_exist字段。"
    "如果文件中graffiti_count和garbage_count字段不为零，则对应的exist字段赋值为1，反之为0。"
    "2.按照gg文件夹里面的json文件名image_name与map_name_geo读取的json里面记录文件名(image)匹配"
    "(注意，这个字段记录的是路径，只能取后半段真正的文件名，例如batch_001/point_10_front_four_direction_fallback_...jpg)，"
    "并且给该json记录上里面的经纬度信息（longitude，latitude）。"
    "3.在上一步的json基础上，按照经纬度信息，给这些json和barrio的geojson位置取交集，"
    "判断它们属于哪个barrio，加上barrio字段（barrio的json里面记录名称的字段为：barriocomu）,和upz字段(codigo_upz)。"
    "这个操作结束后，带上前几步的信息为每张原始json在输出路径中生成新的json，要分批。"
    "4.按照barrio字段groupby，对garbage和graffiti的count和exist进行sum，输出一个csv。"
)


In [8]:
# ========= Cell 2: 主任务代码 =========
import os
import json
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import time

# Main task: enrich every GG json with exist flags, coordinates and barrio/UPZ,
# write the enriched copies under output_dir (mirroring the input sub-folders),
# then write a per-barrio summary CSV.
import traceback

# Start timing
start_time = time.time()
status = "running"

# Column order of the per-image rows that feed the barrio summary. Fixing the
# columns keeps the groupby below valid even when no json file was processed.
_SUMMARY_COLUMNS = [
    "barrio",
    "upz",
    "graffiti_count",
    "garbage_count",
    "graffiti_exist",
    "garbage_exist",
]


def _load_coordinate_lookup(config_path):
    """Return {image file name: (longitude, latitude)} read from the map JSON."""
    with open(config_path, "r", encoding="utf-8") as f:
        map_records = json.load(f)

    lookup = {}
    for record in map_records:
        fields = record["data"]
        # "image" holds a path; only its last component is the real file name.
        image_filename = os.path.basename(fields["image"])
        lookup[image_filename] = (fields["longitude"], fields["latitude"])
    return lookup


def _as_builtin(value):
    """Convert a NumPy scalar to a plain Python value; pass anything else through."""
    # Values pulled out of a (Geo)DataFrame row may be NumPy scalars, and
    # json.dump cannot serialise NumPy integer scalars.
    return value.item() if hasattr(value, "item") else value


def _find_barrio(gdf, lon, lat, cache):
    """Return (barrio, upz) of the first polygon containing the point, else (None, None).

    Results are memoised in *cache* by (lon, lat) so that repeated coordinates
    do not trigger another scan over every polygon.
    """
    if lon is None or lat is None:
        return None, None

    key = (lon, lat)
    if key not in cache:
        hits = gdf[gdf.contains(Point(lon, lat))]
        if hits.empty:
            cache[key] = (None, None)
        else:
            first = hits.iloc[0]
            cache[key] = (
                _as_builtin(first["barriocomu"]),
                _as_builtin(first["codigo_upz"]),
            )
    return cache[key]


try:
    # os.walk silently yields nothing for a missing folder, so fail early with
    # a clear message instead of an unrelated error further down.
    if not os.path.isdir(input_dir):
        raise FileNotFoundError(f"input_dir is not an existing folder: {input_dir}")

    # 1. Read the map_name_geo file into {image_filename: (lon, lat)}
    map_dict = _load_coordinate_lookup(map_name_geo_dir)

    # 2. Read the barrio geojson and make sure it is in lon/lat coordinates
    barrio_gdf = gpd.read_file(barrio_dir).to_crs("EPSG:4326")

    # Prepare the output folder
    os.makedirs(output_dir, exist_ok=True)

    # Per-image rows used for the summary
    records_for_csv = []
    barrio_cache = {}
    unmatched_files = 0

    # 3. Walk the input folder (including sub-folders) and process every json
    for root, _dirs, files in os.walk(input_dir):
        for fname in files:
            if not fname.endswith(".json"):
                continue

            fpath = os.path.join(root, fname)
            with open(fpath, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Step 1: add graffiti_exist / garbage_exist.
            # "or 0" also covers counts stored as null, which would otherwise
            # raise a TypeError in the comparison.
            graffiti_count = data.get("graffiti_count") or 0
            garbage_count = data.get("garbage_count") or 0
            data["graffiti_exist"] = 1 if graffiti_count > 0 else 0
            data["garbage_exist"] = 1 if garbage_count > 0 else 0

            # Step 2: look up the coordinates by image file name. A missing
            # "image_name" is treated like any other unmatched file rather
            # than aborting the whole batch.
            coords = map_dict.get(data.get("image_name"))
            if coords is None:
                unmatched_files += 1
                lon = lat = None
            else:
                lon, lat = coords
            data["longitude"] = lon
            data["latitude"] = lat

            # Step 3: spatial match against the barrio polygons
            data["barrio"], data["upz"] = _find_barrio(barrio_gdf, lon, lat, barrio_cache)

            # Keep the sub-folder structure in the output path
            rel_path = os.path.relpath(root, input_dir)
            out_dir = os.path.join(output_dir, rel_path)
            os.makedirs(out_dir, exist_ok=True)
            out_path = os.path.join(out_dir, fname)

            # Save the enriched json
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)

            # Keep a row for the summary table
            records_for_csv.append({
                "barrio": data["barrio"],
                "upz": data["upz"],
                "graffiti_count": graffiti_count,
                "garbage_count": garbage_count,
                "graffiti_exist": data["graffiti_exist"],
                "garbage_exist": data["garbage_exist"],
            })

    if unmatched_files:
        print(f"Warning: {unmatched_files} json file(s) had no coordinate match")

    # 4. Aggregate per barrio / UPZ (unmatched images are kept as a NaN group)
    df = pd.DataFrame(records_for_csv, columns=_SUMMARY_COLUMNS)
    summary = df.groupby(["barrio", "upz"], dropna=False).sum(numeric_only=True).reset_index()

    # Save the summary CSV. The "{}" in the name is an unfilled placeholder;
    # it is kept literally so the output file name stays the same as before.
    csv_path = os.path.join(output_dir, "GG_summary_{}.csv")
    summary.to_csv(csv_path, index=False, encoding="utf-8-sig")

    status = "success"

except Exception as e:
    status = "failed"
    note += f" | Error: {str(e)}"
    # Show what went wrong; otherwise the only feedback is the word "failed".
    traceback.print_exc()

finally:
    # Report elapsed time
    duration = round(time.time() - start_time, 2)
    print(f"任务完成，状态: {status}, 耗时: {duration} 秒")


任务完成，状态: failed, 耗时: 0.0 秒
