In [1]:
import pandas as pd
import json

def generate_hierarchical_json(csv_path, output_path):
    """Aggregate a sales CSV by province / city / district and write it as JSON.

    The CSV is read with ``pd.read_csv`` and grouped at three nested levels:
    ``province``, ``province + city`` and ``province + city + district``.
    For every group the ``revenue_rmb``, ``sales_rmb`` and ``visits`` columns
    are summed and handed, together with the group's rows, to ``build_node``.

    Keys of the resulting mapping:

    * province level: the province value itself;
    * city level: ``"<province>-<city>"``, plus the bare city value as an
      alias when that key is not already taken;
    * district level: ``"<province>-<city>-<district>"``, plus the bare
      district value as an alias when that key is not already taken.

    Aliases are first-come-first-served: a name that is reused under several
    parents resolves to whichever group was indexed first (groups are visited
    in pandas' default sorted group-key order). The full "-"-joined key is
    always written and is the unambiguous way to address a node.

    Args:
        csv_path: Path of the input CSV. It must contain the columns
            ``province``, ``city``, ``district``, ``revenue_rmb``,
            ``sales_rmb`` and ``visits``, plus whatever ``build_node`` reads.
        output_path: Path of the JSON file to write (UTF-8, non-ASCII
            characters kept as-is, 2-space indent).

    Returns:
        None. The result is written to ``output_path`` and a one-line summary
        with the number of keys is printed.
    """
    df = pd.read_csv(csv_path)
    result_map = {}

    hierarchy = ('province', 'city', 'district')
    metric_agg = {'revenue_rmb': 'sum', 'sales_rmb': 'sum', 'visits': 'sum'}

    for depth in range(1, len(hierarchy) + 1):
        keys = list(hierarchy[:depth])
        # A plain string for the single-key case keeps the group labels
        # scalar and avoids the length-1-list iteration warning in pandas 1.5.
        grouped = df.groupby(keys[0] if depth == 1 else keys)
        stats = grouped.agg(metric_agg).reset_index()

        # The aggregated rows and the iterated groups both come from the same
        # GroupBy object and are emitted in the same group-key order, so they
        # can be paired positionally. This replaces a full-DataFrame boolean
        # filter per group with a single partitioning pass per level.
        for (_, row), (_, level_df) in zip(stats.iterrows(), grouped):
            node = build_node(level_df, row)
            short_name = row[keys[-1]]

            if depth == 1:
                # Top level: the province value itself is the key.
                result_map[short_name] = node
                continue

            full_key = "-".join(f"{row[k]}" for k in keys)
            result_map[full_key] = node
            # Bare-name alias, only if no earlier node already claimed it.
            if short_name not in result_map:
                result_map[short_name] = node

    # Save the JSON.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result_map, f, ensure_ascii=False, indent=2)

    print(f"处理完成！JSON 已更新。总索引数: {len(result_map)}")

def _revenue_breakdown(level_df, column):
    """Return ``(revenue per value of column rounded to 2 dp, top value)``.

    The top value is the key with the largest rounded revenue, or ``"-"``
    when the breakdown is empty.
    """
    totals = level_df.groupby(column)['revenue_rmb'].sum().to_dict()
    breakdown = {key: round(total, 2) for key, total in totals.items()}
    top = max(breakdown, key=breakdown.get) if breakdown else "-"
    return breakdown, top


def build_node(level_df, row):
    """Build the summary node for one region.

    Args:
        level_df: DataFrame with the rows that belong to the region. It must
            contain the columns ``brand``, ``sku``, ``channel`` and
            ``revenue_rmb``.
        row: Mapping-like object (e.g. a pandas Series or a dict) holding the
            region totals under ``revenue_rmb``, ``sales_rmb`` and ``visits``.

    Returns:
        A dict with:

        * ``revenue`` / ``sales``: the totals rounded to 2 decimals;
        * ``visits``: the visit total as ``int``;
        * ``conversion``: revenue divided by visits, rounded to 2 decimals,
          or ``0`` when visits is not positive;
        * ``brands`` / ``skus`` / ``channels``: revenue per brand / SKU /
          channel, rounded to 2 decimals;
        * ``top_brand`` / ``top_skus`` / ``top_channels``: the single
          highest-revenue key of the matching breakdown, or ``"-"`` when that
          breakdown is empty. Despite the plural names, ``top_skus`` and
          ``top_channels`` each hold one name, not a list.
    """
    rev = row['revenue_rmb']
    vis = row['visits']
    # Guard against division by zero for regions without any visit.
    conv = round(rev / vis, 2) if vis > 0 else 0

    brand_dict, top_brand = _revenue_breakdown(level_df, 'brand')
    sku_dict, top_sku = _revenue_breakdown(level_df, 'sku')
    channel_dict, top_channel = _revenue_breakdown(level_df, 'channel')

    return {
        "revenue": round(rev, 2),
        "sales": round(row['sales_rmb'], 2),
        "visits": int(vis),
        "conversion": conv,
        "top_brand": top_brand,
        "top_skus": top_sku,
        "top_channels": top_channel,
        "brands": brand_dict,
        "skus": sku_dict,
        "channels": channel_dict
    }

if __name__ == "__main__":
    # Script entry point: rebuild the business JSON from the sample CSV.
    # Both paths are relative to the current working directory.
    generate_hierarchical_json(
        csv_path='../data/sample_data.csv',
        output_path='../data/business_data.json',
    )

处理完成！JSON 已更新。总索引数: 189
