# In [None]:
import networkx as nx
import pandas as pd
import os
from collections import defaultdict
from community import community_louvain 
import time

# ==========================================
# 1. CONFIGURATION (output path corrected)
# ==========================================

# Input path of the raw data.
# According to your screenshot, the data files are located at this path.
DATA_PATH = r"E:\Network Science Project\pypi_dag"
EDGES_FILE = os.path.join(DATA_PATH, "edges.csv") 

# Output path: a new LPA_Analysis folder is created under the RealData folder.
# The directory is created as soon as this module-level code runs;
# exist_ok=True makes repeated runs harmless.
OUTPUT_ROOT_PATH = r"E:\Network Science Project\RealData\LPA_Analysis"
os.makedirs(OUTPUT_ROOT_PATH, exist_ok=True) 

# GEXF visualisation parameters
PAGERANK_SAMPLE_NODES = 200 # number of PageRank-sampled nodes used for file 1

# ==========================================
# 2. CORE FUNCTIONS
# ==========================================

def load_graph(edges_file: str) -> nx.DiGraph:
    """Load a directed graph from an edge-list CSV file.

    The CSV must contain a ``source`` and a ``target`` column; each row
    becomes one directed edge ``source -> target``.

    Args:
        edges_file: Path to the edge-list CSV.

    Returns:
        The directed graph built from the edge list.

    Raises:
        FileNotFoundError: If the file cannot be opened or parsed, or if a
            required column is missing. The underlying error is attached as
            ``__cause__``.
    """
    print(f"Loading graph from {edges_file}...")
    try:
        df_edges = pd.read_csv(edges_file)

        # Fail with a readable message instead of a bare column-name KeyError.
        missing = [col for col in ('source', 'target') if col not in df_edges.columns]
        if missing:
            raise ValueError(
                f"edge list is missing required column(s): {', '.join(missing)}"
            )

        # The keyword for the graph type is ``create_using``; a misspelt
        # keyword makes networkx raise TypeError before any graph is built.
        G = nx.from_pandas_edgelist(
            df_edges,
            source='source',
            target='target',
            create_using=nx.DiGraph(),
        )
    except (OSError, ValueError) as e:
        # Callers catch FileNotFoundError, so that type is kept, but the real
        # cause is chained so it is not lost. Programming errors (TypeError
        # and the like) are deliberately NOT converted.
        print(f"Error loading graph: {e}")
        raise FileNotFoundError(f"Could not load graph from {edges_file}") from e

    print(f"Graph loaded. Nodes: {G.number_of_nodes()}, Edges: {G.number_of_edges()}")
    return G

def run_lpa_community_detection(G: nx.DiGraph) -> tuple:
    """Detect communities with label propagation and score them by modularity.

    Args:
        G: Directed graph to analyse. It is not modified; an undirected copy
            is used for both the detection and the scoring.

    Returns:
        A ``(partition, modularity_score)`` tuple. ``partition`` maps every
        node to an integer community id, numbered in the order networkx
        yields the communities. ``modularity_score`` is the modularity of
        that partition, computed on the undirected copy of ``G``.
    """
    # Community algorithms are not exported in the top-level ``networkx``
    # namespace, so the function is imported from its subpackage.
    from networkx.algorithms.community import label_propagation_communities

    print("Running Label Propagation Algorithm (LPA)...")

    # networkx's LPA and python-louvain's modularity both require an
    # undirected graph; convert once and reuse the copy for both.
    undirected = G.to_undirected()

    partition = {}
    for comm_id, nodes in enumerate(label_propagation_communities(undirected)):
        for node in nodes:
            partition[node] = comm_id

    # Evaluate the LPA result with python-louvain's modularity function.
    modularity_score = community_louvain.modularity(partition, undirected)
    print(f"LPA completed. Modularity Score (Q): {modularity_score:.4f}")
    return partition, modularity_score

def analyze_community_structure(G: nx.DiGraph, partition: dict, modularity_score: float) -> dict:
    """Summarise a partition: the ten largest communities and their core hubs.

    Args:
        G: Graph the partition was computed on; used for the node count and
            for PageRank.
        partition: Mapping of node -> community id.
        modularity_score: Modularity of ``partition``; passed through
            unchanged into the result.

    Returns:
        A dict with the keys ``graph_name``, ``modularity``,
        ``num_communities``, ``partition``, ``top_communities_summary``
        (one dict per community, largest first, at most ten),
        ``total_nodes`` and ``pagerank_scores``.
    """
    # Invert node -> community into community -> [member nodes].
    members_by_community = defaultdict(list)
    for member, label in partition.items():
        members_by_community[label].append(member)

    # Largest community first; the sort is stable, so equally sized
    # communities keep their first-seen order.
    ranked = sorted(
        members_by_community.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )
    node_total = G.number_of_nodes()

    # PageRank is computed once for the whole graph and reused per community.
    print("Calculating PageRank for Core Hubs...")
    pagerank_scores = nx.pagerank(G, alpha=0.85)

    print("\n--- TOP 10 LARGEST STACKS ---")
    top_10_summary = []
    for position, (label, members) in enumerate(ranked[:10], start=1):
        member_count = len(members)
        share = (member_count / node_total) * 100

        # Core hubs: the (up to) eight members with the highest PageRank.
        by_score = sorted(
            members,
            key=lambda n: pagerank_scores.get(n, 0),
            reverse=True,
        )
        hub_names = ', '.join(by_score[:8])

        top_10_summary.append({
            'Rank': position,
            'ID': label,
            'Size': member_count,
            'Share': share,
            'Core Hubs': hub_names,
        })

        print(f"Stack #{position} (ID: {label}): Size={member_count} ({share:.2f}%)")
        print(f"  Core Hubs: {hub_names}")

    return {
        "graph_name": "Real PyPI Network (LPA)",
        "modularity": modularity_score,
        "num_communities": len(ranked),
        "partition": partition,
        "top_communities_summary": top_10_summary,
        "total_nodes": node_total,
        "pagerank_scores": pagerank_scores,
    }

def save_analysis_summary(results: dict, G: nx.DiGraph, output_path: str):
    """Render the analysis results as a plain-text report and write it to disk.

    Args:
        results: Must provide the keys ``graph_name``, ``total_nodes``,
            ``modularity``, ``num_communities`` and
            ``top_communities_summary`` (a list of dicts with the keys
            ``Rank``, ``ID``, ``Size``, ``Share`` and ``Core Hubs``).
        G: Graph the results belong to; only its edge count is read.
        output_path: Destination file; it is overwritten and written as UTF-8.
    """
    rule = "=================================================="
    stacks = results['top_communities_summary']
    combined_share = sum(entry['Share'] for entry in stacks)

    # NOTE(review): the "DAG Property" line is fixed text; nothing in this
    # function checks G for cycles. Confirm the input is guaranteed acyclic.
    header_lines = [
        "",
        rule,
        "        NETWORK ANALYSIS SUMMARY (LPA)",
        rule,
        f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}",
        f"Graph Name: {results['graph_name']}",
        "",
        "--- GRAPH DIAGNOSTICS ---",
        f"Nodes (Packages): {results['total_nodes']}",
        f"Edges (Dependencies): {G.number_of_edges()}",
        "DAG Property: Valid (No cycles) ",
        "",
        "--- COMMUNITY STRUCTURE (LPA) ---",
        f"Modularity Score (Q): {results['modularity']:.4f} ",
        f"Total Communities Found: {results['num_communities']}",
        "",
        "--- TOP 10 LARGEST STACKS ---",
        f"(Total Share: {combined_share:.2f}%)",
    ]

    # Collect the report in pieces and join once at the end.
    pieces = ["\n".join(header_lines) + "\n"]
    for entry in stacks:
        pieces.append(
            f"\nStack #{entry['Rank']} (ID: {entry['ID']}): "
            f"Size={entry['Size']} ({entry['Share']:.2f}%)\n"
            f"  Core Hubs: {entry['Core Hubs']}\n"
        )
    pieces.append("\n" + rule)

    with open(output_path, 'w', encoding='utf-8') as report:
        report.write("".join(pieces))

    print(f"\nAnalysis summary saved to: {output_path}")

# ==========================================
# 3. GEXF EXPORT FUNCTIONS (PLACEHOLDERS)
# ==========================================

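# ⚠️ WARNING: Paste the functions from your original script that generate the GEXF files here!
# These functions typically include:
# 1. def build_pagerank_core(...)  # generates 1_...gexf
# 2. def build_community_graph(...) # generates 3_...gexf
# Without these functions, no GEXF files will be generated.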


# ==========================================
# 4. MAIN EXECUTION
# ==========================================

if __name__ == "__main__":

    # 1. Load the graph. A load failure is fatal: report it and stop with a
    #    non-zero exit status so the caller can tell the run failed.
    try:
        G_real = load_graph(EDGES_FILE)
    except FileNotFoundError as e:
        print(f"FATAL ERROR: {e}")
        # SystemExit is raised directly: the ``exit()`` helper is injected by
        # the ``site`` module and, called without arguments, reports success.
        raise SystemExit(1)

    # 2. Run LPA community detection
    partition_lpa, modularity_lpa = run_lpa_community_detection(G_real)

    # 3. Analyse the result
    analysis_results = analyze_community_structure(G_real, partition_lpa, modularity_lpa)

    # 4. Save the summary
    summary_output_path = os.path.join(OUTPUT_ROOT_PATH, "analysis_summary_lpa.txt")
    save_analysis_summary(analysis_results, G_real, summary_output_path)

    # 5. ⚠️ Call the GEXF export functions (to be supplied by you)
    # print("\nStarting GEXF export...")
    # if 'build_pagerank_core' in locals():
    #     build_pagerank_core(G_real, analysis_results['pagerank_scores'], analysis_results['partition'], OUTPUT_ROOT_PATH, PAGERANK_SAMPLE_NODES)
    # if 'build_community_graph' in locals():
    #     build_community_graph(G_real, analysis_results['partition'], OUTPUT_ROOT_PATH)

    print(f"\nLPA analysis complete. Results saved to: {OUTPUT_ROOT_PATH}")