In [None]:
# (Objective3) 02_LPA_RealData
import networkx as nx
import networkx.algorithms.community as nx_comm  # Key: correctly import community algorithms module
import pandas as pd
import os
from collections import defaultdict
from community import community_louvain  # Only used to compute modularity Q
import time

# ==========================================
# 1. CONFIGURATION
# ==========================================

# Input paths
# NOTE: absolute, machine-specific Windows locations; adjust them when running elsewhere.
DATA_PATH = r"E:\Network Science Project\pypi_dag"
EDGES_FILE = os.path.join(DATA_PATH, "edges.csv")

# Output path: the RealData results folder is created as soon as this cell runs.
# exist_ok=True makes re-running the cell harmless when the folder already exists.
OUTPUT_ROOT_PATH = r"E:\Network Science Project\02_LPA_method\RealData"
os.makedirs(OUTPUT_ROOT_PATH, exist_ok=True)

# Visualization parameters
PAGERANK_SAMPLE_NODES = 200  # File 1: number of top-PageRank nodes kept in the core subgraph
TOP_COMMUNITIES_SAMPLE = 5   # File 2: number of top communities (not referenced by the code visible in this cell)

# ==========================================
# 2. DATA LOADING & LPA CORE
# ==========================================

def load_graph(edges_file: str) -> nx.DiGraph:
    """Load the edge-list CSV into a directed graph.

    The CSV must provide ``source`` and ``target`` columns; every row becomes
    one directed edge ``source -> target``.

    pandas' default NA parsing converts literal cell values such as ``nan``,
    ``null``, ``NA`` or ``None`` into float NaN. Those strings can be
    legitimate node labels (e.g. package names), so default NA detection is
    disabled and only truly empty cells count as missing. Rows with a missing
    endpoint cannot form an edge and are dropped (the count is reported).

    Args:
        edges_file: Path to the edge-list CSV file.

    Returns:
        The directed graph built from the edge list.

    Raises:
        Exception: Any error raised while reading the CSV or building the
            graph is reported on stdout and then re-raised unchanged.
    """
    print(f"Loading graph from {edges_file}...")
    try:
        # Keep labels like "nan"/"null" as strings; only empty cells are NA.
        df_edges = pd.read_csv(edges_file, keep_default_na=False, na_values=[""])

        rows_before = len(df_edges)
        df_edges = df_edges.dropna(subset=['source', 'target'])
        rows_dropped = rows_before - len(df_edges)
        if rows_dropped:
            print(f"Warning: dropped {rows_dropped} row(s) with an empty source/target.")

        G = nx.from_pandas_edgelist(
            df_edges,
            source='source',
            target='target',
            create_using=nx.DiGraph()
        )
        print(f"Graph loaded. Nodes: {G.number_of_nodes()}, Edges: {G.number_of_edges()}")
        return G
    except Exception as e:
        print(f"Error reading CSV: {e}")
        raise

def run_lpa_community_detection(G: nx.DiGraph) -> tuple:
    """Detect communities with label propagation and score them with modularity.

    The directed input is converted to an undirected graph, label propagation
    is run on it, and the resulting node groups are flattened into a
    ``{node: community_id}`` mapping (IDs follow the order in which the
    communities are produced). Modularity is evaluated with the Louvain
    package's helper; if that evaluation raises, a warning is printed and the
    score falls back to ``0.0``.

    Returns:
        ``(partition, modularity_score)``.
    """
    print("Running Label Propagation Algorithm (LPA)...")

    # Work on the undirected view of the graph for both LPA and modularity.
    undirected = G.to_undirected()
    communities = nx_comm.label_propagation_communities(undirected)

    # Flatten the groups of nodes into a per-node community label.
    partition = {
        member: label
        for label, members in enumerate(communities)
        for member in members
    }

    # The modularity evaluation can be memory hungry on large graphs, hence
    # the best-effort fallback below.
    print("Calculating Modularity (Q)...")
    try:
        modularity_score = community_louvain.modularity(partition, undirected)
        print(f"LPA completed. Modularity Score (Q): {modularity_score:.4f}")
    except Exception as e:
        print(f"Warning: Could not calculate modularity (possibly OOM). Setting to 0. Error: {e}")
        modularity_score = 0.0

    return partition, modularity_score

def analyze_structure(G: nx.DiGraph, partition: dict, modularity_score: float) -> dict:
    """Summarize a partition: community sizes, PageRank hubs and a Top 10 report.

    Args:
        G: The directed graph the partition was computed for.
        partition: Mapping ``{node: community_id}``.
        modularity_score: Modularity of ``partition``; passed through to the result.

    Returns:
        A dict with the keys ``graph_name``, ``modularity``,
        ``num_communities``, ``partition``, ``top_communities_summary``
        (one entry per Top 10 community), ``community_hubs``
        (``{community_id: "hub1, hub2, ..."}`` for every community),
        ``pagerank_scores``, ``total_nodes`` and ``full_sorted_communities``
        (``(community_id, nodes)`` pairs, largest first).
    """
    print("Analyzing structure...")

    # Group nodes by community.
    community_map = defaultdict(list)
    for node, comm_id in partition.items():
        community_map[comm_id].append(node)

    # Largest communities first.
    sorted_communities = sorted(community_map.items(), key=lambda x: len(x[1]), reverse=True)

    # PageRank on the directed graph is used to pick each community's hubs.
    print("Calculating PageRank for Hub identification...")
    pagerank_scores = nx.pagerank(G, alpha=0.85)

    total_nodes = G.number_of_nodes()

    # Hubs are computed for every community, not just the reported Top 10,
    # because the abstract community graph export needs them for all nodes.
    community_hubs = {}
    for comm_id, nodes in sorted_communities:
        node_pr = [(n, pagerank_scores.get(n, 0)) for n in nodes]
        top_hubs = sorted(node_pr, key=lambda x: x[1], reverse=True)[:5]
        # str() keeps the join from raising TypeError when node labels are
        # not strings (e.g. integer IDs or NaN produced by CSV parsing).
        community_hubs[comm_id] = ', '.join(str(n) for n, _ in top_hubs)

    # Only the Top 10 communities are reported.
    top_10_summary = []
    print("\n--- TOP 10 LPA STACKS ---")
    for rank, (comm_id, nodes) in enumerate(sorted_communities[:10], start=1):
        size = len(nodes)
        # Guard against a partition passed together with an empty graph.
        share = (size / total_nodes) * 100 if total_nodes else 0.0

        stack_info = {
            'Rank': rank,
            'ID': comm_id,
            'Size': size,
            'Share': share,
            'Core Hubs': community_hubs[comm_id]
        }
        top_10_summary.append(stack_info)
        print(f"Stack #{rank} (ID: {comm_id}): Size={size} ({share:.2f}%) -> Hubs: {stack_info['Core Hubs']}")

    return {
        "graph_name": "Real PyPI (LPA)",
        "modularity": modularity_score,
        "num_communities": len(sorted_communities),
        "partition": partition,
        "top_communities_summary": top_10_summary,
        "community_hubs": community_hubs,
        "pagerank_scores": pagerank_scores,
        "total_nodes": total_nodes,
        "full_sorted_communities": sorted_communities  # Used for file 2
    }

# ==========================================
# 3. VISUALIZATION EXPORT FUNCTIONS (FULL VERSION)
# ==========================================

def export_for_visualization(G: nx.DiGraph, partition: dict, output_path: str, filename: str):
    """Write an annotated copy of ``G`` to ``output_path/filename`` as GEXF.

    The input graph is not modified: a copy receives a ``CommunityID`` node
    attribute taken from ``partition`` and a ``LinkType`` edge attribute that
    is ``'Internal'`` when both endpoints map to the same partition value and
    ``'Inter-Community'`` otherwise.
    """
    file_path = os.path.join(output_path, filename)
    print(f"  - Exporting GEXF: {filename}...")

    G_export = G.copy()
    nx.set_node_attributes(G_export, partition, name='CommunityID')

    # Classify every edge by comparing the community of its two endpoints.
    link_types = {
        (src, dst): (
            'Internal'
            if partition.get(src) == partition.get(dst)
            else 'Inter-Community'
        )
        for src, dst in G_export.edges()
    }
    nx.set_edge_attributes(G_export, link_types, name='LinkType')

    nx.write_gexf(G_export, file_path)
    print(f"    Done. Saved to {file_path}")

def extract_core_subgraph(G, partition, pr_scores, num_nodes):
    """Strategy 1: keep only the ``num_nodes`` highest-PageRank nodes.

    Returns ``(subgraph, sub_partition)``: an independent copy of the induced
    subgraph whose nodes carry a ``PageRank`` attribute (handy for sizing
    nodes in Gephi), and the partition restricted to the selected nodes.
    """
    print(f"  - Extracting Top {num_nodes} PageRank nodes...")

    ranked_nodes = sorted(pr_scores, key=pr_scores.get, reverse=True)
    top_nodes = ranked_nodes[:num_nodes]
    subgraph = G.subgraph(top_nodes).copy()

    # One pass collects both the PageRank attribute and the restricted partition.
    pagerank_attr = {}
    sub_partition = {}
    for node in top_nodes:
        pagerank_attr[node] = pr_scores[node]
        if node in partition:
            sub_partition[node] = partition[node]

    nx.set_node_attributes(subgraph, pagerank_attr, name='PageRank')
    return subgraph, sub_partition

def build_community_graph(G, partition, community_hubs):
    """Strategy 2: collapse every community into one node (macro view).

    Each community becomes a node carrying ``size``, ``label`` and
    ``Core_Hubs`` attributes. A directed edge between two communities is
    weighted by the number of edges of ``G`` going from the first to the
    second; edges inside a community, or touching a node without a
    community, are ignored.
    """
    print("  - Building Abstract Community Graph...")

    # Number of member nodes per community.
    sizes = defaultdict(int)
    for comm_id in partition.values():
        sizes[comm_id] += 1

    # Number of original edges crossing from one community to another.
    crossings = defaultdict(int)
    for src, dst in G.edges():
        src_comm = partition.get(src)
        dst_comm = partition.get(dst)
        if src_comm is None or dst_comm is None or src_comm == dst_comm:
            continue
        crossings[(src_comm, dst_comm)] += 1

    G_comm = nx.DiGraph()

    for comm_id, size in sizes.items():
        G_comm.add_node(
            comm_id,
            size=size,
            label=f"Comm {comm_id}",
            Core_Hubs=community_hubs.get(comm_id, "N/A")
        )

    for (src_comm, dst_comm), count in crossings.items():
        G_comm.add_edge(src_comm, dst_comm, weight=count)

    return G_comm

def save_txt_summary(results, filepath):
    """Write the plain-text analysis report for ``results`` to ``filepath``.

    ``results`` must provide ``modularity``, ``num_communities`` and
    ``top_communities_summary`` (a list of dicts with the keys ``Rank``,
    ``ID``, ``Size``, ``Share`` and ``Core Hubs``). The file is written as
    UTF-8 and overwritten if it already exists.
    """
    def _report_lines():
        # Header block first, then one line per reported community.
        yield "ANALYSIS SUMMARY (LPA Algorithm)\n"
        yield "=================================\n"
        yield f"Modularity (Q): {results['modularity']:.4f}\n"
        yield f"Total Communities: {results['num_communities']}\n\n"
        yield "TOP 10 STACKS:\n"
        for stack in results['top_communities_summary']:
            yield (
                f"Rank {stack['Rank']} (ID {stack['ID']}): "
                f"Size {stack['Size']} ({stack['Share']:.2f}%) - "
                f"Hubs: {stack['Core Hubs']}\n"
            )

    with open(filepath, 'w', encoding='utf-8') as out:
        out.writelines(_report_lines())
    print(f"Summary saved to {filepath}")

# ==========================================
# 4. MAIN EXECUTION
# ==========================================

# Entry point for the real-data LPA analysis: load the edge list, detect
# communities, summarize them, then write one text report and two GEXF files
# into OUTPUT_ROOT_PATH. Any exception is reported with a traceback instead of
# propagating, so a failure does not abort the surrounding session.
if __name__ == "__main__":
    print("--- STARTING LPA ANALYSIS ---")
    
    try:
        # 1. Load data (directed graph built from the edge-list CSV)
        G_real = load_graph(EDGES_FILE)
        
        # 2. Run LPA: returns the {node: community_id} partition and its modularity
        partition_lpa, q_lpa = run_lpa_community_detection(G_real)
        
        # 3. Analyze structure (community sizes, PageRank hubs, Top 10 report)
        results = analyze_structure(G_real, partition_lpa, q_lpa)
        
        # 4. Export results
        print("\n--- EXPORTING RESULTS ---")
        
        # A. Text summary
        save_txt_summary(results, os.path.join(OUTPUT_ROOT_PATH, "analysis_summary_lpa.txt"))
        
        # B. Strategy 1: PageRank core graph
        # Reuses the PageRank scores already computed by analyze_structure.
        # Note: LPA results may be less regular than Louvain;
        # top 200 nodes may concentrate in a few large communities
        G_core, part_core = extract_core_subgraph(
            G_real,
            partition_lpa,
            results['pagerank_scores'],
            PAGERANK_SAMPLE_NODES
        )
        export_for_visualization(
            G_core,
            part_core,
            OUTPUT_ROOT_PATH,
            "1_pagerank_core_lpa.gexf"
        )
        
        # C. Strategy 2: abstract community graph (one node per community)
        G_abstract = build_community_graph(
            G_real,
            partition_lpa,
            results['community_hubs']
        )
        # For the abstract graph, the partition is the community ID itself:
        # every node is its own community, so each exported edge (which always
        # joins two different communities) is labeled 'Inter-Community'.
        abstract_part = {n: n for n in G_abstract.nodes()}
        export_for_visualization(
            G_abstract,
            abstract_part,
            OUTPUT_ROOT_PATH,
            "2_abstract_community_network_lpa.gexf"
        )
        
        print("\n LPA Analysis COMPLETE!")
        print(f"Files saved in: {OUTPUT_ROOT_PATH}")
        
    except Exception as e:
        # Top-level boundary: report the failure and its full traceback.
        print(f"\n FATAL ERROR: {e}")
        import traceback
        traceback.print_exc()


--- STARTING LPA ANALYSIS ---
Loading graph from E:\Network Science Project\pypi_dag\edges.csv...
Graph loaded. Nodes: 397798, Edges: 1819937
Running Label Propagation Algorithm (LPA)...
Calculating Modularity (Q)...
LPA completed. Modularity Score (Q): 0.1140
Analyzing structure...
Calculating PageRank for Hub identification...

--- TOP 10 LPA STACKS ---
Stack #1 (ID: 0): Size=343387 (86.32%) -> Hubs: numpy, typing-extensions, requests, colorama, six
Stack #2 (ID: 78): Size=18049 (4.54%) -> Hubs: odoo, python-stdnum, odoo14-addon-ssi-transaction-mixin, openupgradelib, odoo14-addon-ssi-master-data-mixin
Stack #3 (ID: 2): Size=6265 (1.57%) -> Hubs: django, djangorestframework, wagtail, django-cms, django-model-utils
Stack #4 (ID: 13): Size=1765 (0.44%) -> Hubs: typeguard, publication, jsii, constructs, aws-cdk-lib
Stack #5 (ID: 54): Size=748 (0.19%) -> Hubs: python-sdk-remote, octodns, shipyard-templates, ckantools, simplematch
Stack #6 (ID: 5): Size=666 (0.17%) -> Hubs: zope-interface,

In [None]:
# (Objective5) 02_LPA_RandomData
import pickle
import os
import networkx as nx
import pandas as pd
import networkx.algorithms.community as nx_comm
from community import community_louvain  # Used to compute modularity Q
import time
from collections import defaultdict

# ==========================================
# 1. CONFIGURATION
# ==========================================

# INPUT: Random graphs generated for Louvain baseline
# The main loop reads random_graph_1.pkl ... random_graph_<NUM_GRAPHS>.pkl from here.
# NOTE: absolute, machine-specific Windows location.
BASE_INPUT_PATH = r"E:\Network Science Project\01_Louvain_method\RandomData"

# OUTPUT: LPA analysis results on random data
# Each input file gets its own "<file name without .pkl>_output" subfolder here.
BASE_OUTPUT_PATH = r"E:\Network Science Project\02_LPA_method\RandomData"

# Number of files to process (files are numbered starting at 1)
NUM_GRAPHS = 5

# ==========================================
# 2. CORE FUNCTIONS
# ==========================================

def load_random_graph(file_path: str) -> nx.DiGraph:
    """Unpickle a random graph from ``file_path`` and report its size.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.

    NOTE: unpickling can execute arbitrary code, so this must only be used on
    ``.pkl`` files produced by a trusted source.
    """
    print(f"Loading random graph from: {file_path}")

    file_is_present = os.path.exists(file_path)
    if not file_is_present:
        raise FileNotFoundError(f"File not found: {file_path}")

    with open(file_path, 'rb') as handle:
        graph = pickle.load(handle)

    node_count = graph.number_of_nodes()
    edge_count = graph.number_of_edges()
    print(f"  Graph loaded. Nodes={node_count}, Edges={edge_count}")
    return graph

def run_lpa_detection(G: nx.DiGraph):
    """Run label propagation on ``G`` and score the result with modularity.

    The graph is converted to an undirected graph, label propagation is run
    on it, and the resulting communities are flattened into a
    ``{node: community_id}`` mapping (IDs follow the order in which the
    communities are produced).

    Modularity is evaluated with the Louvain package's helper. If that
    evaluation raises, a warning naming the error is printed and ``0.0`` is
    returned as a placeholder, so a failed computation cannot be silently
    mistaken for a genuine Q of zero in the logs.

    Returns:
        ``(partition, modularity)``.
    """
    print("  Running LPA (Label Propagation)...")

    # LPA and the modularity evaluation both work on the undirected graph.
    G_undirected = G.to_undirected()

    # Run LPA
    lpa_gen = nx_comm.label_propagation_communities(G_undirected)

    # Convert to partition dictionary {node: community_id}
    partition = {
        node: idx
        for idx, nodes in enumerate(lpa_gen)
        for node in nodes
    }

    # Compute modularity Q (using Louvain library standard)
    try:
        modularity = community_louvain.modularity(partition, G_undirected)
    except Exception as e:
        # Best-effort fallback, but never silent: 0.0 is a placeholder here.
        print(f"  Warning: Could not calculate modularity. Setting to 0. Error: {e}")
        modularity = 0.0

    print(f"  LPA Complete. Modularity Q: {modularity:.4f}")
    return partition, modularity

def analyze_and_export(G, partition, modularity, output_dir, filename_suffix):
    """Write the text summary and the full partition CSV for one graph.

    Two files are created inside ``output_dir`` (which must already exist):
    ``analysis_summary_<filename_suffix>.txt`` with the modularity, the
    number of communities and the ten largest communities, and
    ``random_full_partition_<filename_suffix>.csv`` with one
    ``Package,CommunityID`` row per node of ``partition``.
    """
    # 1. Group nodes per community and rank communities by size (largest first).
    members_by_comm = defaultdict(list)
    for node, comm_id in partition.items():
        members_by_comm[comm_id].append(node)

    ranked_comms = sorted(
        members_by_comm.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )
    total_nodes = G.number_of_nodes()

    # 2. Text summary.
    summary_path = os.path.join(output_dir, f"analysis_summary_{filename_suffix}.txt")
    print(f"  Exporting summary to {summary_path}...")

    def _summary_lines():
        yield "NETWORK ANALYSIS SUMMARY (LPA on Random Data)\n"
        yield "=============================================\n"
        yield f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
        yield f"Modularity Score (Q): {modularity:.4f}\n"
        yield f"Total Communities: {len(ranked_comms)}\n\n"
        yield "--- TOP 10 LARGEST STACKS (LPA) ---\n"
        for position, (comm_id, members) in enumerate(ranked_comms[:10], start=1):
            size = len(members)
            share = (size / total_nodes) * 100
            # Up to five member nodes serve as a sample of the community.
            sample_nodes = members[:5]
            yield (
                f"Stack #{position} (ID: {comm_id}): "
                f"Size={size} ({share:.2f}%) -> Sample Nodes: {sample_nodes}\n"
            )

    with open(summary_path, 'w', encoding='utf-8') as out:
        out.writelines(_summary_lines())

    # 3. Full node-to-community table.
    csv_path = os.path.join(output_dir, f"random_full_partition_{filename_suffix}.csv")
    print(f"  Exporting partition CSV to {csv_path}...")

    rows = list(partition.items())
    pd.DataFrame(rows, columns=['Package', 'CommunityID']).to_csv(csv_path, index=False)

# ==========================================
# 3. MAIN EXECUTION
# ==========================================

# Entry point for the random-data baseline: run LPA on each pickled random
# graph and write its summary and partition CSV into a per-graph output folder.
# A failure on one file is reported and the loop continues with the next file.
if __name__ == "__main__":
    print("--- STARTING LPA ANALYSIS ON RANDOM DATA ---")
    
    # File names are 1-based: random_graph_1.pkl ... random_graph_<NUM_GRAPHS>.pkl
    random_files = [f"random_graph_{i}.pkl" for i in range(1, NUM_GRAPHS + 1)]
    
    for filename in random_files:
        print(f"\nProcessing {filename}...")
        
        # Input path
        input_path = os.path.join(BASE_INPUT_PATH, filename)
        
        # Output folder, e.g. random_graph_1.pkl -> random_graph_1_output
        folder_name = filename.replace('.pkl', '_output')
        output_dir = os.path.join(BASE_OUTPUT_PATH, folder_name)
        
        # Created before loading, so the folder exists even if processing fails.
        os.makedirs(output_dir, exist_ok=True)
        
        try:
            # 1. Load graph
            G_rand = load_random_graph(input_path)
            
            # 2. Run LPA
            partition_lpa, q_lpa = run_lpa_detection(G_rand)
            
            # 3. Export results (the same suffix is used for every graph; the
            #    per-graph output folder keeps the files apart)
            analyze_and_export(
                G_rand,
                partition_lpa,
                q_lpa,
                output_dir,
                "random_lpa"
            )
            
        except Exception as e:
            # Best effort: report the error and move on to the next graph.
            print(f" Error processing {filename}: {e}")

    # Printed unconditionally, even when some files failed above.
    print("\n All Random Data LPA analyses complete.")


--- STARTING LPA ANALYSIS ON RANDOM DATA ---

Processing random_graph_1.pkl...
Loading random graph from: E:\Network Science Project\01_Louvain_method\RandomData\random_graph_1.pkl
  Graph loaded. Nodes=397797, Edges=1819936
  Running LPA (Label Propagation)...
  LPA Complete. Modularity Q: 0.0038
  Exporting summary to E:\Network Science Project\02_LPA_method\RandomData\random_graph_1_output\analysis_summary_random_lpa.txt...
  Exporting partition CSV to E:\Network Science Project\02_LPA_method\RandomData\random_graph_1_output\random_full_partition_random_lpa.csv...

Processing random_graph_2.pkl...
Loading random graph from: E:\Network Science Project\01_Louvain_method\RandomData\random_graph_2.pkl
  Graph loaded. Nodes=397797, Edges=1819936
  Running LPA (Label Propagation)...
  LPA Complete. Modularity Q: 0.0035
  Exporting summary to E:\Network Science Project\02_LPA_method\RandomData\random_graph_2_output\analysis_summary_random_lpa.txt...
  Exporting partition CSV to E:\Network 