In [4]:
# (Objective3) 01_Louvain_RealData
import pandas as pd
import networkx as nx
import community.community_louvain as community_louvain
import os
from collections import defaultdict
import random
import gc

# ==========================================
# 1. CONFIGURATION SECTION (real data)
# ==========================================

# Directory containing the input edge list; joined with EDGES_FILE_NAME in load_data().
DATA_PATH = r"E:\Network Science Project\pypi_dag" 
# Edge-list CSV; load_data() reads its 'source' and 'target' columns.
EDGES_FILE_NAME = "edges.csv"
# Destination folder for every export of this cell (created at start-up with os.makedirs).
OUTPUT_ROOT_PATH = r"E:\Network Science Project\01_Louvain_method\RealData"

# --- Visualization Sampler Settings ---
# PageRank Sample Size: Extracts the N most influential nodes (for showing Macro Authority Structure)
PAGERANK_SAMPLE_NODES = 200 
# Core Communities Sample Size: Extracts all nodes belonging to the TOP K largest communities 
# (also used as the row count of the "top stacks" summary CSV).
TOP_COMMUNITIES_SAMPLE = 5 
# ------------------------------------

# ==========================================
# 2. UTILITIES
# ==========================================

def load_data(data_dir: str, filename: str) -> nx.DiGraph:
    """Load the dependency edge list from CSV and build the directed graph.

    Args:
        data_dir: Directory that contains the edge list.
        filename: File name of the edge list. If the file is missing and the
            name has no ``.csv`` suffix, ``filename + '.csv'`` is tried too.

    Returns:
        A ``nx.DiGraph`` with one edge per (source, target) row.

    Raises:
        FileNotFoundError: If neither candidate path exists.
    """
    file_path = os.path.join(data_dir, filename)
    print(f"\nLoading data from: {file_path}")
    if not os.path.exists(file_path):
        csv_candidate = os.path.join(data_dir, filename + '.csv')
        if not filename.endswith('.csv') and os.path.exists(csv_candidate):
            file_path = csv_candidate
        else:
            raise FileNotFoundError(f" Error: Could not find file at {file_path}. Check DATA_PATH and EDGES_FILE_NAME.")

    # Package names are opaque strings. With pandas' default NA handling,
    # names such as "nan", "null", "NA" or "None" would be parsed as float NaN
    # and end up as non-string (and mutually unequal) graph nodes.
    edges_df = pd.read_csv(
        file_path,
        usecols=['source', 'target'],
        dtype=str,
        keep_default_na=False,
    )

    # A row with a blank endpoint is not a usable dependency edge.
    has_both_endpoints = (edges_df['source'] != '') & (edges_df['target'] != '')
    dropped = int((~has_both_endpoints).sum())
    if dropped:
        print(f"  - Warning: skipped {dropped} row(s) with an empty source/target.")
        edges_df = edges_df[has_both_endpoints]

    return nx.from_pandas_edgelist(
        edges_df, source='source', target='target', create_using=nx.DiGraph()
    )

def export_partition_to_csv(partition: dict, output_path: str, filename: str = "pypi_full_partition_realdata.csv"):
    """
    Write the node -> community mapping to ``output_path/filename`` as CSV.

    The file has two columns, 'PackageName' and 'CommunityID', one row per
    entry of ``partition`` (in the dict's iteration order) and no index column.
    """
    destination = os.path.join(output_path, filename)
    print(f"\n[Data Export] Saving full partition to {destination}...")

    # Column-wise construction; keys and values of a dict iterate in lockstep.
    table = pd.DataFrame({
        'PackageName': list(partition.keys()),
        'CommunityID': list(partition.values()),
    })
    table.to_csv(destination, index=False)

    print(" Full partition CSV saved.")

def export_top_stacks_table(G: nx.DiGraph, sorted_communities: list, top_k: int, output_path: str, filename: str = "top_5_stacks_summary.csv"):
    """
    Exports the core data for the Top K Stacks as a CSV table for reporting (Method 2 Table).

    Args:
        G: Directed graph the communities were computed on; used for the node
            count and for in-degree based hub ranking.
        sorted_communities: List of ``(community_id, nodes)`` pairs, largest first.
        top_k: Number of leading communities to include.
        output_path: Directory the CSV is written to.
        filename: Name of the CSV file.

    Each row lists the rank, community id, size, share of all nodes (percent,
    two decimals) and the up-to-8 members with the highest in-degree.
    """
    file_path = os.path.join(output_path, filename)
    print(f"\n[Data Export] Generating Top {top_k} Stacks summary table to {file_path}...")

    columns = ['Stack #', 'Community ID', 'Size (Nodes)', 'Share (%)', 'Core Packages (Hubs)']
    table_data = []
    num_nodes = G.number_of_nodes()

    for rank, (comm_id, nodes) in enumerate(sorted_communities[:top_k], start=1):
        top_hubs = sorted(nodes, key=lambda x: G.in_degree(x), reverse=True)[:8]
        # Guard against a graph that reports zero nodes.
        share = (len(nodes) / num_nodes) * 100 if num_nodes else 0.0

        table_data.append({
            'Stack #': rank,
            'Community ID': comm_id,
            'Size (Nodes)': len(nodes),
            'Share (%)': f'{share:.2f}',
            # Node labels are not guaranteed to be strings; str.join needs them to be.
            'Core Packages (Hubs)': ', '.join(map(str, top_hubs))
        })

    # Explicit columns keep the header row even when no community was supplied.
    df_summary = pd.DataFrame(table_data, columns=columns)
    df_summary.to_csv(file_path, index=False)
    print(" Top Stacks Summary CSV saved.")

# ==========================================
# 3. CORE ANALYSIS LOGIC (FUNCTION 1)
# ==========================================

def analyze_community_structure(G, graph_name="Unknown Graph", top_k_stacks=5, random_state=None):
    """
    Function 1: Objective 3 Core Logic

    This function performs community detection on a given directed graph (G).
    It follows the standard pipeline:
    Check Stats -> Undirected Projection -> Louvain Algorithm -> Result Parsing.

    Args:
        G (nx.DiGraph): The input directed graph (Real or Random).
        graph_name (str): A label for printing (e.g., "Real PyPI", "Random #1").
        top_k_stacks (int): How many top communities to display in detail
            (also the length of 'top_communities_summary').
        random_state: Optional seed forwarded to the Louvain optimiser so a
            run can be reproduced. ``None`` (default) leaves the library's
            own behaviour untouched.

    Returns:
        dict | None: ``None`` if Louvain / modularity raised ``ValueError``;
        otherwise a dictionary containing:
              - 'graph_name': The label passed in (str).
              - 'modularity': The Q score (float).
              - 'num_communities': Total communities found (int).
              - 'partition': The raw partition dict {node: community_id}.
              - 'top_communities_summary': The ``top_k_stacks`` largest
                communities as ``(community_id, [nodes])`` pairs.
    """

    print(f"\n{'='*20} Analyzing: {graph_name} {'='*20}")

    # ---------------------------------------------------------
    # Step 1: Basic Graph Diagnostics
    # ---------------------------------------------------------
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    print("[Step 1] Graph Diagnostics:")
    print(f"  - Nodes: {num_nodes}")
    print(f"  - Edges: {num_edges}")

    # Louvain itself does not need acyclicity; the check is reported only as a
    # sanity check on the dependency dataset.
    if nx.is_directed_acyclic_graph(G):
        print("  - DAG Property: Valid (No cycles found)")
    else:
        print("  - DAG Property: Cycles detected")

    # ---------------------------------------------------------
    # Step 2: Undirected Projection
    # ---------------------------------------------------------
    # Louvain maximises modularity on undirected graphs, so each dependency
    # A->B is treated as a symmetric link A-B.
    print("[Step 2] Creating Undirected Projection...")
    G_undirected = G.to_undirected()

    # ---------------------------------------------------------
    # Step 3: Louvain Optimization
    # ---------------------------------------------------------
    print("[Step 3] Running Louvain Algorithm (optimizing Modularity)...")

    # Only forward the seed when one was requested, so the default call is
    # exactly the one the original code made.
    louvain_kwargs = {} if random_state is None else {"random_state": random_state}

    # Both calls live in the same try: the modularity computation is the one
    # that raises ValueError for a graph without links.
    try:
        # partition is a dict: {node_name: community_id}
        partition = community_louvain.best_partition(G_undirected, **louvain_kwargs)
        # Q ranges over [-0.5, 1.0]; higher means stronger community structure.
        modularity_score = community_louvain.modularity(partition, G_undirected)
    except ValueError as e:
        print(f" Error during Louvain execution: {e}")
        return None

    print(" Optimization Complete.")
    print(f" Modularity Score (Q): {modularity_score:.4f}")

    # ---------------------------------------------------------
    # Step 4: Community / Stack Analysis
    # ---------------------------------------------------------
    print("[Step 4] Extracting Technology Stacks...")

    # Group nodes by their community ID
    community_map = defaultdict(list)
    for node, comm_id in partition.items():
        community_map[comm_id].append(node)

    # Sort communities by size (largest first)
    sorted_communities = sorted(community_map.items(), key=lambda x: len(x[1]), reverse=True)
    num_communities = len(sorted_communities)
    print(f"  - Total Communities Found: {num_communities}")

    print(f"  - Details of Top {top_k_stacks} Largest Communities (Potential Tech Stacks):")

    top_communities = sorted_communities[:top_k_stacks]
    for i, (comm_id, nodes) in enumerate(top_communities):
        # A stack is characterised by its hubs: members with the highest
        # in-degree in the ORIGINAL directed graph (most depended upon).
        top_hubs = sorted(nodes, key=lambda x: G.in_degree(x), reverse=True)[:8]

        # Percentage of the whole graph covered by this community
        share = (len(nodes) / num_nodes) * 100

        print(f" [Stack #{i+1}] ID: {comm_id} | Size: {len(nodes)} ({share:.2f}%)")
        # str() keeps the join working when node labels are not strings.
        print(f"  -> Core Packages (Hubs): {', '.join(map(str, top_hubs))}")

    # ---------------------------------------------------------
    # Return Results Package
    # ---------------------------------------------------------
    results = {
        "graph_name": graph_name,
        "modularity": modularity_score,
        "num_communities": num_communities,
        "partition": partition,
        "top_communities_summary": top_communities
    }

    return results


# ==========================================
# 4. VISUALIZATION PREPARATION FUNCTIONS
# ==========================================

def extract_core_subgraph(G: nx.DiGraph, partition: dict, num_nodes: int) -> tuple[nx.DiGraph, dict]:
    """
    Method 1: build the subgraph induced by the ``num_nodes`` highest-PageRank nodes.

    Returns a pair ``(subgraph_copy, partition_restricted_to_subgraph)``; nodes
    that are missing from ``partition`` are simply left out of the second item.
    """
    print(f"\n[Visualization Prep] Calculating PageRank and extracting top {num_nodes} core nodes...")

    # Rank every node by PageRank (damping factor 0.85), best first.
    scores = nx.pagerank(G, alpha=0.85)
    ranked_nodes = sorted(scores, key=scores.get, reverse=True)

    # Keep only the leaders and the edges running between them.
    core_subgraph = G.subgraph(ranked_nodes[:num_nodes]).copy()

    # Carry the community labels over to the sampled nodes.
    core_partition = {}
    for member in core_subgraph.nodes():
        if member in partition:
            core_partition[member] = partition[member]

    print(f"  - Core Subgraph Extracted (PageRank): Nodes={core_subgraph.number_of_nodes()}, Edges={core_subgraph.number_of_edges()}")
    return core_subgraph, core_partition


def extract_top_community_subgraph(G: nx.DiGraph, sorted_communities: list, top_k: int) -> nx.DiGraph:
    """
    Method 2: return a copy of the subgraph induced by all members of the
    first ``top_k`` entries of ``sorted_communities``.

    ``sorted_communities`` holds ``(community_id, nodes)`` pairs; only the
    graph object is returned (no partition).
    """
    print(f"\n[Visualization Prep] Extracting Top {top_k} largest communities...")

    # Union of the member lists of the leading communities.
    selected_nodes = {
        member
        for _, members in sorted_communities[:top_k]
        for member in members
    }

    top_comm_subgraph = G.subgraph(selected_nodes).copy()

    print(f"  - Top Communities Subgraph Extracted: Nodes={top_comm_subgraph.number_of_nodes()}, Edges={top_comm_subgraph.number_of_edges()}")
    return top_comm_subgraph


def build_community_graph(G: nx.DiGraph, partition: dict) -> nx.DiGraph:
    """
    Model 2.2: construct the abstract community network (Community Graph).

    Each node stands for one community (attributes: ``size``, ``label``,
    ``type``); each directed edge carries, as ``weight``, the number of
    dependencies in ``G`` that go from one community to a different one.
    Edges whose endpoints are not both in ``partition`` are ignored.
    """
    print("\n[Visualization Prep] Building abstract Community Graph (Strategy 2)...")

    # 1. Tally members per community.
    community_sizes = defaultdict(int)
    for comm_id in partition.values():
        community_sizes[comm_id] += 1

    # 2. Tally dependencies that cross a community boundary.
    inter_community_edges = defaultdict(int)
    for src, dst in G.edges():
        if src not in partition or v_missing(dst, partition):
            continue
        src_comm, dst_comm = partition[src], partition[dst]
        if src_comm == dst_comm:
            continue  # intra-community dependency, not part of the abstract graph
        inter_community_edges[(src_comm, dst_comm)] += 1

    # 3. Assemble the community graph.
    G_comm = nx.DiGraph()

    # One node per community.
    G_comm.add_nodes_from(
        (
            comm_id,
            {'size': size, 'label': f'Community {comm_id}', 'type': 'CommunityNode'},
        )
        for comm_id, size in community_sizes.items()
    )

    # One weighted edge per ordered pair of communities with cross dependencies.
    # NOTE(review): the GEXF writer may treat an edge attribute literally named
    # 'type' as the edge's GEXF type - confirm how the export renders it.
    G_comm.add_edges_from(
        (
            comm_pair[0],
            comm_pair[1],
            {'weight': weight, 'type': 'InterCommunityDependency'},
        )
        for comm_pair, weight in inter_community_edges.items()
    )

    print(f"  - Community Graph built. Nodes={G_comm.number_of_nodes()}, Edges={G_comm.number_of_edges()}")
    return G_comm


def v_missing(node, partition: dict) -> bool:
    """Return True when ``node`` has no community assignment in ``partition``."""
    return node not in partition

def export_for_visualization(G: nx.DiGraph, partition: dict, output_path: str, filename: str):
    """
    Annotate a copy of ``G`` and write it to ``output_path/filename`` as GEXF (for Gephi).

    Node attribute ``CommunityID`` comes from ``partition``. Edge attribute
    ``LinkType`` is 'Internal' when both endpoints share a known community and
    'Inter-Community' otherwise. The input graph is not modified.
    """
    destination = os.path.join(output_path, filename)

    # Work on a copy so the caller's graph keeps its original attributes.
    G_export = G.copy()

    # Node attribute: CommunityID
    nx.set_node_attributes(G_export, partition, name='CommunityID')

    def _link_type(src, dst):
        # -1 marks "no community known" and never counts as a match.
        src_comm = partition.get(src, -1)
        if src_comm == -1:
            return 'Inter-Community'
        if src_comm == partition.get(dst, -1):
            return 'Internal'
        return 'Inter-Community'

    # Edge attribute: LinkType (Internal / Inter-Community)
    link_types = {(src, dst): _link_type(src, dst) for src, dst in G_export.edges()}
    nx.set_edge_attributes(G_export, link_types, name='LinkType')

    # Export to GEXF
    nx.write_gexf(G_export, destination)
    print(f" Exported: {filename} (Nodes: {G_export.number_of_nodes()}, Edges: {G_export.number_of_edges()})")


# ==========================================
# 5. SUMMARY EXPORT AND MAIN EXECUTION (HOW TO RUN)
# ==========================================

def export_analysis_summary(G: nx.DiGraph, results: dict, output_path: str, filename: str = "analysis_summary.txt"):
    """
    Exports all key summary data (Q, diagnostics, hubs of up to 10 top stacks) to a single TXT file.

    Args:
        G: Directed graph that was analysed (node/edge counts, DAG check,
            in-degree based hub ranking).
        results: Result dict with the keys 'graph_name', 'modularity',
            'num_communities' and 'top_communities_summary'.
        output_path: Directory the report is written to.
        filename: Name of the report file.
    """
    file_path = os.path.join(output_path, filename)
    print(f"\n[Data Export] Generating Analysis Summary to {file_path}...")

    num_nodes = G.number_of_nodes()

    # Collect the per-stack lines and join once instead of growing a string.
    stack_lines = []
    for i, (comm_id, nodes) in enumerate(results['top_communities_summary'][:10]):
        top_hubs = sorted(nodes, key=lambda x: G.in_degree(x), reverse=True)[:8]
        # Guard against a graph that reports zero nodes.
        share = (len(nodes) / num_nodes) * 100 if num_nodes else 0.0

        stack_lines.append(f"Stack #{i+1} (ID: {comm_id}): Size={len(nodes)} ({share:.2f}%)\n")
        # str() keeps the join working when node labels are not strings.
        stack_lines.append(f"  Core Hubs: {', '.join(map(str, top_hubs))}\n")
    stacks_text = "\n" + "".join(stack_lines)

    dag_status = 'Valid (No cycles)' if nx.is_directed_acyclic_graph(G) else 'Cycles Detected'

    summary_content = f"""
==================================================
        NETWORK ANALYSIS SUMMARY
==================================================
Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
Graph Name: {results['graph_name']}

--- GRAPH DIAGNOSTICS ---
Nodes (Packages): {num_nodes}
Edges (Dependencies): {G.number_of_edges()}
DAG Property: {dag_status}

--- COMMUNITY STRUCTURE (Louvain) ---
Modularity Score (Q): {results['modularity']:.4f}  <-- Objective 5 BASELINE
Total Communities Found: {results['num_communities']}

--- TOP 10 LARGEST STACKS ---
{stacks_text}
==================================================
"""
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(summary_content)
    print(" Analysis Summary TXT saved.")


if __name__ == "__main__":
    # Entry point for the real-data run: load -> Louvain -> tables -> GEXF exports.

    os.makedirs(OUTPUT_ROOT_PATH, exist_ok=True)
    print("--- Starting Advanced Analysis and Automated Export ---")

    try:
        # 1. Load Data
        G_real = load_data(DATA_PATH, EDGES_FILE_NAME)

        # 2. Call Function 1: Analyze Community Structure (Objective 3)
        real_results = analyze_community_structure(
            G_real,
            graph_name="REAL PyPI Dependency Network",
            top_k_stacks=10  # Console prints Top 10
        )
        # The analysis signals a Louvain failure by returning None; report that
        # explicitly instead of failing later on a subscript of None.
        if real_results is None:
            raise RuntimeError("Community detection produced no result (see the Louvain error above).")
        partition = real_results['partition']

        # -----------------------------------------------------------------
        # 3. Export: Summary & Tables
        # -----------------------------------------------------------------

        # 3.1 Export Summary.txt (Q value)
        export_analysis_summary(G_real, real_results, OUTPUT_ROOT_PATH)

        # 3.2 Export full partition CSV
        export_partition_to_csv(partition, OUTPUT_ROOT_PATH)

        # 3.3 Export Top 5 Stacks CSV
        export_top_stacks_table(G_real, real_results['top_communities_summary'], TOP_COMMUNITIES_SAMPLE, OUTPUT_ROOT_PATH, filename="top_5_stacks_summary.csv")

        # -----------------------------------------------------------------
        # 4. VISUALIZATION PREPARATION (GEXF export)
        # -----------------------------------------------------------------
        print("\n[--- Starting Visualization Exports ---]")

        # 4.1 Sample A: PageRank core subgraph (Model 1 - core nodes)
        core_G, core_partition = extract_core_subgraph(
            G_real,
            partition,
            num_nodes=PAGERANK_SAMPLE_NODES
        )
        export_for_visualization(
            core_G,
            core_partition,
            OUTPUT_ROOT_PATH,
            filename="1_pagerank_core_for_gephi.gexf"
        )

        # 4.2 Sample B: Top 5 communities subgraph (whole communities nodes)
        top_comm_G = extract_top_community_subgraph(
            G_real,
            real_results['top_communities_summary'],
            top_k=TOP_COMMUNITIES_SAMPLE
        )
        top_comm_partition = {node: partition[node] for node in top_comm_G.nodes() if node in partition}

        export_for_visualization(
            top_comm_G,
            top_comm_partition,
            OUTPUT_ROOT_PATH,
            filename="2_top_5_communities_for_gephi.gexf"
        )

        # 4.3 Sample C: abstract community graph (Model2 macro structure)
        G_community = build_community_graph(G_real, partition)

        # In the abstract graph every node IS a community, so it maps to itself.
        community_partition = {node: node for node in G_community.nodes()}

        export_for_visualization(
            G_community,
            community_partition,
            OUTPUT_ROOT_PATH,
            filename="3_abstract_community_network.gexf"
        )
        print("[--- Visualization Exports Complete ---]")

    except Exception as e:
        # Top-level boundary: report, then stop with a non-zero status.
        # SystemExit is used rather than exit(), which is a helper installed by
        # the site module and would report success.
        print("\n--- Analysis Failed ---")
        print(f"Error encountered: {e}")
        raise SystemExit(1) from e

    print("\n" + "="*50)
    print(" ALL EXPORTS COMPLETE")
    print("==================================================")



--- Starting Advanced Analysis and Automated Export ---

Loading data from: E:\Network Science Project\pypi_dag\edges.csv

[Step 1] Graph Diagnostics:
  - Nodes: 397798
  - Edges: 1819937
  - DAG Property: Valid (No cycles found)
[Step 2] Creating Undirected Projection...
[Step 3] Running Louvain Algorithm (optimizing Modularity)...
 Optimization Complete.
 Modularity Score (Q): 0.4755
[Step 4] Extracting Technology Stacks...
  - Total Communities Found: 1540
  - Details of Top 10 Largest Communities (Potential Tech Stacks):
 [Stack #1] ID: 6 | Size: 80618 (20.27%)
  -> Core Packages (Hubs): numpy, pandas, matplotlib, scipy, scikit-learn, seaborn, opencv-python, networkx
 [Stack #2] ID: 4 | Size: 57375 (14.42%)
  -> Core Packages (Hubs): pydantic, tqdm, rich, torch, python-dotenv, httpx, typer, fastapi
 [Stack #3] ID: 194 | Size: 47400 (11.92%)
  -> Core Packages (Hubs): pyyaml, click, jinja2, psutil, boto3, colorama, tabulate, toml
 [Stack #4] ID: 1 | Size: 45894 (11.54%)
  -> Core Pa

In [2]:
# (Objective5) 01_Louvain_RandomData
import pickle
import os
import networkx as nx
import pandas as pd
import numpy as np
import json
from collections import defaultdict
import community.community_louvain as community_louvain 

# ==========================================
# 1. CONFIGURATION SECTION 
# ==========================================
# Folder that holds the random_graph_<i>.pkl inputs; one "<name>_output" folder per
# graph and the final baseline_q_comparison.json are written beneath it as well.
BASE_OUTPUT_PATH = r"E:\Network Science Project\01_Louvain_method\RandomData"
# NOTE(review): the real-data run captured in this notebook printed Q = 0.4755, not
# 0.4783 - presumably this value comes from a different Louvain run; confirm before
# reporting the comparison.
REAL_DATA_MODULARITY_Q = 0.4783 # Q value from real data

# sample set
# Number of largest communities exported (summary CSV and community GEXF).
TOP_COMMUNITIES_SAMPLE = 5
# Number of top-PageRank nodes kept in the core-subgraph GEXF.
PAGERANK_SAMPLE_NODES = 100 

# ==========================================
# 2. UTILITIES
# ==========================================

def load_random_graph(file_path: str) -> nx.DiGraph:
    """
    Read a pickled NetworkX graph from ``file_path`` and return it.

    Raises FileNotFoundError when the path does not exist and TypeError when
    the unpickled object is not a NetworkX graph. Any error raised while
    reading is printed and then re-raised unchanged.
    """
    print(f"\nLoading random graph from: {file_path}")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f" Error: Random graph file NOT FOUND at {file_path}")

    try:
        with open(file_path, 'rb') as handle:
            # NOTE: unpickling executes code embedded in the file; only load
            # .pkl files produced by a trusted source.
            loaded = pickle.load(handle)
            # nx.DiGraph derives from nx.Graph, so a single check covers both.
            if not isinstance(loaded, nx.Graph):
                raise TypeError("File content is not a NetworkX graph object.")
            print(f"Graph loaded. Nodes={loaded.number_of_nodes()}, Edges={loaded.number_of_edges()}")
            return loaded
    except Exception as err:
        print(f" Error loading pickle file: {err}")
        raise

def export_analysis_summary(G: nx.DiGraph, results: dict, output_path: str, filename: str = "analysis_summary.txt"):
    """
    Exports all key summary data (Q, diagnostics, hubs of up to 10 top stacks) to a single TXT file.

    Args:
        G: Graph that was analysed (node/edge counts, DAG check, in-degree
            based hub ranking).
        results: Result dict; 'modularity' and 'num_communities' are required,
            'graph_name' and 'top_communities_summary' fall back to
            'Unknown' / an empty list.
        output_path: Directory the report is written to.
        filename: Name of the report file.
    """
    file_path = os.path.join(output_path, filename)
    print(f"  - Generating Analysis Summary to {file_path}...")

    num_nodes = G.number_of_nodes()
    top_comms = results.get('top_communities_summary', [])

    # Collect the per-stack lines and join once instead of growing a string.
    stack_lines = []
    for i, (comm_id, nodes) in enumerate(top_comms[:10]):
        top_hubs = sorted(nodes, key=lambda x: G.in_degree(x), reverse=True)[:8]
        # Guard against a graph that reports zero nodes.
        share = (len(nodes) / num_nodes) * 100 if num_nodes else 0.0

        stack_lines.append(f"Stack #{i+1} (ID: {comm_id}): Size={len(nodes)} ({share:.2f}%)\n")
        # Random-graph node labels are not guaranteed to be strings.
        stack_lines.append(f"  Core Hubs: {', '.join(map(str, top_hubs))}\n")
    stacks_text = "\n" + "".join(stack_lines)

    dag_status = 'Valid (No cycles)' if nx.is_directed_acyclic_graph(G) else 'Cycles Detected'

    summary_content = f"""
==================================================
        NETWORK ANALYSIS SUMMARY (RANDOM DATA)
==================================================
Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
Graph Name: {results.get('graph_name', 'Unknown')}

--- GRAPH DIAGNOSTICS ---
Nodes (Packages): {num_nodes}
Edges (Dependencies): {G.number_of_edges()}
DAG Property: {dag_status}

--- COMMUNITY STRUCTURE (Louvain) ---
Modularity Score (Q): {results['modularity']:.4f}
Total Communities Found: {results['num_communities']}

--- TOP 10 LARGEST STACKS (RANDOM STRUCTURE) ---
{stacks_text}
==================================================
"""
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(summary_content)
    print("  Analysis Summary TXT saved.")

def export_partition_to_csv(partition: dict, output_path: str, filename: str = "random_full_partition.csv"):
    """Write the node -> community mapping to ``output_path/filename`` (columns 'Package', 'CommunityID')."""
    destination = os.path.join(output_path, filename)

    # Column-wise construction; keys and values of a dict iterate in lockstep.
    table = pd.DataFrame({
        'Package': list(partition.keys()),
        'CommunityID': list(partition.values()),
    })
    table.to_csv(destination, index=False)
    print(f"  Full partition CSV saved to: {destination}")

def export_top_stacks_table(G: nx.DiGraph, sorted_communities: list, top_k: int, output_path: str, filename: str):
    """Exports the core data for the Top K Stacks as a CSV table.

    Args:
        G: Graph the communities were computed on; used for the node count
            and for in-degree based hub ranking.
        sorted_communities: List of ``(community_id, nodes)`` pairs, largest first.
        top_k: Number of leading communities to include.
        output_path: Directory the CSV is written to.
        filename: Name of the CSV file.
    """
    file_path = os.path.join(output_path, filename)
    print(f"  - Generating Top {top_k} Stacks summary table...")

    columns = ['Stack #', 'Community ID', 'Size (Nodes)', 'Share (%)', 'Core Packages (Hubs)']
    table_data = []
    num_nodes = G.number_of_nodes()

    for rank, (comm_id, nodes) in enumerate(sorted_communities[:top_k], start=1):
        top_hubs = sorted(nodes, key=lambda x: G.in_degree(x), reverse=True)[:8]
        # Guard against a graph that reports zero nodes.
        share = (len(nodes) / num_nodes) * 100 if num_nodes else 0.0

        table_data.append({
            'Stack #': rank,
            'Community ID': comm_id,
            'Size (Nodes)': len(nodes),
            'Share (%)': f'{share:.2f}',
            # Random-graph node labels are not guaranteed to be strings.
            'Core Packages (Hubs)': ', '.join(map(str, top_hubs))
        })

    # Explicit columns keep the header row even when no community was supplied.
    df_summary = pd.DataFrame(table_data, columns=columns)
    df_summary.to_csv(file_path, index=False)
    print("  Top Stacks Summary CSV saved.")

def extract_core_subgraph(G: nx.DiGraph, partition: dict, num_nodes: int) -> tuple:
    """Return ``(subgraph, partition_subset)`` for the ``num_nodes`` highest-PageRank nodes of ``G``."""
    print(f"  - [Vis Prep] Calculating PageRank (Top {num_nodes})...")

    # Rank every node by PageRank (damping factor 0.85), best first.
    scores = nx.pagerank(G, alpha=0.85)
    ranked_nodes = sorted(scores, key=scores.get, reverse=True)

    # Induced subgraph on the leaders, detached from G by copying.
    core_subgraph = G.subgraph(ranked_nodes[:num_nodes]).copy()

    # Community labels for the sampled nodes that have one.
    core_partition = {}
    for member in core_subgraph.nodes():
        if member in partition:
            core_partition[member] = partition[member]
    return core_subgraph, core_partition

def extract_top_community_subgraph(G: nx.DiGraph, sorted_communities: list, top_k: int) -> nx.DiGraph:
    """Return a copy of the subgraph induced by the members of the first ``top_k`` communities."""
    print(f"  - [Vis Prep] Extracting Top {top_k} largest communities...")

    # Union of the member lists of the leading (community_id, nodes) pairs.
    selected_nodes = {
        member
        for _, members in sorted_communities[:top_k]
        for member in members
    }
    return G.subgraph(selected_nodes).copy()

def export_for_visualization(G: nx.DiGraph, partition: dict, output_path: str, filename: str):
    """Write an annotated copy of ``G`` to ``output_path/filename`` in GEXF format (for Gephi).

    Nodes get a ``CommunityID`` attribute from ``partition``; edges get a
    ``LinkType`` of 'Internal' (both endpoints in the same known community)
    or 'Inter-Community'. The input graph is left untouched.
    """
    destination = os.path.join(output_path, filename)
    G_export = G.copy()
    nx.set_node_attributes(G_export, partition, name='CommunityID')

    def _link_type(src, dst):
        # -1 marks "no community known" and never counts as a match.
        src_comm = partition.get(src, -1)
        if src_comm == -1:
            return 'Inter-Community'
        if src_comm == partition.get(dst, -1):
            return 'Internal'
        return 'Inter-Community'

    link_types = {(src, dst): _link_type(src, dst) for src, dst in G_export.edges()}
    nx.set_edge_attributes(G_export, link_types, name='LinkType')

    nx.write_gexf(G_export, destination)
    print(f"   Exported GEXF: {filename}")

# ==========================================
# 3. CORE ANALYSIS LOGIC 
# ==========================================

def analyze_community_structure(G: nx.DiGraph, graph_name: str = "Unknown Graph", top_k_stacks: int = 10, random_state=None) -> dict:
    """
    Performs community detection using Louvain algorithm.

    Args:
        G: Input graph; it is projected to an undirected graph before Louvain runs.
        graph_name: Label used in console output and stored in the result.
        top_k_stacks: Accepted for signature parity with the real-data
            analysis; this version does not use it and returns ALL
            communities in 'top_communities_summary'.
        random_state: Optional seed forwarded to the Louvain optimiser so a
            run can be reproduced. ``None`` (default) leaves the library's
            own behaviour untouched.

    Returns:
        An empty dict if Louvain / modularity raised ``ValueError``; otherwise
        a dict with 'graph_name', 'modularity', 'num_communities', 'partition'
        ({node: community_id}) and 'top_communities_summary' (all communities
        as ``(community_id, [nodes])`` pairs, largest first).
    """
    print(f"\n{'='*20} Analyzing: {graph_name} {'='*20}")

    # 1. Undirected Projection
    G_undirected = G.to_undirected()

    # Only forward the seed when one was requested, so the default call is
    # exactly the one the original code made.
    louvain_kwargs = {} if random_state is None else {"random_state": random_state}

    # 2./3. Louvain + modularity. Both calls share the try block because the
    # modularity computation is the one that raises ValueError for a graph
    # without links.
    try:
        partition = community_louvain.best_partition(G_undirected, **louvain_kwargs)
        modularity_score = community_louvain.modularity(partition, G_undirected)
    except ValueError as e:
        print(f"  Error during Louvain execution: {e}")
        return {}

    print(f"  Optimization Complete. Q Score: {modularity_score:.4f}")

    # 4. Stats: group nodes per community and order communities by size.
    community_map = defaultdict(list)
    for node, comm_id in partition.items():
        community_map[comm_id].append(node)

    sorted_communities = sorted(community_map.items(), key=lambda x: len(x[1]), reverse=True)
    num_communities = len(sorted_communities)
    print(f"  - Total Communities Found: {num_communities}")

    return {
        "graph_name": graph_name,
        "modularity": modularity_score,
        "num_communities": num_communities,
        "partition": partition,
        "top_communities_summary": sorted_communities
    }

# ==========================================
# 4. MAIN EXECUTION
# ==========================================

if __name__ == "__main__":
    # Baseline run: analyse five pickled random graphs, export the same
    # artefacts as for the real data, then summarise their modularity scores.

    random_files = [f"random_graph_{i}.pkl" for i in range(1, 6)]
    all_modularity_scores = []

    print("--- Starting Random Graph Baseline Analysis (Objective 5) ---")

    for filename in random_files:
        # --- 1. Input path and per-graph output folder ---
        folder_name = filename.replace('.pkl', '_output')
        output_dir = os.path.join(BASE_OUTPUT_PATH, folder_name)
        os.makedirs(output_dir, exist_ok=True)

        full_path = os.path.join(BASE_OUTPUT_PATH, filename)

        try:
            # 2. Load the random graph
            G_random = load_random_graph(full_path)

            # 3. Run the community analysis (Function 1)
            random_results = analyze_community_structure(
                G_random,
                graph_name=f"Random Graph: {filename}",
                top_k_stacks=10
            )

            partition = random_results.get('partition')
            Q_score = random_results.get('modularity')

            # Nothing to export when the analysis produced no usable result.
            if not partition or Q_score is None:
                print(f"--- Analysis FAILED for {filename} (Modularity is None) ---")
                continue

            all_modularity_scores.append(Q_score)
            print(f"-> Q Score recorded: {Q_score:.4f}")

            # --- 4. Exports ---

            # A. Summary TXT
            export_analysis_summary(
                G_random,
                random_results,
                output_dir,
                filename="analysis_summary_random.txt"
            )

            # B. Full partition CSV
            export_partition_to_csv(
                partition,
                output_dir,
                filename="random_full_partition.csv"
            )

            # C. Top 5 Stacks CSV
            export_top_stacks_table(
                G_random,
                random_results['top_communities_summary'],
                TOP_COMMUNITIES_SAMPLE,
                output_dir,
                filename="top_5_stacks_summary_random.csv"
            )

            # D. PageRank core subgraph GEXF
            core_G, core_partition = extract_core_subgraph(
                G_random,
                partition,
                num_nodes=PAGERANK_SAMPLE_NODES
            )
            export_for_visualization(
                core_G,
                core_partition,
                output_dir,
                filename="1_pagerank_core_for_gephi_random.gexf"
            )

            # E. Top 5 community subgraph GEXF
            top_comm_G = extract_top_community_subgraph(
                G_random,
                random_results['top_communities_summary'],
                top_k=TOP_COMMUNITIES_SAMPLE
            )
            # Restrict the partition to the nodes that made it into the subgraph.
            top_comm_partition = {}
            for node in top_comm_G.nodes():
                if node in partition:
                    top_comm_partition[node] = partition[node]

            export_for_visualization(
                top_comm_G,
                top_comm_partition,
                output_dir,
                filename="2_top_5_communities_for_gephi_random.gexf"
            )

        except Exception as e:
            # One broken graph must not stop the remaining ones.
            print(f"\n--- Analysis FAILED for {filename} ---")
            print(f"Error encountered: {e}")

    # --- 5. Baseline summary (console + JSON) ---
    if not all_modularity_scores:
        print(" No random graphs were successfully analyzed.")
    else:
        avg_q = np.mean(all_modularity_scores)
        std_q = np.std(all_modularity_scores)

        print("\n" + "="*50)
        print(" Random Graph Baseline Summary")
        print(f"Individual Q Scores: {all_modularity_scores}")
        print(f"Average Modularity Q_rand: {avg_q:.4f}")
        print(f"Standard Deviation StdDev_rand: {std_q:.4f}")

        baseline_summary = {
            'Q_real': REAL_DATA_MODULARITY_Q,
            'Q_rand_avg': avg_q,
            'Q_rand_std': std_q,
            'Q_rand_individual': all_modularity_scores
        }
        output_file = os.path.join(BASE_OUTPUT_PATH, 'baseline_q_comparison.json')
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(baseline_summary, f, indent=4)

        print(f"Baseline comparison data saved to: {output_file}")
        print("="*50)

--- Starting Random Graph Baseline Analysis (Objective 5) ---

Loading random graph from: E:\Network Science Project\01_Louvain_method\RandomData\random_graph_1.pkl
Graph loaded. Nodes=397797, Edges=1819936

  Optimization Complete. Q Score: 0.2389
  - Total Communities Found: 551
-> Q Score recorded: 0.2389
  - Generating Analysis Summary to E:\Network Science Project\01_Louvain_method\RandomData\random_graph_1_output\analysis_summary_random.txt...
  Analysis Summary TXT saved.
  Full partition CSV saved to: E:\Network Science Project\01_Louvain_method\RandomData\random_graph_1_output\random_full_partition.csv
  - Generating Top 5 Stacks summary table...
  Top Stacks Summary CSV saved.
  - [Vis Prep] Calculating PageRank (Top 100)...
   Exported GEXF: 1_pagerank_core_for_gephi_random.gexf
  - [Vis Prep] Extracting Top 5 largest communities...
   Exported GEXF: 2_top_5_communities_for_gephi_random.gexf

Loading random graph from: E:\Network Science Project\01_Louvain_method\RandomData\