## Table of Contents
1. [Setup and Imports](#setup)
2. [Discover Community Artifacts](#discover)
3. [Load and Inspect Community Data](#load)
4. [Community Size Distribution](#size-dist)
5. [Dominant Airline Analysis](#airline)
6. [Community Bridging Nodes](#bridging)
7. [Interpretation](#interpretation)
8. [Write Report Outputs](#write-outputs)
9. [Reproducibility Notes](#reproducibility)

In [None]:
# ============================================================================
# SETUP AND IMPORTS
# ============================================================================

import json
from pathlib import Path
from datetime import datetime
import warnings

import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook identity (used as a prefix for every artifact this notebook writes)
NOTEBOOK_ID = "nb04"
NOTEBOOK_NAME = "communities__structure_and_attributes"

# Project layout: the repository root is taken to be two levels above the
# current working directory, with all artifacts living under results/.
REPO_ROOT = Path.cwd().parent.parent
RESULTS_DIR = REPO_ROOT.joinpath("results")
ANALYSIS_DIR = RESULTS_DIR.joinpath("analysis")
TABLES_REPORT_DIR = RESULTS_DIR.joinpath("tables", "report")
FIGURES_REPORT_DIR = RESULTS_DIR.joinpath("figures", "report")
WARNINGS_LOG = TABLES_REPORT_DIR.joinpath("_warnings.log")

# Plot appearance shared by all figures in this notebook
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("husl")

# Create the report output directories up front (no-op if they already exist)
for _report_dir in (TABLES_REPORT_DIR, FIGURES_REPORT_DIR):
    _report_dir.mkdir(parents=True, exist_ok=True)

# Quick sanity check that the upstream analysis artifacts are reachable
print(f"Analysis dir exists: {ANALYSIS_DIR.exists()}")

In [None]:
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def append_warning(message: str, notebook_id: str = NOTEBOOK_ID) -> None:
    """Append a timestamped warning to the consolidated warnings log and echo it.

    Each entry is written as ``[<ISO timestamp>] [<notebook_id>] <message>`` on
    its own line, then the message is printed with a ``WARNING:`` prefix.

    Args:
        message: Human-readable description of the problem.
        notebook_id: Tag identifying the notebook that raised the warning;
            defaults to this notebook's ``NOTEBOOK_ID``.
    """
    timestamp = datetime.now().isoformat()
    # Make sure the log's directory exists even if the setup cell was skipped.
    WARNINGS_LOG.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: messages embed file names and exception text, which may
    # not be encodable in the platform's default encoding.
    with open(WARNINGS_LOG, "a", encoding="utf-8") as f:
        f.write(f"[{timestamp}] [{notebook_id}] {message}\n")
    print(f"WARNING: {message}")

def safe_load_parquet(path: Path) -> pl.DataFrame | None:
    """Read a parquet file with polars, logging instead of raising on failure.

    Args:
        path: Location of the parquet file to read.

    Returns:
        The loaded frame, or ``None`` when reading raised any exception (in
        which case a warning naming the file has been appended to the log).
    """
    frame = None
    try:
        frame = pl.read_parquet(path)
    except Exception as err:
        # Best-effort loader: record the failure and let the caller carry on.
        append_warning(f"Failed to load {path.name}: {err}")
    return frame

<a id="discover"></a>
## 2. Discover Community Artifacts

Search the analysis directory for Parquet and CSV files whose names contain community-related keywords (e.g. `community`, `leiden`, `sbm`).

In [None]:
# ============================================================================
# DISCOVER COMMUNITY ARTIFACTS
# ============================================================================

# Substrings (matched case-insensitively against file names) that mark a file
# as community-related.
community_keywords = ["community", "leiden", "partition", "membership", "cluster", "sbm"]

# Search in analysis directory
analysis_files = list(ANALYSIS_DIR.glob("*.parquet")) + list(ANALYSIS_DIR.glob("*.csv"))

# glob() yields files in filesystem order, which varies between machines.
# Sorting here keeps the load order, and therefore the row order of every
# downstream table, reproducible.
community_candidates = sorted(
    f for f in analysis_files
    if any(kw in f.name.lower() for kw in community_keywords)
)

print(f"Found {len(community_candidates)} community-related artifacts:")
for cf in community_candidates:
    print(f"  - {cf.name}")

# Categorize by network type
airport_community_files = [f for f in community_candidates if "airport" in f.name.lower()]
flight_community_files = [f for f in community_candidates if "flight" in f.name.lower()]

<a id="load"></a>
## 3. Load and Inspect Community Data

In [None]:
# ============================================================================
# LOAD AND INSPECT COMMUNITY DATA
# ============================================================================

# Maps artifact stem (file name without extension) -> loaded polars frame.
community_dfs = {}

for cf in community_candidates:
    if cf.suffix != ".parquet":
        # Discovery also matches CSV files, but only parquet artifacts are
        # loaded here; say so rather than dropping them silently.
        print(f"Skipping {cf.name}: only .parquet artifacts are loaded")
        continue

    df = safe_load_parquet(cf)
    if df is None:
        # The failure has already been logged by safe_load_parquet.
        continue

    community_dfs[cf.stem] = df
    print(f"\n{'='*60}")
    print(f"{cf.name}")
    print(f"{'='*60}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns}")
    display(df.head(5).to_pandas())

if not community_dfs:
    append_warning("No community data could be loaded")

<a id="size-dist"></a>
## 4. Community Size Distribution

Compute and visualize the distribution of community sizes.

In [None]:
# ============================================================================
# COMMUNITY SIZE DISTRIBUTION
# ============================================================================

# Maps artifact name -> dict with the ranked size table ("data", pandas), the
# community column used ("comm_col"), and summary counts.
size_distributions = {}

for name, df in community_dfs.items():
    # Find community ID column: prefer a well-known name...
    comm_col = next((c for c in ["community", "community_id", "cluster", "partition", "leiden"]
                     if c in df.columns), None)

    if comm_col is None:
        # ...otherwise fall back to the first integer column whose number of
        # distinct values looks like a community count (more than 2, fewer
        # than half the rows).
        for c in df.columns:
            if df[c].dtype in [pl.Int64, pl.Int32, pl.UInt32, pl.UInt64]:
                unique_vals = df[c].n_unique()
                if 2 < unique_vals < len(df) // 2:  # Reasonable number of communities
                    comm_col = c
                    break

    if comm_col is None:
        append_warning(f"Could not identify community column in {name}")
        continue

    # Compute community sizes. pl.len() replaces the deprecated zero-argument
    # pl.count(). Ties in size are broken by community ID ascending so that
    # the assigned rank is reproducible (group_by output order is not).
    sizes = (
        df.group_by(comm_col)
        .agg(pl.len().alias("size"))
        .sort(["size", comm_col], descending=[True, False])
        .with_row_index("rank", offset=1)
    )

    n_communities = sizes.height
    largest_size = sizes["size"].max()
    smallest_size = sizes["size"].min()

    size_distributions[name] = {
        "data": sizes.to_pandas(),
        "comm_col": comm_col,
        "n_communities": n_communities,
        "largest": largest_size,
        "smallest": smallest_size,
    }

    print(f"\n{name}:")
    print(f"  Community column: {comm_col}")
    print(f"  Number of communities: {n_communities}")
    print(f"  Largest community: {largest_size} nodes")
    print(f"  Smallest community: {smallest_size} nodes")

In [None]:
# ============================================================================
# PLOT COMMUNITY SIZE DISTRIBUTIONS
# ============================================================================

# One bar chart per community artifact, saved to the report figures directory.
if size_distributions:
    for name, info in size_distributions.items():
        sizes_df = info["data"]

        # Take top 30 for readability (the table is already sorted by size)
        top_30 = sizes_df.head(30)

        fig, ax = plt.subplots(figsize=(12, 8))

        colors = sns.color_palette("coolwarm", len(top_30))
        ax.bar(range(len(top_30)), top_30["size"], color=colors)

        ax.set_xlabel("Community Rank")
        ax.set_ylabel("Community Size (nodes)")
        ax.set_title(f"Community Size Distribution: {name}\n(Top 30 of {info['n_communities']} communities)")

        # Add community ID labels for top 10
        ax.set_xticks(range(min(10, len(top_30))))
        ax.set_xticklabels(top_30[info["comm_col"]].head(10).astype(str), rotation=45)

        fig.tight_layout()
        fig_path = FIGURES_REPORT_DIR / f"{NOTEBOOK_ID}_community_sizes__{name}.png"
        fig.savefig(fig_path, dpi=150)
        plt.show()
        # Release the figure explicitly so figures do not accumulate in memory
        # when many artifacts are plotted or a non-inline backend is used.
        plt.close(fig)
        print(f"✅ Saved: {fig_path.name}")
else:
    print("Not available: no community size data to plot")

<a id="airline"></a>
## 5. Dominant Airline Analysis

If airline attributes are available, identify the dominant airline per community.

In [None]:
# ============================================================================
# DOMINANT AIRLINE ANALYSIS
# ============================================================================

# Set to True once at least one artifact has both an airline column and an
# identified community column.
airline_analysis_available = False

for name, df in community_dfs.items():
    # Check for airline column
    airline_col = next((c for c in ["carrier", "airline", "OP_UNIQUE_CARRIER", "op_carrier"]
                        if c in df.columns), None)

    # Reuse the community column identified by the size-distribution cell.
    comm_col = size_distributions.get(name, {}).get("comm_col")

    if airline_col and comm_col:
        airline_analysis_available = True
        print(f"\n{'='*60}")
        print(f"DOMINANT AIRLINE ANALYSIS: {name}")
        print(f"{'='*60}")

        # Compute dominant airline per community: count rows per
        # (community, airline), order each community's airlines by count
        # descending, and keep the first. The airline name is the final sort
        # key so that ties in count resolve the same way on every run.
        # pl.len() replaces the deprecated zero-argument pl.count().
        dominant = (
            df.group_by([comm_col, airline_col])
            .agg(pl.len().alias("count"))
            .sort([comm_col, "count", airline_col], descending=[False, True, False])
            .group_by(comm_col, maintain_order=True)
            .first()
        )

        # Add community size and compute share
        comm_sizes = df.group_by(comm_col).agg(pl.len().alias("total_size"))
        dominant = dominant.join(comm_sizes, on=comm_col)
        dominant = dominant.with_columns(
            (pl.col("count") / pl.col("total_size")).alias("dominant_share")
        ).sort(["total_size", comm_col], descending=[True, False])  # ID breaks size ties

        print(f"\nTop 15 communities by size with dominant airline:")
        display(dominant.head(15).to_pandas())

        # Save for report
        dominant_path = TABLES_REPORT_DIR / f"{NOTEBOOK_ID}_community_dominant_airline__{name}.csv"
        dominant.to_pandas().to_csv(dominant_path, index=False)
        print(f"✅ Saved: {dominant_path.name}")
    else:
        # A missing community column was already reported by the
        # size-distribution cell, so only the airline case is announced here.
        if not airline_col:
            print(f"\n{name}: No airline column found - dominant airline analysis not available")

if not airline_analysis_available:
    print("\nNot available: no community data has airline attributes")

<a id="bridging"></a>
## 6. Community Bridging Nodes

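Identify nodes that may bridge between communities (for cross-reference with centrality). This notebook only records a placeholder; the analysis itself is deferred to Notebook 09 (Synthesis), where community assignments are joined with edge data and centrality metrics.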

In [None]:
# ============================================================================
# COMMUNITY BRIDGING NODES (PLACEHOLDER)
# ============================================================================

# No computation happens here: finding nodes connected to several communities
# needs edge data, so the work is deferred to Notebook 09, where it is
# combined with centrality data. This cell only announces that deferral.

print(
    "Community bridging analysis will be performed in Notebook 09 (Synthesis)",
    "This requires joining community assignments with edge data and centrality metrics.",
    sep="\n",
)

<a id="interpretation"></a>
## 7. Interpretation

### Key Findings (Evidence-Grounded)

*(Populated after running cells above)*

### Mechanistic Explanation

- **Leiden algorithm**: Optimizes modularity with guaranteed community connectivity
- **SBM (Stochastic Block Model)**: Probabilistic generative model for community structure
- **Resolution parameter**: Controls granularity of detected communities

### Alternative Explanations
1. Carrier-dominated communities may reflect hub-and-spoke network design
2. Geographic communities may emerge from regional travel patterns
3. Many small communities may indicate over-resolution or network fragmentation

### Evidence Links
- Table: `results/tables/report/nb04_community_sizes.csv`
- Figure: `results/figures/report/nb04_community_sizes__*.png`

<a id="write-outputs"></a>
## 8. Write Report Outputs

In [None]:
# ============================================================================
# WRITE REPORT OUTPUTS
# ============================================================================

# Combine all community size distributions into one long table, tagging each
# row with the artifact it came from.
if size_distributions:
    # DataFrame.assign returns a copy, so the per-artifact frames stored in
    # size_distributions are left untouched.
    combined_df = pd.concat(
        [info["data"].assign(source=name) for name, info in size_distributions.items()],
        ignore_index=True,
    )
    combined_path = TABLES_REPORT_DIR / f"{NOTEBOOK_ID}_community_sizes.csv"
    combined_df.to_csv(combined_path, index=False)
    print(f"✅ Wrote: {combined_path}")
else:
    # Make the empty case visible instead of implying a table was produced.
    print("Not available: no community size data to write")

print(f"\n📋 All {NOTEBOOK_ID} outputs written.")

<a id="reproducibility"></a>
## 9. Reproducibility Notes

### Input Files Consumed
- `results/analysis/airport_leiden_membership.parquet`
- `results/analysis/airport_sbm_membership.parquet`
- `results/analysis/flight_leiden_membership.parquet`

### Assumptions Made
1. Community assignments are from a single run (deterministic with fixed seed)
2. Node identifiers match between community and centrality tables
3. Resolution parameters are as specified in config.yaml

### Sorting/Ordering
- Communities ranked by size descending
- Stable tie-breaking by community ID ascending

### Caveats
- Resolution parameter choice affects number and size of communities
- Different algorithms (Leiden vs SBM) may yield different partitions
- Airline dominance is computed from node-level carrier attributes if available

### Outputs Generated
| Artifact | Path |
|----------|------|
| Community Sizes | `results/tables/report/nb04_community_sizes.csv` |
| Size Distribution Figures | `results/figures/report/nb04_community_sizes__*.png` |
| Dominant Airline | `results/tables/report/nb04_community_dominant_airline__*.csv` |