## Table of Contents
1. [Setup and Imports](#setup)
2. [Notebook I/O Inventory](#io-inventory)
3. [Assumptions Table](#assumptions)
4. [Limitations Table](#limitations)
5. [Reproducibility Checklist](#checklist)
6. [Warnings Log Review](#warnings)
7. [Final Summary](#summary)
8. [Write Report Outputs](#write-outputs)

In [None]:
# ============================================================================
# SETUP AND IMPORTS
# ============================================================================

import json
from pathlib import Path
from datetime import datetime
import warnings

import pandas as pd
import numpy as np

# Project paths
# REPO_ROOT is derived from the current working directory: the notebook is
# expected to be launched from a folder two levels below the repository root.
REPO_ROOT = Path.cwd().parent.parent
RESULTS_DIR = REPO_ROOT / "results"
ANALYSIS_DIR = RESULTS_DIR / "analysis"
BUSINESS_DIR = RESULTS_DIR / "business"
NETWORKS_DIR = RESULTS_DIR / "networks"
TABLES_DIR = RESULTS_DIR / "tables"
FIGURES_DIR = RESULTS_DIR / "figures"
TABLES_REPORT_DIR = TABLES_DIR / "report"
FIGURES_REPORT_DIR = FIGURES_DIR / "report"
LOGS_DIR = RESULTS_DIR / "logs"
WARNINGS_LOG = TABLES_REPORT_DIR / "_warnings.log"

# Notebook identity
NOTEBOOK_ID = "nb10"
NOTEBOOK_NAME = "appendix__assumptions_limitations_reproducibility"

# Capture whether the results tree was already there BEFORE creating anything:
# mkdir(parents=True) below also creates RESULTS_DIR, so checking afterwards
# would always report True and hide a wrong working directory.
_results_dir_preexisting = RESULTS_DIR.exists()

# Ensure output directories exist
TABLES_REPORT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Results dir exists: {_results_dir_preexisting}")
print(f"Warnings log exists: {WARNINGS_LOG.exists()}")

In [None]:
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def append_warning(message: str, notebook_id: str = NOTEBOOK_ID) -> None:
    """Append a warning to the consolidated warnings log and echo it.

    One line is appended to ``WARNINGS_LOG`` in the form
    ``[<ISO timestamp>] [<notebook_id>] <message>`` and the message is also
    printed with a ``WARNING:`` prefix.

    Args:
        message: Warning text to record.
        notebook_id: Tag identifying the notebook that raised the warning.
            Defaults to the value of ``NOTEBOOK_ID`` at definition time.
    """
    timestamp = datetime.now().isoformat()
    # Explicit UTF-8 so non-ASCII message text is written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(WARNINGS_LOG, "a", encoding="utf-8") as f:
        f.write(f"[{timestamp}] [{notebook_id}] {message}\n")
    print(f"WARNING: {message}")

<a id="io-inventory"></a>
## 2. Notebook I/O Inventory

Complete catalog of all inputs consumed and outputs produced by each notebook.

In [None]:
# ============================================================================
# NOTEBOOK I/O INVENTORY
# ============================================================================

# Catalog of every artifact each report notebook reads ("input") or writes
# ("output"). Paths are relative to the repository root; glob patterns denote
# a family of files.
io_inventory = [
    # Notebook 01
    {"notebook": "nb01", "direction": "input", "path": "results/logs/*_manifest.json", "description": "Pipeline run manifests"},
    {"notebook": "nb01", "direction": "output", "path": "results/tables/report/nb01_reconciliation_table.csv", "description": "Manifest reconciliation"},
    {"notebook": "nb01", "direction": "output", "path": "results/tables/report/nb01_gaps_table.csv", "description": "Missing artifacts"},

    # Notebook 02
    {"notebook": "nb02", "direction": "input", "path": "results/networks/airport_nodes.parquet", "description": "Airport node attributes"},
    {"notebook": "nb02", "direction": "input", "path": "results/networks/airport_edges.parquet", "description": "Airport edge list"},
    {"notebook": "nb02", "direction": "output", "path": "results/tables/report/nb02_network_stats.csv", "description": "Network statistics"},
    {"notebook": "nb02", "direction": "output", "path": "results/tables/report/nb02_top_routes.csv", "description": "Top routes by weight"},
    {"notebook": "nb02", "direction": "output", "path": "results/figures/report/nb02_degree_distribution.png", "description": "Degree distribution"},

    # Notebook 03
    {"notebook": "nb03", "direction": "input", "path": "results/analysis/airport_centrality.parquet", "description": "Centrality metrics"},
    {"notebook": "nb03", "direction": "output", "path": "results/tables/report/nb03_centrality_topK.csv", "description": "Top-K central airports"},
    {"notebook": "nb03", "direction": "output", "path": "results/figures/report/nb03_centrality_distributions.png", "description": "Centrality distributions"},

    # Notebook 04
    {"notebook": "nb04", "direction": "input", "path": "results/analysis/airport_leiden_membership.parquet", "description": "Leiden communities"},
    {"notebook": "nb04", "direction": "output", "path": "results/tables/report/nb04_community_sizes.csv", "description": "Community sizes"},
    {"notebook": "nb04", "direction": "output", "path": "results/figures/report/nb04_community_size_distribution.png", "description": "Community distribution"},

    # Notebook 05
    {"notebook": "nb05", "direction": "input", "path": "results/analysis/robustness_curves.parquet", "description": "Robustness curves"},
    {"notebook": "nb05", "direction": "input", "path": "results/analysis/robustness_summary.json", "description": "Robustness summary"},
    {"notebook": "nb05", "direction": "output", "path": "results/tables/report/nb05_robustness_metrics.csv", "description": "Robustness metrics"},
    {"notebook": "nb05", "direction": "output", "path": "results/figures/report/nb05_robustness_curves.png", "description": "Robustness curves plot"},

    # Notebook 06
    {"notebook": "nb06", "direction": "input", "path": "results/analysis/delay_cascades.parquet", "description": "Delay cascades"},
    {"notebook": "nb06", "direction": "input", "path": "results/analysis/delay_propagation_summary.json", "description": "Propagation summary"},
    {"notebook": "nb06", "direction": "output", "path": "results/tables/report/nb06_superspreaders.csv", "description": "Superspreader airports"},
    {"notebook": "nb06", "direction": "output", "path": "results/figures/report/nb06_cascade_distribution.png", "description": "Cascade size distribution"},

    # Notebook 07
    {"notebook": "nb07", "direction": "input", "path": "results/analysis/linkpred_metrics.json", "description": "Link prediction metrics"},
    {"notebook": "nb07", "direction": "input", "path": "results/analysis/airport_embeddings.parquet", "description": "Node embeddings"},
    {"notebook": "nb07", "direction": "output", "path": "results/tables/report/nb07_linkpred_summary.csv", "description": "Link prediction summary"},

    # Notebook 08
    {"notebook": "nb08", "direction": "input", "path": "results/business/airline_summary_metrics.parquet", "description": "Airline KPIs"},
    {"notebook": "nb08", "direction": "input", "path": "results/business/hub_concentration.parquet", "description": "Hub concentration"},
    {"notebook": "nb08", "direction": "output", "path": "results/tables/report/nb08_airline_kpi_summary.csv", "description": "Airline KPI summary"},

    # Notebook 09
    {"notebook": "nb09", "direction": "input", "path": "results/tables/report/nb*.csv", "description": "All prior notebook tables"},
    {"notebook": "nb09", "direction": "output", "path": "results/tables/report/nb09_master_evidence_index.csv", "description": "Evidence index"},
    {"notebook": "nb09", "direction": "output", "path": "results/tables/report/nb09_cross_domain_synthesis.csv", "description": "Cross-domain synthesis"},

    # Notebook 10
    {"notebook": "nb10", "direction": "input", "path": "results/tables/report/_warnings.log", "description": "Warnings log"},
    {"notebook": "nb10", "direction": "output", "path": "results/tables/report/nb10_notebook_io_index.csv", "description": "This I/O index"},
    {"notebook": "nb10", "direction": "output", "path": "results/tables/report/nb10_assumptions_table.csv", "description": "Assumptions catalog"},
    {"notebook": "nb10", "direction": "output", "path": "results/tables/report/nb10_limitations_table.csv", "description": "Limitations catalog"},
    # The write-outputs cell of this notebook also saves the checklist, so it
    # belongs in the inventory alongside the other nb10 outputs.
    {"notebook": "nb10", "direction": "output", "path": "results/tables/report/nb10_final_checklist.csv", "description": "Reproducibility checklist"},
]

io_df = pd.DataFrame(io_inventory)
print(f"Total I/O entries: {len(io_df)}")

# Summary by notebook and direction: one row per notebook, one column per
# direction, with 0 where a notebook has no entries for that direction.
print("\nI/O by notebook:")
display(io_df.groupby(["notebook", "direction"]).size().unstack(fill_value=0))

<a id="assumptions"></a>
## 3. Assumptions Table

Comprehensive list of methodological assumptions.

In [None]:
# ============================================================================
# ASSUMPTIONS TABLE
# ============================================================================

# Column order shared by every assumption record below.
_ASSUMPTION_FIELDS = ("category", "assumption", "impact", "mitigation")

_assumption_rows = (
    # Data assumptions
    ("data", "Flight data is representative of 2024 operations", "medium", "Use full year data"),
    ("data", "Missing values are missing at random (MAR)", "low", "Document missingness rates"),
    ("data", "Cancelled flights excluded from network construction", "medium", "Sensitivity analysis possible"),
    # Network assumptions
    ("network", "Undirected edges for airport network", "medium", "Could rebuild as directed"),
    ("network", "Edge weight = flight count (not passengers)", "medium", "Weight by distance alternative"),
    ("network", "Flight network limited to top-50 airports", "high", "Computational constraint documented"),
    # Analysis assumptions
    ("analysis", "Leiden resolution parameter from config", "medium", "Resolution sweep possible"),
    ("analysis", "IC model with uniform infection probability", "high", "Document beta parameter"),
    ("analysis", "Link prediction uses temporal split", "low", "Standard evaluation protocol"),
    # Business assumptions
    ("business", "Cost proxies use literature estimates", "high", "Document parameter sources"),
    ("business", "Hub concentration = top-3 airport share", "low", "Alternative definitions possible"),
)

# Expand the compact rows into one dict per assumption (keys in field order).
assumptions = [dict(zip(_ASSUMPTION_FIELDS, row)) for row in _assumption_rows]

assumptions_df = pd.DataFrame(assumptions)
print(f"Total assumptions: {len(assumptions_df)}")

# Number of assumptions recorded under each category.
print("\nAssumptions by category:")
_assumptions_per_category = assumptions_df.groupby("category").size()
display(_assumptions_per_category)

# Only the rows whose impact is rated "high".
print("\nHigh-impact assumptions:")
_is_high_impact = assumptions_df["impact"] == "high"
display(assumptions_df[_is_high_impact])

<a id="limitations"></a>
## 4. Limitations Table

Known limitations and threats to validity.

In [None]:
# ============================================================================
# LIMITATIONS TABLE
# ============================================================================

# Column order shared by every limitation record below.
_LIMITATION_FIELDS = ("type", "limitation", "applies_to", "severity")

_limitation_rows = (
    # Internal validity
    ("internal", "Correlation does not imply causation", "all analyses", "high"),
    ("internal", "IC model is a simplification of delay dynamics", "delay propagation", "medium"),
    ("internal", "Robustness simulation may not reflect real failures", "robustness analysis", "medium"),
    # External validity
    ("external", "US domestic flights only (no international)", "all analyses", "medium"),
    ("external", "2024 data may not generalize to other years", "all analyses", "low"),
    ("external", "Excludes cargo and general aviation", "network construction", "low"),
    # Construct validity
    ("construct", "Centrality may not capture operational importance", "centrality analysis", "medium"),
    ("construct", "Communities may not reflect geographic/business regions", "community detection", "medium"),
    # Statistical validity
    ("statistical", "No confidence intervals for many metrics", "all analyses", "medium"),
    ("statistical", "Single random seed (deterministic but arbitrary)", "stochastic analyses", "low"),
)

# Expand the compact rows into one dict per limitation (keys in field order).
limitations = [dict(zip(_LIMITATION_FIELDS, row)) for row in _limitation_rows]

limitations_df = pd.DataFrame(limitations)
print(f"Total limitations: {len(limitations_df)}")

# Number of limitations recorded under each validity type.
print("\nLimitations by type:")
_limitations_per_type = limitations_df.groupby("type").size()
display(_limitations_per_type)

# Only the rows whose severity is rated "high".
print("\nHigh-severity limitations:")
_is_high_severity = limitations_df["severity"] == "high"
display(limitations_df[_is_high_severity])

<a id="checklist"></a>
## 5. Reproducibility Checklist

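Final verification of report integrity: one checklist item per notebook, with the items that can be confirmed from files on disk verified automatically and the rest left unverified for manual review.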

In [None]:
# ============================================================================
# REPRODUCIBILITY CHECKLIST
# ============================================================================

def _present_and_nonempty(path: Path) -> bool:
    """Return True when *path* is an existing regular file larger than 0 bytes."""
    return path.is_file() and path.stat().st_size > 0


# One entry per report notebook. "verified" stays None for items that are not
# checked automatically, which keeps them distinguishable from a failed check.
checklist_items = [
    {"item": "All pipeline scripts have manifests", "check": "nb01", "verified": None},
    {"item": "Network files are present and non-empty", "check": "nb02", "verified": None},
    {"item": "Centrality computed for all airports", "check": "nb03", "verified": None},
    {"item": "Community assignments cover all nodes", "check": "nb04", "verified": None},
    {"item": "Robustness curves are monotonic", "check": "nb05", "verified": None},
    {"item": "Delay cascades have valid sizes", "check": "nb06", "verified": None},
    {"item": "Link prediction metrics in [0,1]", "check": "nb07", "verified": None},
    {"item": "Business metrics are airline-complete", "check": "nb08", "verified": None},
    {"item": "Evidence index covers all research questions", "check": "nb09", "verified": None},
    {"item": "No unresolved warnings in log", "check": "nb10", "verified": None},
]

# Auto-verify some items. Results are keyed by notebook id rather than list
# position so reordering or inserting checklist entries cannot silently attach
# a result to the wrong item.
auto_checks = {
    # NOTE(review): threshold is 10 manifests; confirm it matches the number of
    # pipeline scripts that are expected to emit one.
    "nb01": len(list(LOGS_DIR.glob("*_manifest.json"))) >= 10,
    # The item promises "present and non-empty" network files, so test both
    # the node and the edge file, and require a non-zero size for each.
    "nb02": all(
        _present_and_nonempty(NETWORKS_DIR / filename)
        for filename in ("airport_nodes.parquet", "airport_edges.parquet")
    ),
    # The remaining automatic checks use file existence as a proxy; they do
    # not inspect file contents.
    "nb03": (ANALYSIS_DIR / "airport_centrality.parquet").exists(),
    "nb04": (ANALYSIS_DIR / "airport_leiden_membership.parquet").exists(),
    "nb05": (ANALYSIS_DIR / "robustness_curves.parquet").exists(),
    "nb06": (ANALYSIS_DIR / "delay_cascades.parquet").exists(),
    "nb07": (ANALYSIS_DIR / "linkpred_metrics.json").exists(),
    "nb08": (BUSINESS_DIR / "airline_summary_metrics.parquet").exists(),
}
for entry in checklist_items:
    if entry["check"] in auto_checks:
        entry["verified"] = auto_checks[entry["check"]]

checklist_df = pd.DataFrame(checklist_items)
print("Reproducibility Checklist:")
display(checklist_df)

# Summary: count only items explicitly verified as True; None (not checked)
# and False (failed) both count as not verified.
verified_count = int(checklist_df["verified"].eq(True).sum())
total_count = len(checklist_df)
print(f"\nVerified: {verified_count}/{total_count} ({100*verified_count/total_count:.0f}%)")

<a id="warnings"></a>
## 6. Warnings Log Review

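Review the warnings recorded in the consolidated warnings log during notebook execution. The total count is reported, and the 20 most recent entries are printed.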

In [None]:
# ============================================================================
# WARNINGS LOG REVIEW
# ============================================================================

# Maximum number of most-recent log lines echoed into the notebook output.
MAX_WARNINGS_SHOWN = 20

try:
    # Read as UTF-8 explicitly rather than relying on the locale default;
    # errors="replace" keeps the review usable even if some line was written
    # under a different encoding.
    with open(WARNINGS_LOG, encoding="utf-8", errors="replace") as f:
        warnings_text = f.read()
except FileNotFoundError:
    # Opening directly (instead of exists() then open()) avoids a race if the
    # log disappears between the check and the read.
    warnings_text = None

if warnings_text is None:
    print("Warnings log does not exist yet.")
elif warnings_text.strip():
    lines = warnings_text.strip().splitlines()
    print(f"Total warnings: {len(lines)}")
    print("\n" + "="*60)
    print("WARNINGS LOG CONTENTS")
    print("="*60)
    # Only the tail of the log is shown, so the omitted entries are the
    # earlier ones: announce them before the tail, not after it.
    if len(lines) > MAX_WARNINGS_SHOWN:
        print(f"... ({len(lines) - MAX_WARNINGS_SHOWN} more warnings not shown)")
    for line in lines[-MAX_WARNINGS_SHOWN:]:
        print(line)
else:
    print("✅ No warnings recorded!")

<a id="summary"></a>
## 7. Final Summary

### Pipeline Completeness
- [ ] All 11 scripts (00-10) executed successfully
- [ ] All manifests present in `results/logs/`
- [ ] All network files present in `results/networks/`
- [ ] All analysis files present in `results/analysis/`
- [ ] All business files present in `results/business/`

### Notebook Completeness
- [ ] Notebooks 01-10 executed without fatal errors
- [ ] All expected outputs written to `results/tables/report/`
- [ ] All expected figures written to `results/figures/report/`
- [ ] Warnings log reviewed and addressed

### Report Readiness
- [ ] Evidence index maps all claims to artifacts
- [ ] Assumptions documented and categorized
- [ ] Limitations acknowledged with severity ratings
- [ ] I/O inventory complete for reproducibility

<a id="write-outputs"></a>
## 8. Write Report Outputs

In [None]:
# ============================================================================
# WRITE REPORT OUTPUTS
# ============================================================================

# Destination of each report table; all live in TABLES_REPORT_DIR and are
# prefixed with this notebook's id.
io_path = TABLES_REPORT_DIR / f"{NOTEBOOK_ID}_notebook_io_index.csv"
assumptions_path = TABLES_REPORT_DIR / f"{NOTEBOOK_ID}_assumptions_table.csv"
limitations_path = TABLES_REPORT_DIR / f"{NOTEBOOK_ID}_limitations_table.csv"
checklist_path = TABLES_REPORT_DIR / f"{NOTEBOOK_ID}_final_checklist.csv"

# Write every table the same way (CSV without the index column), in the
# order: I/O index, assumptions, limitations, checklist.
report_tables = (
    (io_path, io_df),
    (assumptions_path, assumptions_df),
    (limitations_path, limitations_df),
    (checklist_path, checklist_df),
)
for out_path, frame in report_tables:
    frame.to_csv(out_path, index=False)
    print(f"✅ Wrote: {out_path}")

print(f"\n📋 All {NOTEBOOK_ID} outputs written.")
print("\n" + "="*60)
print("APPENDIX NOTEBOOK COMPLETE")
print("="*60)