## Table of Contents
1. [Setup and Imports](#setup)
2. [Discover Embeddings/LinkPred Artifacts](#discover)
3. [Load and Inspect Metrics](#load-metrics)
4. [Embedding Quality Analysis](#embedding-quality)
5. [Top Predictions Analysis](#top-predictions)
6. [Prediction Plausibility](#plausibility)
7. [Interpretation](#interpretation)
8. [Write Report Outputs](#write-outputs)
9. [Reproducibility Notes](#reproducibility)

In [None]:
# ============================================================================
# SETUP AND IMPORTS
# ============================================================================

import json
from pathlib import Path
from datetime import datetime
import warnings

import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook identity: NOTEBOOK_ID prefixes report artifacts and tags log entries
NOTEBOOK_ID = "nb07"
NOTEBOOK_NAME = "embeddings_linkpred__evaluation_and_plausibility"

# Project layout, resolved two levels above the current working directory
REPO_ROOT = Path.cwd().parent.parent
RESULTS_DIR = REPO_ROOT / "results"
ANALYSIS_DIR = RESULTS_DIR / "analysis"
TABLES_DIR = RESULTS_DIR / "tables"
TABLES_REPORT_DIR = TABLES_DIR / "report"
FIGURES_REPORT_DIR = RESULTS_DIR / "figures" / "report"
WARNINGS_LOG = TABLES_REPORT_DIR / "_warnings.log"

# Shared look for every figure produced below
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("husl")

# Report output folders are created up front so later writes cannot fail on a
# missing directory
for _report_dir in (TABLES_REPORT_DIR, FIGURES_REPORT_DIR):
    _report_dir.mkdir(parents=True, exist_ok=True)

_analysis_dir_present = ANALYSIS_DIR.exists()
print(f"Analysis dir exists: {_analysis_dir_present}")

In [None]:
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def append_warning(message: str, notebook_id: str = NOTEBOOK_ID) -> None:
    """Append a warning to the consolidated warnings log and echo it.

    The entry is written to ``WARNINGS_LOG`` in the form
    ``[<ISO timestamp>] [<notebook_id>] <message>`` and the message is also
    printed with a ``WARNING:`` prefix.

    Args:
        message: Human-readable description of the problem.
        notebook_id: Tag identifying the emitting notebook; defaults to
            ``NOTEBOOK_ID``.
    """
    timestamp = datetime.now().isoformat()
    # Make sure the log's folder exists so this helper never fails just
    # because the directory was not created (or was removed) beforehand.
    WARNINGS_LOG.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: messages embed exception text and file names, which may
    # contain characters the platform default encoding cannot represent.
    with open(WARNINGS_LOG, "a", encoding="utf-8") as f:
        f.write(f"[{timestamp}] [{notebook_id}] {message}\n")
    print(f"WARNING: {message}")

def safe_load_parquet(path: Path) -> pl.DataFrame | None:
    """Read ``path`` as a parquet file with polars.

    Any exception raised while reading is reported through
    ``append_warning`` (using only the file name) and ``None`` is returned
    instead of the frame.
    """
    try:
        frame = pl.read_parquet(path)
    except Exception as exc:
        append_warning(f"Failed to load {path.name}: {exc}")
        frame = None
    return frame

def flatten_metrics(metrics_dict: dict, prefix: str = "") -> dict:
    """Collapse a nested metrics mapping into a single-level dict.

    Nested keys are joined with a double underscore (``{"a": {"b": 1}}``
    becomes ``{"a__b": 1}``). ``prefix`` is prepended verbatim to every key;
    when it is empty, top-level keys are kept exactly as they are.
    """

    def _leaf_items(mapping, lead):
        # Depth-first walk that yields (flattened_key, value) for every
        # non-dict value, in the mapping's own iteration order.
        for name, value in mapping.items():
            path = f"{lead}{name}" if lead else name
            if isinstance(value, dict):
                yield from _leaf_items(value, f"{path}__")
            else:
                yield path, value

    return dict(_leaf_items(metrics_dict, prefix))

<a id="discover"></a>
## 2. Discover Embeddings/LinkPred Artifacts

In [None]:
# ============================================================================
# DISCOVER ARTIFACTS
# ============================================================================

embed_keywords = ["embed", "node2vec", "linkpred", "auc", "ap", "mrr", "hits", "prediction"]

# Short metric abbreviations ("auc", "ap", "mrr") only count when they form a
# whole token of the file name; as bare substrings they also hit unrelated
# names such as "graph_stats" or "route_map". Longer keywords remain substring
# matches so that e.g. "embed" still finds "airport_embeddings".
_token_keywords = {kw for kw in embed_keywords if len(kw) <= 3}
_substring_keywords = [kw for kw in embed_keywords if len(kw) > 3]


def _is_embed_artifact(path: Path) -> bool:
    """Return True when the file name of ``path`` matches an embedding/link-prediction keyword."""
    name = path.name.lower()
    if any(kw in name for kw in _substring_keywords):
        return True
    # Split on the usual file-name separators to obtain whole tokens.
    tokens = name.replace("-", "_").replace(".", "_").split("_")
    return any(tok in _token_keywords for tok in tokens)


# Search in analysis directory
analysis_files = list(ANALYSIS_DIR.glob("*.parquet")) + list(ANALYSIS_DIR.glob("*.json"))
embed_candidates = [f for f in analysis_files if _is_embed_artifact(f)]

# Search in tables directory for predictions. A file can satisfy both
# patterns (e.g. "linkpred_top_predictions.csv"), so the two result sets are
# merged before counting to avoid listing the same file twice.
table_files = sorted(
    set(TABLES_DIR.glob("*linkpred*.csv")) | set(TABLES_DIR.glob("*prediction*.csv"))
)

print(f"Found {len(embed_candidates)} embedding/linkpred artifacts in analysis/:")
for ef in sorted(embed_candidates):
    print(f"  - {ef.name}")

print(f"\nFound {len(table_files)} prediction tables in tables/:")
for tf in table_files:
    print(f"  - {tf.name}")

# Primary files
linkpred_metrics_file = ANALYSIS_DIR / "linkpred_metrics.json"
embeddings_file = ANALYSIS_DIR / "airport_embeddings.parquet"
predictions_file = TABLES_DIR / "linkpred_top_predictions.csv"

print(f"\nLink pred metrics exists: {linkpred_metrics_file.exists()}")
print(f"Embeddings exists: {embeddings_file.exists()}")
print(f"Top predictions exists: {predictions_file.exists()}")

<a id="load-metrics"></a>
## 3. Load and Inspect Metrics

In [None]:
# ============================================================================
# LOAD AND INSPECT METRICS
# ============================================================================

# linkpred_metrics: parsed JSON payload (None when unavailable)
# metrics_flat: single-level view of the payload, consumed by the summary table
linkpred_metrics = None
metrics_flat = {}

if linkpred_metrics_file.exists():
    try:
        # Explicit UTF-8 so the read does not depend on the platform default.
        with open(linkpred_metrics_file, encoding="utf-8") as f:
            linkpred_metrics = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError. An
        # unreadable or corrupt file is logged and skipped, like the parquet
        # loader does, instead of aborting the notebook.
        append_warning(f"Failed to load {linkpred_metrics_file.name}: {e}")
        print("Not available: link prediction metrics could not be read")
    else:
        print("LINK PREDICTION METRICS:")
        print(json.dumps(linkpred_metrics, indent=2))

        # Flatten for table output (only a JSON object can be flattened)
        if isinstance(linkpred_metrics, dict):
            metrics_flat = flatten_metrics(linkpred_metrics)
        else:
            append_warning(
                f"{linkpred_metrics_file.name} does not contain a JSON object; "
                "metrics table skipped"
            )
else:
    append_warning("linkpred_metrics.json not found")
    print("Not available: link prediction metrics not found")

In [None]:
# ============================================================================
# SUMMARIZE METRICS IN TABLE FORM
# ============================================================================

# Keep only numeric values; strings, lists and nulls cannot go in the table.
numeric_rows = [
    {"metric": k, "value": v}
    for k, v in metrics_flat.items()
    if isinstance(v, (int, float))
]

# Fixing the columns keeps the "metric" column present even when no numeric
# value exists; without it the lookup below raises KeyError on an empty frame.
metrics_df = pd.DataFrame(numeric_rows, columns=["metric", "value"])

if len(metrics_df) > 0:
    print("\nFLATTENED METRICS TABLE:")
    display(metrics_df)

    # Highlight key performance indicators
    key_metrics = ["auc", "ap", "average_precision", "roc_auc", "mrr", "hits@10"]
    # Match each indicator only as a whole token (not glued to other letters
    # or digits), so "ap" does not fire on names like "n_snapshots" or
    # "graph_density". The entries are plain literals without regex
    # metacharacters, so they can be joined directly.
    key_pattern = r"(?<![a-z0-9])(?:" + "|".join(key_metrics) + r")(?![a-z0-9])"
    is_key_metric = metrics_df["metric"].str.lower().str.contains(
        key_pattern, regex=True, na=False
    )
    key_rows = metrics_df[is_key_metric]

    if len(key_rows) > 0:
        print("\nüìä KEY PERFORMANCE METRICS:")
        display(key_rows)
else:
    print("Not available: no metrics to display")

<a id="embedding-quality"></a>
## 4. Embedding Quality Analysis

Perform basic sanity checks on embeddings if available.

In [None]:
# ============================================================================
# EMBEDDING QUALITY ANALYSIS
# ============================================================================

# Loaded embedding table (None when the file is missing or unreadable)
embeddings = None

if embeddings_file.exists():
    embeddings = safe_load_parquet(embeddings_file)

    if embeddings is not None:
        print(f"Embeddings shape: {embeddings.shape}")
        print(f"Columns: {embeddings.columns}")

        # Identify embedding dimension columns (usually numeric, many columns)
        numeric_cols = [c for c in embeddings.columns
                        if embeddings[c].dtype in [pl.Float64, pl.Float32]]

        if len(numeric_cols) > 5:  # Likely embedding dimensions
            print(f"\nDetected {len(numeric_cols)} embedding dimensions")

            # Compute one L2 norm per row
            embed_matrix = embeddings.select(numeric_cols).to_numpy()
            norms = np.linalg.norm(embed_matrix, axis=1)

            # Rows with missing or infinite coordinates give non-finite norms,
            # which would silently turn every statistic below into NaN.
            # Report them and keep only the usable rows.
            finite_mask = np.isfinite(norms)
            non_finite = int((~finite_mask).sum())
            if non_finite > 0:
                append_warning(f"{non_finite} embeddings have a non-finite norm")
                norms = norms[finite_mask]

            if norms.size == 0:
                # min()/max() raise ValueError on an empty array, so an empty
                # (or fully invalid) table is logged and skipped instead.
                append_warning("No usable embedding rows for norm statistics")
            else:
                print("\nEmbedding Norm Statistics:")
                print(f"  Mean: {norms.mean():.4f}")
                print(f"  Std: {norms.std():.4f}")
                print(f"  Min: {norms.min():.4f}")
                print(f"  Max: {norms.max():.4f}")

                # Check for degenerate embeddings
                zero_norms = (norms < 1e-6).sum()
                if zero_norms > 0:
                    append_warning(f"{zero_norms} embeddings have near-zero norm")

                # Plot norm distribution
                fig, ax = plt.subplots(figsize=(10, 5))
                ax.hist(norms, bins=50, edgecolor="white", alpha=0.8)
                ax.set_xlabel("Embedding L2 Norm")
                ax.set_ylabel("Frequency")
                ax.set_title(f"Embedding Norm Distribution (dim={len(numeric_cols)})")
                ax.axvline(norms.mean(), color="red", linestyle="--", label=f"Mean: {norms.mean():.3f}")
                ax.legend()

                plt.tight_layout()
                fig_path = FIGURES_REPORT_DIR / f"{NOTEBOOK_ID}_embedding_norms_distribution.png"
                plt.savefig(fig_path, dpi=150)
                plt.show()
                print(f"‚úÖ Saved: {fig_path.name}")
        else:
            print("Could not identify embedding dimensions")
else:
    print("Not available: embeddings file not found")

<a id="top-predictions"></a>
## 5. Top Predictions Analysis

Examine the top predicted new links.

In [None]:
# ============================================================================
# TOP PREDICTIONS ANALYSIS
# ============================================================================

# Top predicted links as a pandas frame (None when no file could be found)
predictions = None

if predictions_file.exists():
    predictions = pd.read_csv(predictions_file)
    print(f"Top predictions shape: {predictions.shape}")
    print(f"Columns: {list(predictions.columns)}")
    display(predictions.head(20))
else:
    print("Not available: top predictions file not found")

    # Try alternative locations. glob() yields entries in filesystem order,
    # so the candidates are sorted to make the chosen fallback reproducible.
    alt_candidates = sorted(TABLES_DIR.glob("*predict*.csv"))
    if alt_candidates:
        alt_file = alt_candidates[0]
        print(f"Found alternative: {alt_file.name}")
        # Record the substitution: the rest of the notebook now analyses a
        # file other than the documented primary input.
        append_warning(
            f"{predictions_file.name} not found; using alternative {alt_file.name}"
        )
        predictions = pd.read_csv(alt_file)
        display(predictions.head(10))
    else:
        append_warning(f"{predictions_file.name} not found")

<a id="plausibility"></a>
## 6. Prediction Plausibility

Assess whether predicted links are plausible based on network structure.

In [None]:
# ============================================================================
# PREDICTION PLAUSIBILITY ANALYSIS
# ============================================================================

if predictions is None or len(predictions) == 0:
    print("Not available: no predictions to analyze")
else:
    # Resolve the source/target/score columns: for each role, the first
    # candidate name that exists in the table wins (None if none exists).
    _src_matches = [c for c in ["source", "origin", "airport_1", "src"] if c in predictions.columns]
    _dst_matches = [c for c in ["target", "dest", "airport_2", "dst"] if c in predictions.columns]
    _score_matches = [c for c in ["score", "probability", "pred_score", "link_prob"] if c in predictions.columns]
    src_col = _src_matches[0] if _src_matches else None
    dst_col = _dst_matches[0] if _dst_matches else None
    score_col = _score_matches[0] if _score_matches else None

    if not (src_col and dst_col):
        append_warning("Could not identify source/target columns in predictions")
    else:
        print(f"\nPrediction columns: source={src_col}, target={dst_col}, score={score_col}")

        # Ten most frequent endpoints on each side of the predicted links
        top_sources = predictions[src_col].value_counts().head(10)
        top_targets = predictions[dst_col].value_counts().head(10)

        print("\nMost frequent SOURCE airports in predictions:")
        print(top_sources)

        print("\nMost frequent TARGET airports in predictions:")
        print(top_targets)

        # Share of predictions that touch at least one airport from the
        # hard-coded mega-hub list
        mega_hubs = {"ATL", "ORD", "DFW", "DEN", "LAX", "CLT", "PHX", "IAH", "SFO", "EWR"}
        _src_is_hub = predictions[src_col].isin(mega_hubs)
        _dst_is_hub = predictions[dst_col].isin(mega_hubs)
        hub_predictions = predictions[_src_is_hub | _dst_is_hub]
        if len(predictions) > 0:
            hub_rate = len(hub_predictions) / len(predictions)
        else:
            hub_rate = 0

        print(f"\nüìä PLAUSIBILITY CHECK:")
        print(f"   Predictions involving mega-hubs: {len(hub_predictions)} ({hub_rate:.1%})")
        if hub_rate <= 0.5:
            print("   ‚úÖ Predictions show structural diversity")
        else:
            print("   ‚ö†Ô∏è High hub concentration suggests predictions may be trivial")

<a id="interpretation"></a>
## 7. Interpretation

### Key Findings (Evidence-Grounded)

*(Populated after running cells above)*

### Mechanistic Explanation

- **Node embeddings**: Capture structural similarity between airports via random walks (e.g., node2vec) or matrix factorization
- **Link prediction**: The dot product of two node embeddings is used as a score for how likely a link between them is
- **AUC/AP metrics**: Measure how well the scores rank positive (true) edges above negative (non-existent) edges

### Evaluation Caveats
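1. **Time split leakage**: If test edges overlap temporally with the training period, reported performance is inflated
2. **Class imbalance**: Link prediction is extremely imbalanced (non-edges vastly outnumber true edges), so the metrics depend heavily on how negative edges are sampled
3. **Trivial predictions**: High-degree nodes (hubs) are easy targets, so hub-heavy predictions can score well without being informative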

### Evidence Links
- Table: `results/tables/report/nb07_linkpred_metrics_flat.csv`
- Figure: `results/figures/report/nb07_embedding_norms_distribution.png`

<a id="write-outputs"></a>
## 8. Write Report Outputs

In [None]:
# ============================================================================
# WRITE REPORT OUTPUTS
# ============================================================================

# Flattened metrics table: written only when it has at least one row
_has_metrics = len(metrics_df) > 0
if _has_metrics:
    metrics_path = TABLES_REPORT_DIR.joinpath(f"{NOTEBOOK_ID}_linkpred_metrics_flat.csv")
    metrics_df.to_csv(metrics_path, index=False)
    print(f"‚úÖ Wrote: {metrics_path}")

# Predictions table: written only when it was loaded and is non-empty
_has_predictions = predictions is not None and len(predictions) > 0
if _has_predictions:
    pred_path = TABLES_REPORT_DIR.joinpath(f"{NOTEBOOK_ID}_top_predictions_annotated.csv")
    predictions.to_csv(pred_path, index=False)
    print(f"‚úÖ Wrote: {pred_path}")

print(f"\nüìã All {NOTEBOOK_ID} outputs written.")

<a id="reproducibility"></a>
## 9. Reproducibility Notes

### Input Files Consumed
- `results/analysis/linkpred_metrics.json`
- `results/analysis/airport_embeddings.parquet`
- `results/tables/linkpred_top_predictions.csv`

### Assumptions Made
1. Embeddings trained with node2vec or similar random walk method
2. Link prediction uses temporal train/test split
3. Metrics computed on hold-out edge set

### Outputs Generated
| Artifact | Path |
|----------|------|
| Metrics Table | `results/tables/report/nb07_linkpred_metrics_flat.csv` |
| Embedding Norms | `results/figures/report/nb07_embedding_norms_distribution.png` |
| Annotated Predictions | `results/tables/report/nb07_top_predictions_annotated.csv` |