# 06 Post-Deployment Analysis (MIT Timing + Proxy Lag)

**Purpose**  
Summarize the distribution of MIT `timing` labels, compare risk domains within each timing category, and estimate a cautious reporting-lag proxy (report date minus incident date).

**Outputs preserved:**
- `80_mit_timing_counts.png`
- `81_domain_by_timing_pct.csv`
- `82_reporting_lag_proxy_hist.png`

## Configuration

In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

from src.notebook_utils import ensure_output_dir, load_data

# Locate the project root: stay in the working directory unless it has no
# data/ folder while its parent does (e.g. when launched from a subfolder).
_cwd = Path.cwd()
_data_only_in_parent = not (_cwd / "data").exists() and (_cwd.parent / "data").exists()
PROJECT_ROOT = _cwd.parent if _data_only_in_parent else _cwd

DATA_PATH = PROJECT_ROOT / "data"
OUTPUT_PATH = ensure_output_dir(PROJECT_ROOT / "outputs" / "figures")
TOP_N = 15
DATE_CANDIDATES = [
    "date_published",
    "date_submitted",
    "date_modified",
    "date_downloaded",
]
# Only these report columns are requested from the loader.
REPORT_COLUMNS = [
    "url",
    "source_domain",
    "date_published",
    "date_submitted",
    "date_modified",
    "date_downloaded",
]

loaded_tables = load_data(
    DATA_PATH,
    tables=["mit", "reports", "submissions"],
    reports_usecols=REPORT_COLUMNS,
)

mit_df, reports_df, submissions_df = (
    loaded_tables[table_name] for table_name in ("mit", "reports", "submissions")
)

# reports and submissions are mandatory; the MIT table is optional and each
# later cell checks for it before use.
if any(table is None for table in (reports_df, submissions_df)):
    raise FileNotFoundError("Required tables missing: reports.csv or submissions.csv.")

print("MIT:", mit_df.shape if mit_df is not None else None)
print("Reports:", reports_df.shape)
print("Submissions:", submissions_df.shape)

## Timing Distributions

In [None]:
# Plot and tabulate how often each MIT `timing` label occurs.
# Skips with a message when the MIT table or its `timing` column is absent,
# or when the column holds no non-null values.
if mit_df is None or "timing" not in mit_df.columns:
    print("MIT timing column not found.")
else:
    timing_counts = mit_df["timing"].dropna().astype(str).value_counts()

    if timing_counts.empty:
        # An empty Series makes the pandas bar plot raise; there is nothing
        # to chart, so skip instead of crashing the notebook.
        print("MIT timing column has no non-null values; skipping timing distribution.")
    else:
        fig, ax = plt.subplots(figsize=(8, 4.5))
        # Ascending sort puts the most frequent label at the top of the barh chart.
        timing_counts.sort_values().plot(kind="barh", ax=ax)
        ax.set_title("MIT Timing Distribution (counts)")
        ax.set_xlabel("Count")
        fig.savefig(OUTPUT_PATH / "80_mit_timing_counts.png", bbox_inches="tight", dpi=200)
        plt.show()

        # Share of each label among rows with a non-null timing value.
        timing_pct = (timing_counts / timing_counts.sum() * 100).round(1)
        print("MIT Timing (%):")
        display(timing_pct)

In [None]:
# Cross-tabulate risk domain against timing for the 12 most frequent domains
# and export the row-normalised percentages.
_needed_columns = {"risk_domain", "timing"}
if mit_df is None or not _needed_columns.issubset(mit_df.columns):
    print("MIT missing risk_domain or timing; skipping domain-by-timing.")
else:
    # Keep rows where both labels are present and treat the labels as text.
    timing_domain_df = mit_df.dropna(subset=["risk_domain", "timing"]).astype(
        {"risk_domain": str, "timing": str}
    )

    # Restrict to the 12 most common domains (value_counts is sorted descending).
    top_domains = timing_domain_df["risk_domain"].value_counts().index[:12]
    in_top_domains = timing_domain_df["risk_domain"].isin(top_domains)
    timing_domain_df = timing_domain_df.loc[in_top_domains]

    # Each row (timing label) sums to ~100% across the retained domains.
    share_within_timing = pd.crosstab(
        index=timing_domain_df["timing"],
        columns=timing_domain_df["risk_domain"],
        normalize="index",
    )
    domain_by_timing_pct = share_within_timing.mul(100).round(1)

    print("Risk domains by timing (% within timing):")
    display(domain_by_timing_pct)

    domain_by_timing_pct.to_csv(OUTPUT_PATH / "81_domain_by_timing_pct.csv")

## Reporting Lag Proxy

In [None]:
# Reporting-lag proxy: days between a report's date and the submission's
# incident date, computed only for submissions whose URL also appears in the
# reports table. Every missing prerequisite prints a message and skips.
if "url" in submissions_df.columns and "url" in reports_df.columns:
    # pandas merge matches null keys to each other, so rows without a URL on
    # both sides would be cross-joined into spurious incident/report pairs.
    # Drop them before merging.
    merged_df = submissions_df.dropna(subset=["url"]).merge(
        reports_df.dropna(subset=["url"]),
        on="url",
        how="inner",
        suffixes=("_sub", "_rep"),
    )

    # The incident date only carries the "_sub" suffix when the reports table
    # also has an incident_date column.
    incident_date_col = next(
        (c for c in ("incident_date", "incident_date_sub") if c in merged_df.columns),
        None,
    )
    if incident_date_col is not None:
        # Parse as UTC, then drop the timezone so both date columns are naive.
        merged_df["_incident_dt"] = pd.to_datetime(
            merged_df[incident_date_col], errors="coerce", utc=True
        ).dt.tz_convert(None)

    # Prefer the reports-side ("_rep") column, most specific date first.
    report_date_candidates = [
        "date_published_rep", "date_published",
        "date_submitted_rep", "date_submitted",
        "date_modified_rep", "date_modified",
        "date_downloaded_rep", "date_downloaded",
    ]
    lag_report_date_col = next((c for c in report_date_candidates if c in merged_df.columns), None)
    print("Lag proxy report date column:", lag_report_date_col)

    if lag_report_date_col is None:
        print("No report date column found after merge. Date-like columns:", [c for c in merged_df.columns if "date" in c])
    elif incident_date_col is None:
        # Previously this path crashed: to_datetime(None) returns None, which has no `.dt`.
        print("No incident_date column found after merge; skipping lag proxy.")
    else:
        merged_df["_published_dt"] = pd.to_datetime(merged_df[lag_report_date_col], errors="coerce", utc=True).dt.tz_convert(None)
        # Rows where either date failed to parse yield NaN and are dropped.
        lag_days = (merged_df["_published_dt"] - merged_df["_incident_dt"]).dt.days.dropna()

        print("Lag sample size:", len(lag_days))
        display(lag_days.describe())

        if lag_days.empty:
            print("No rows with both dates available; skipping lag histogram.")
        else:
            fig, ax = plt.subplots(figsize=(8, 4.5))
            # Clip so a few extreme lags do not flatten the rest of the histogram.
            lag_days.clip(lower=-30, upper=365).plot(kind="hist", bins=40, ax=ax)
            ax.set_title("Reporting Lag Proxy (reports - incident) [URL-merged subset]")
            ax.set_xlabel("Days (clipped -30 to 365)")
            ax.set_ylabel("Count")
            fig.savefig(OUTPUT_PATH / "82_reporting_lag_proxy_hist.png", bbox_inches="tight", dpi=200)
            plt.show()
else:
    print("Missing url in submissions or reports; skipping lag proxy.")

## Notes and Limitations

- MIT timing interpretations depend on taxonomy annotation quality.
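- The lag is computed only for submissions whose URL matches a report (inner merge on `url`), using the first available report date column; treat it as a proxy, not a measured reporting delay.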
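- Defensive checks print a message and skip a step when the MIT table or an expected column is missing, so the notebook keeps running under schema variation; missing `reports` or `submissions` tables still raise `FileNotFoundError`.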