# 03 Descriptive Statistics

**Purpose**  
Generate reproducible descriptive summaries for MIT, GMF, CSET, and report-corpus metadata while preserving original figure outputs.

**Outputs preserved:** MIT/GMF/CSET distributions and report metadata plots in `../outputs/figures/`.

## Configuration

In [None]:
import sys
from pathlib import Path
import pandas as pd
from IPython.display import display

# Resolve the project root: the first directory among the working directory
# and its ancestors (nearest first) that contains "src"; if none does, fall
# back to the working directory itself.
_start_dir = Path.cwd()
PROJECT_ROOT = next(
    (folder for folder in (_start_dir, *_start_dir.parents) if (folder / "src").exists()),
    _start_dir,
)

# Put the root at the front of sys.path (once) so `src.*` imports resolve.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

from src.notebook_utils import (
    ensure_output_dir,
    load_data,
    normalize_incident_id,
    pick_column_by_keywords,
    plot_barh_top,
    plot_percent_barh,
)

# --- Paths and analysis constants ---
DATA_PATH = PROJECT_ROOT / "data"
OUTPUT_PATH = ensure_output_dir(PROJECT_ROOT / "outputs" / "figures")
TOP_N = 15  # passed as top_n to every "top" bar chart in this notebook
DATE_CANDIDATES = ["date_published", "date_submitted", "date_modified", "date_downloaded"]
# Report columns requested from the loader: four metadata fields plus the date fields.
REPORT_COLUMNS = ["url", "source_domain", "language", "tags", *DATE_CANDIDATES]

# --- Load tables ---
_required_tables = ["incidents", "reports"]
_taxonomy_tables = ["mit", "gmf", "cset"]

loaded_tables = load_data(
    DATA_PATH,
    tables=[*_required_tables, *_taxonomy_tables],
    reports_usecols=REPORT_COLUMNS,
)

incidents_df, reports_df = (loaded_tables[table_name] for table_name in _required_tables)

# Taxonomy tables are optional; later cells test each frame against None.
# NOTE(review): this relies on normalize_incident_id passing None through
# unchanged for a missing table -- confirm in src.notebook_utils.
mit_df, gmf_df, cset_df = (
    normalize_incident_id(loaded_tables[table_name]) for table_name in _taxonomy_tables
)

# The notebook cannot proceed without both core tables.
if any(frame is None for frame in (incidents_df, reports_df)):
    raise FileNotFoundError("Required tables missing: incidents.csv or reports.csv.")

# --- Shape summary ---
print("Incidents:", incidents_df.shape)
print("Reports (selected cols):", reports_df.shape, "cols:", reports_df.columns.tolist())
for _label, _frame in (("MIT", mit_df), ("GMF", gmf_df), ("CSET", cset_df)):
    print(f"{_label}:", _frame.shape if _frame is not None else None)

## MIT Distributions

In [None]:
# MIT taxonomy distributions: one top-N count chart per column, followed by
# percentage charts for a subset of those columns.
# NOTE: chart titles and file names spell out "15"; keep them in sync with
# TOP_N if that constant ever changes.
if mit_df is not None:
    mit_count_charts = {
        "risk_domain": "20_mit_risk_domain_top15.png",
        "intent": "21_mit_intent_top15.png",
        "entity": "22_mit_entity_top15.png",
        "timing": "23_mit_timing_top15.png",
    }
    for mit_column, file_name in mit_count_charts.items():
        if mit_column not in mit_df.columns:
            print("MIT missing column:", mit_column)
            continue
        readable_name = mit_column.replace("_", " ").title()
        plot_barh_top(
            mit_df[mit_column],
            title=f"MIT: {readable_name} (Top 15)",
            xlabel="Count",
            output_file=OUTPUT_PATH / file_name,
            top_n=TOP_N,
        )

    # Missing columns were already reported by the loop above, so the
    # percentage charts skip them without a second message.
    mit_percent_charts = {
        "intent": "24_mit_intent_percent.png",
        "entity": "25_mit_entity_percent.png",
        "timing": "26_mit_timing_percent.png",
    }
    for mit_column, file_name in mit_percent_charts.items():
        if mit_column in mit_df.columns:
            plot_percent_barh(
                mit_df[mit_column],
                title=f"MIT: {mit_column.title()} Distribution (%)",
                output_file=OUTPUT_PATH / file_name,
            )
else:
    print("MIT classifications not found.")

In [None]:
# Cross-tabulate MIT risk domain against intent and entity. Values are row
# percentages (each domain sums to ~100), limited to the 12 most frequent
# risk domains.
if mit_df is None or "risk_domain" not in mit_df.columns:
    print("MIT risk_domain not found; skipping cross-tabs.")
else:
    domain_counts = mit_df["risk_domain"].dropna().astype(str).value_counts()
    top_domains = domain_counts.head(12).index

    if "intent" in mit_df.columns:
        # Keep rows where both fields are present, then restrict to top domains.
        intent_rows = mit_df.dropna(subset=["risk_domain", "intent"]).copy()
        intent_in_top = intent_rows["risk_domain"].astype(str).isin(top_domains)
        mit_intent_df = intent_rows[intent_in_top]
        intent_share = pd.crosstab(
            mit_intent_df["risk_domain"],
            mit_intent_df["intent"],
            normalize="index",
        )
        intent_crosstab_pct = (intent_share * 100).round(1)
        print("MIT: Risk Domain x Intent (% within domain)")
        display(intent_crosstab_pct)

    if "entity" in mit_df.columns:
        # Same procedure for the entity dimension.
        entity_rows = mit_df.dropna(subset=["risk_domain", "entity"]).copy()
        entity_in_top = entity_rows["risk_domain"].astype(str).isin(top_domains)
        mit_entity_df = entity_rows[entity_in_top]
        entity_share = pd.crosstab(
            mit_entity_df["risk_domain"],
            mit_entity_df["entity"],
            normalize="index",
        )
        entity_crosstab_pct = (entity_share * 100).round(1)
        print("MIT: Risk Domain x Entity (% within domain)")
        display(entity_crosstab_pct)

## GMF and CSET Distributions

In [None]:
# GMF distributions: select the goal / technology / failure columns by
# keyword, report which columns were chosen, then chart each one found.
if gmf_df is not None:
    goal_col = pick_column_by_keywords(gmf_df, ["goal"], nice_to_have=["known", "ai"])
    tech_col = pick_column_by_keywords(gmf_df, ["technology"], nice_to_have=["known", "ai"])
    failure_col = pick_column_by_keywords(gmf_df, ["failure"], nice_to_have=["technical", "known", "ai"])

    for gmf_label, chosen_col in (("goal", goal_col), ("tech", tech_col), ("failure", failure_col)):
        print(f"Chosen GMF {gmf_label} column:", chosen_col)

    # (selected column, chart title, output file name); a falsy column means
    # the picker returned nothing usable, so that chart is skipped.
    gmf_charts = (
        (goal_col, "GMF: Known AI Goal (Top 15)", "40_gmf_goal_top15.png"),
        (tech_col, "GMF: Known AI Technology (Top 15)", "41_gmf_tech_top15.png"),
        (failure_col, "GMF: Known AI Technical Failure (Top 15)", "42_gmf_failure_top15.png"),
    )
    for chosen_col, chart_title, file_name in gmf_charts:
        if chosen_col:
            plot_barh_top(gmf_df[chosen_col], chart_title, "Count", OUTPUT_PATH / file_name, top_n=TOP_N)
else:
    print("GMF classifications not found.")

# CSET distributions: chart every candidate column that exists in the table;
# if none exists, list the available columns to help diagnose the mismatch.
if cset_df is not None:
    cset_candidates = [
        "harm_distribution_basis",
        "sector_of_deployment",
        "harm_basis",
        "sector",
        "protected_class",
        "basis",
    ]

    available_cset_columns = [name for name in cset_candidates if name in cset_df.columns]
    found_any = bool(available_cset_columns)

    for column_name in available_cset_columns:
        readable_name = column_name.replace("_", " ").title()
        plot_barh_top(
            cset_df[column_name],
            title=f"CSET: {readable_name} (Top 15)",
            xlabel="Count",
            output_file=OUTPUT_PATH / f"30_cset_{column_name}_top15.png",
            top_n=TOP_N,
        )

    if not found_any:
        print("No expected CSET columns found in candidates list.")
        print("CSET columns:", cset_df.columns.tolist())
else:
    print("CSET classifications not found.")

## Report Corpus Metadata

In [None]:
# Report-corpus metadata charts: (column, chart title, output file name).
# A column that is absent from reports_df is skipped silently.
report_charts = (
    ("source_domain", "Report Corpus: Top Source Domains (Top 15)", "50_reports_top_source_domains.png"),
    ("language", "Report Corpus: Languages (Top 15)", "51_reports_languages.png"),
)
for report_column, chart_title, file_name in report_charts:
    if report_column not in reports_df.columns:
        continue
    plot_barh_top(
        reports_df[report_column],
        chart_title,
        "Report count",
        OUTPUT_PATH / file_name,
        top_n=TOP_N,
    )

# Tag frequencies: split each comma-separated tag string into individual
# tags and chart the most common ones.
if "tags" in reports_df.columns:
    tags_series = reports_df["tags"].dropna().astype(str)
    typical_tag_length = tags_series.str.len().median()

    # WHY: exploding unstructured long text can create large memory spikes.
    tags_look_like_free_text = typical_tag_length > 200
    if tags_look_like_free_text:
        print("Tags appear text-heavy (median length > 200). Skipping explode-based tag analysis.")
    else:
        tag_lists = tags_series.str.split(",")
        # NOTE(review): a trailing or doubled comma leaves an empty-string
        # tag after stripping, and it is counted like any other value --
        # confirm whether that is intended.
        exploded_tags = tag_lists.explode().str.strip()
        plot_barh_top(
            exploded_tags,
            "Report Corpus: Tags (Top 15)",
            "Count",
            OUTPUT_PATH / "52_reports_tags.png",
            top_n=TOP_N,
        )

## Interpretation and Limitations

- All distributions are descriptive only: the figures reproduce the original outputs exactly, and nothing here should be read as causal evidence.
- Defensive checks prevent failures when optional taxonomy columns are absent.
- Report metadata loading intentionally excludes heavy text to keep execution reliable on large snapshots.