# 05 Misuse vs Systemic Failure (MIT x GMF)

**Purpose**  
Join the MIT and GMF classification tables at the incident level (on `incident_id`) to compare intent distributions and failure co-occurrence patterns.

**Outputs preserved:**
- `70_intent_among_gmf.png`
- `71_failure_by_intent_pct.csv`

## Configuration

In [None]:
from pathlib import Path
import pandas as pd
from IPython.display import display

from src.notebook_utils import ensure_output_dir, load_data, pick_column_by_keywords

# --- Paths -----------------------------------------------------------------
# Treat the working directory as the project root, unless it has no data/
# folder while its parent does; in that case step up one level.
PROJECT_ROOT = Path.cwd()
if not (PROJECT_ROOT / "data").exists():
    if (PROJECT_ROOT.parent / "data").exists():
        PROJECT_ROOT = PROJECT_ROOT.parent
DATA_PATH = PROJECT_ROOT / "data"
OUTPUT_PATH = ensure_output_dir(PROJECT_ROOT / "outputs" / "figures")

# --- Constants -------------------------------------------------------------
TOP_N = 15
DATE_CANDIDATES = ["date"]

# --- Tables ----------------------------------------------------------------
loaded_tables = load_data(DATA_PATH, tables=["incidents", "mit", "gmf"])
incidents_df, mit_df, gmf_df = (
    loaded_tables["incidents"],
    loaded_tables["mit"],
    loaded_tables["gmf"],
)

# A table that comes back as None is treated as missing; all three are required.
if any(table is None for table in (incidents_df, mit_df, gmf_df)):
    raise FileNotFoundError("Required tables missing: incidents.csv, classifications_MIT.csv, or classifications_GMF.csv.")

# --- Column detection ------------------------------------------------------
# MIT columns are matched by exact name; each falls back to None when absent
# so later cells can skip the analyses that need it.
intent_col, domain_col = (
    name if name in mit_df.columns else None
    for name in ("intent", "risk_domain")
)
# NOTE(review): presumably selects a GMF column whose name matches "failure",
# preferring the nice_to_have keywords -- confirm against notebook_utils.
failure_col = pick_column_by_keywords(gmf_df, ["failure"], nice_to_have=["technical", "known", "ai"])

# --- Summary ---------------------------------------------------------------
for label, table in (("Incidents:", incidents_df), ("MIT:", mit_df), ("GMF:", gmf_df)):
    print(label, table.shape)
for label, column in (
    ("MIT intent column:", intent_col),
    ("MIT domain column:", domain_col),
    ("GMF failure column:", failure_col),
):
    print(label, column)

## Incident-level Join

In [None]:
# Reduce each taxonomy table to its distinct (incident, label...) rows, then
# keep only the incidents that appear in both tables.
join_key = "incident_id"
if join_key in mit_df.columns and join_key in gmf_df.columns:
    # Only carry the label columns that were actually detected.
    mit_keep = [join_key] + [col for col in (intent_col, domain_col) if col]
    gmf_keep = [join_key] + [col for col in (failure_col,) if col]

    mit_incident_df = mit_df[mit_keep].drop_duplicates()
    gmf_incident_df = gmf_df[gmf_keep].drop_duplicates()

    # Inner join: incidents labeled by only one taxonomy are dropped.
    joined_df = mit_incident_df.merge(gmf_incident_df, how="inner", on=join_key)
    print("Joined MIT x GMF incidents:", joined_df.shape)
    display(joined_df.head())
else:
    print("Missing incident_id in MIT/GMF; cannot join.")

## Intent and Failure Mix

In [None]:
import matplotlib.pyplot as plt

# Plot and tabulate how MIT intent labels are distributed across incidents
# that also carry a GMF label. Skipped when the join or the intent column
# is unavailable.
if "joined_df" in globals() and intent_col:
    # joined_df can hold several rows for one incident (one per distinct
    # intent/domain/failure combination produced by the merge). Counting rows
    # directly would overcount such incidents, so collapse to unique
    # (incident, intent) pairs first -- the x-axis is labeled "Incident count".
    intent_per_incident = joined_df[["incident_id", intent_col]].dropna(subset=[intent_col]).copy()
    intent_per_incident[intent_col] = intent_per_incident[intent_col].astype(str)
    intent_per_incident = intent_per_incident.drop_duplicates()

    intent_counts = intent_per_incident[intent_col].value_counts()

    if intent_counts.empty:
        # Nothing to draw; plotting an empty Series can raise in pandas.
        print("No non-null intent labels among GMF-labeled incidents; skipping plot.")
    else:
        fig, ax = plt.subplots(figsize=(8, 4.5))
        # Ascending sort puts the most frequent intent at the top of the barh chart.
        intent_counts.sort_values().plot(kind="barh", ax=ax)
        ax.set_title("MIT Intent distribution among GMF-labeled incidents")
        ax.set_xlabel("Incident count")
        fig.savefig(OUTPUT_PATH / "70_intent_among_gmf.png", bbox_inches="tight", dpi=200)
        plt.show()

        intent_pct = (intent_counts / intent_counts.sum() * 100).round(1)
        print("Intent % among GMF-labeled incidents:")
        display(intent_pct)

In [None]:
# Cross-tabulate GMF failure labels against MIT intent and export the
# within-intent percentages. Skipped when the join or either label column
# is unavailable.
if "joined_df" in globals() and intent_col and failure_col:
    intent_failure_df = joined_df.dropna(subset=[intent_col, failure_col]).copy()
    intent_failure_df[intent_col] = intent_failure_df[intent_col].astype(str)
    intent_failure_df[failure_col] = intent_failure_df[failure_col].astype(str)

    # joined_df may repeat the same (incident, intent, failure) combination on
    # several rows when other joined columns (e.g. the MIT domain) differ.
    # Keep each combination once so every incident contributes a single
    # co-occurrence to the ranking and to the percentages below.
    intent_failure_df = intent_failure_df.drop_duplicates(subset=["incident_id", intent_col, failure_col])

    # Restrict to the 12 most frequent failure labels to keep the table readable.
    top_failures = intent_failure_df[failure_col].value_counts().head(12).index
    intent_failure_df = intent_failure_df[intent_failure_df[failure_col].isin(top_failures)]

    # normalize="index" makes each intent row sum to ~100% (over the retained labels).
    failure_by_intent_pct = (pd.crosstab(intent_failure_df[intent_col], intent_failure_df[failure_col], normalize="index") * 100).round(1)
    print("Failure type mix by intent (% within intent):")
    display(failure_by_intent_pct)

    failure_by_intent_pct.to_csv(OUTPUT_PATH / "71_failure_by_intent_pct.csv")

## Interpretation and Limitations

- Analysis is restricted to incidents labeled by both MIT and GMF (inner join on `incident_id`); incidents labeled by only one taxonomy are excluded.
- Intent labels come from MIT taxonomy; failure labels come from GMF taxonomy.
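- Crosstab results are descriptive co-occurrence patterns, not causal claims.
- The failure-by-intent table covers only the 12 most frequent failure labels, so its percentages are shares within that subset rather than across all failure labels.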