# 04 Technical Failure Analysis (GMF)

**Purpose**  
Analyze GMF technical failure labels and their co-occurrence with AI goals and technologies at a descriptive level.

**Outputs preserved:**
- `60_gmf_failures_top15.png`
- `61_gmf_failures_percent.png`

## Configuration

In [None]:
from pathlib import Path
import pandas as pd
from IPython.display import display

from src.notebook_utils import ensure_output_dir, load_data, pick_column_by_keywords, plot_barh_top, plot_percent_barh

# Resolve the project root: use the working directory, or fall back to its
# parent when only the parent contains a `data/` folder.
PROJECT_ROOT = Path.cwd()
if not (PROJECT_ROOT / "data").exists() and (PROJECT_ROOT.parent / "data").exists():
    PROJECT_ROOT = PROJECT_ROOT.parent
DATA_PATH = PROJECT_ROOT / "data"
# NOTE(review): ensure_output_dir presumably creates the folder if needed and
# returns the path -- confirm in src.notebook_utils.
OUTPUT_PATH = ensure_output_dir(PROJECT_ROOT / "outputs" / "figures")
TOP_N = 15  # passed as `top_n` to the top-N failure chart
DATE_CANDIDATES = ["date"]

# load_data returns a mapping keyed by table name; a value of None is treated
# below as "table could not be loaded".
loaded_tables = load_data(DATA_PATH, tables=["incidents", "gmf"])
incidents_df = loaded_tables["incidents"]
gmf_df = loaded_tables["gmf"]

# Both tables are required by the cells that follow, so fail fast.
if incidents_df is None or gmf_df is None:
    raise FileNotFoundError("Required tables missing: incidents.csv or classifications_GMF.csv.")

print("GMF:", gmf_df.shape)
print("Incidents:", incidents_df.shape)

# Locate the GMF columns by keyword; each result is a column name or None.
# NOTE(review): exact matching/ranking rules live in pick_column_by_keywords.
failure_col = pick_column_by_keywords(gmf_df, ["failure"], nice_to_have=["technical", "known", "ai"])
goal_col = pick_column_by_keywords(gmf_df, ["goal"], nice_to_have=["known", "ai"])
tech_col = pick_column_by_keywords(gmf_df, ["technology"], nice_to_have=["known", "ai"])

print("Chosen failure column:", failure_col)
print("Chosen goal column:", goal_col)
print("Chosen technology column:", tech_col)

if failure_col is None:
    # Case-insensitive search over stringified labels, so headers such as
    # "Technical Failure" are listed and non-string labels cannot raise.
    fail_like_columns = [c for c in gmf_df.columns if "fail" in str(c).lower()]
    print("No failure-like column found. Candidates containing 'fail':", fail_like_columns)
else:
    if gmf_df[failure_col].dropna().empty:
        print(f"Warning: {failure_col} exists but is entirely null in this snapshot.")
    # Quick sanity summary of the chosen column (nulls excluded).
    print("Distinct failure labels:", gmf_df[failure_col].nunique(dropna=True))
    print("Top 5 failure labels:", gmf_df[failure_col].dropna().astype(str).value_counts().head(5).to_dict())

## Failure Distributions

In [None]:
# Plot the distribution of GMF failure labels twice: once via plot_barh_top
# (limited by TOP_N) and once via plot_percent_barh. Both figures are written
# under OUTPUT_PATH.
if failure_col is None:
    print("Cannot proceed: GMF failure column not found.")
else:
    plot_barh_top(
        gmf_df[failure_col],
        # Derive the title from TOP_N so it always matches the `top_n` value
        # passed below instead of a hard-coded "15".
        f"GMF: Technical Failures (Top {TOP_N})",
        "Count",
        # File name intentionally fixed: it is listed as a preserved output.
        OUTPUT_PATH / "60_gmf_failures_top15.png",
        top_n=TOP_N,
    )
    plot_percent_barh(
        gmf_df[failure_col],
        "GMF: Technical Failures (% share)",
        OUTPUT_PATH / "61_gmf_failures_percent.png",
    )

## Co-occurrence Tables

In [None]:
# Failures by goal: percentage of each failure label within a goal, limited to
# the 8 most frequent goals and the first 10 failure columns.
if failure_col and goal_col:
    # Keep only rows where both labels are present, then compare as strings.
    failure_goal_df = gmf_df.dropna(subset=[failure_col, goal_col]).copy()
    failure_goal_df[failure_col] = failure_goal_df[failure_col].astype(str)
    failure_goal_df[goal_col] = failure_goal_df[goal_col].astype(str)

    top_goals = failure_goal_df[goal_col].value_counts().head(8).index
    failure_goal_df = failure_goal_df[failure_goal_df[goal_col].isin(top_goals)]

    # normalize="index" makes every goal row sum to 1; scale to percent.
    failure_by_goal_pct = (pd.crosstab(failure_goal_df[goal_col], failure_goal_df[failure_col], normalize="index") * 100).round(1)
    print("Failures by Goal (% within goal):")
    # Report an empty crosstab explicitly rather than rendering a blank table.
    if failure_by_goal_pct.shape[1] == 0:
        print("Crosstab produced no columns (likely sparse labels).")
    else:
        display(failure_by_goal_pct.iloc[:, :10])
else:
    print("Skipping Failures-by-Goal: missing goal or failure column.")

In [None]:
# Failures by technology: share of each failure label within a technology,
# restricted to the 8 most frequent technologies; at most 10 failure columns
# are displayed.
if not (failure_col and tech_col):
    print("Skipping Failures-by-Technology: missing technology or failure column.")
else:
    # Work on rows that carry both labels, with both labels cast to strings.
    failure_tech_df = gmf_df.dropna(subset=[failure_col, tech_col]).copy()
    for label_col in (failure_col, tech_col):
        failure_tech_df[label_col] = failure_tech_df[label_col].astype(str)

    # value_counts sorts by frequency, so the first 8 index entries are the
    # most common technologies.
    top_technologies = failure_tech_df[tech_col].value_counts().index[:8]
    in_top_technologies = failure_tech_df[tech_col].isin(top_technologies)
    failure_tech_df = failure_tech_df.loc[in_top_technologies]

    # Row-normalised crosstab (each technology row sums to 1), then percent.
    tech_failure_share = pd.crosstab(
        failure_tech_df[tech_col],
        failure_tech_df[failure_col],
        normalize="index",
    )
    failure_by_tech_pct = tech_failure_share.mul(100).round(1)

    print("Failures by Technology (% within technology):")
    if failure_by_tech_pct.columns.empty:
        print("Crosstab produced no columns (likely sparse labels).")
    else:
        # Positional slicing clamps to the available columns.
        display(failure_by_tech_pct.iloc[:, :10])

In [None]:
# Incident-level coverage: flag every incident that has at least one GMF row
# with a non-null failure label, then report the count and share.
if "incident_id" in gmf_df.columns and "incident_id" in incidents_df.columns:
    gmf_incidents = gmf_df.dropna(subset=["incident_id"]).copy()
    # Without a failure column no record can carry a label, so flag all False.
    gmf_incidents["has_failure_label"] = gmf_incidents[failure_col].notna() if failure_col else False

    # max() over booleans acts as "any": True if any GMF row of the incident
    # has a failure label.
    incident_failure_flags = (
        gmf_incidents.groupby("incident_id")["has_failure_label"]
        .max()
        .reset_index()
        .rename(columns={"has_failure_label": "gmf_has_failure"})
    )

    incident_core_df = incidents_df[["incident_id"]].drop_duplicates().merge(incident_failure_flags, on="incident_id", how="left")
    # The left merge leaves NaN for incidents without GMF rows, which turns the
    # column into object dtype. Comparing against True maps NaN to False and
    # yields a real bool column without relying on fillna's implicit
    # object-to-bool downcasting (deprecated in pandas 2.2).
    incident_core_df["gmf_has_failure"] = incident_core_df["gmf_has_failure"].eq(True)

    print("Incidents with GMF failure labels:", incident_core_df["gmf_has_failure"].sum())
    print("Share of incidents with GMF failure labels:", f"{incident_core_df['gmf_has_failure'].mean():.1%}")
else:
    print("Missing incident_id in GMF or incidents table; cannot compute incident-level tagging.")

## Interpretation and Responsible Notes

- GMF coverage is a subset of incidents, so failure distributions apply to labeled records only.
- Co-occurrence tables are descriptive and do not imply causality.
- Defensive guards preserve deterministic execution when optional columns are missing.