# Step 8 — Bivariate Analysis: Outcome Severity

This notebook investigates relationships between key variables and the
clinical **Outcome** field, converted into a structured severity
category.

Goals:

- Define a **Severity Category** based on Outcome text.
- Analyze **Certificate (Source) vs. Severity**.
- Analyze **Top 10 Medications vs. Severity**.
- Check whether engineered flags (dosing, wrong med, protocol) are
  associated with critical outcomes.

This mirrors the bivariate step of the loan project, but the target is
clinical **harm / severity**, not product acceptance.

In [None]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set_style("whitegrid")

# ---------------------------------------------------------
# 1. Load data and define Severity Category
# ---------------------------------------------------------
# Prefer the 'Medication' sheet of the Excel workbook; if the workbook file
# is not found, read the CSV file instead (its name suggests it is an export
# of that same sheet -- confirm before relying on it).
WORKBOOK_PATH = 'Krista 240726 Final.xlsx'
CSV_FALLBACK_PATH = 'Krista 240726 Final.xlsx - Medication.csv'

try:
    df_bi = pd.read_excel(WORKBOOK_PATH, sheet_name='Medication')
except FileNotFoundError:
    df_bi = pd.read_csv(CSV_FALLBACK_PATH)

# Severe keywords are matched as plain substrings so that inflected or
# prefixed forms ("seizures", "reintubated") still count as severe.
_SEVERE_TERMS = (
    'died', 'death', 'expired', 'cpr', 'arrest', 'hypoxia', 'intubated', 'seizure',
)

# Stable / no-harm keywords must not be directly preceded by a letter, so
# "unstable", "abnormal" and "unresolved" are not mistaken for reassuring
# text. Multi-word negations such as "not stable" are still NOT detected.
_STABLE_PATTERN = re.compile(
    r"(?<![a-z])(?:stable|no adverse|resolved|prevented|normal|unchanged)"
)

# Lower-cased string forms that missing values take after str(): blank cells,
# float NaN ('nan') and pandas.NA ('<na>').
_MISSING_TEXT = ('', 'nan', '<na>')


def categorize_severity(text):
    """Map a free-text outcome description to a severity label.

    Matching is case-insensitive and rules are applied in priority order:

    1. Missing input (``None``, NaN, ``pd.NA``, blank) -> ``'Unknown'``.
    2. Any severe keyword present -> ``'Critical/Severe'``.
    3. Any stable keyword present (not as the tail of a longer word such as
       "unstable") -> ``'No Harm/Stable'``.
    4. Text containing "not documented" -> ``'Unknown'``.
    5. Anything else -> ``'Monitor/Intervention'``.

    Args:
        text: Outcome value for one row; any object is accepted and is
            converted with ``str()``.

    Returns:
        One of ``'Critical/Severe'``, ``'No Harm/Stable'``, ``'Unknown'`` or
        ``'Monitor/Intervention'``.
    """
    if text is None:
        return 'Unknown'

    text = str(text).strip().lower()
    if text in _MISSING_TEXT:
        return 'Unknown'

    # Severe wins over stable when both kinds of keyword appear.
    if any(term in text for term in _SEVERE_TERMS):
        return 'Critical/Severe'
    if _STABLE_PATTERN.search(text):
        return 'No Harm/Stable'
    if 'not documented' in text:
        return 'Unknown'
    return 'Monitor/Intervention'

# Label every row by running the classifier over its Outcome value, then
# show how many rows landed in each severity category.
outcome_values = df_bi['Outcome']
df_bi['Severity_Category'] = outcome_values.apply(categorize_severity)

severity_counts = df_bi['Severity_Category'].value_counts()
print("--- Target Variable Distribution (Severity) ---")
print(severity_counts)

# ---------------------------------------------------------
# 2. Certificate (Source) vs. Severity (Heatmap)
# ---------------------------------------------------------
# Cross-tabulate Source against severity category and draw the resulting
# count table as an annotated heatmap.
fig, ax = plt.subplots(figsize=(12, 6))
source_outcome = pd.crosstab(df_bi['Source'], df_bi['Severity_Category'])
sns.heatmap(
    source_outcome,
    annot=True,
    fmt='d',  # cells are integer counts
    cmap='YlOrRd',
    linewidths=.5,
    ax=ax,
)
ax.set_title('Certificate (Source) vs. Outcome Severity')
ax.set_ylabel('Certificate')
ax.set_xlabel('Outcome Severity')
fig.tight_layout()
plt.show()

# ---------------------------------------------------------
# 3. Top 10 Medications vs. Severity (Clustered Bar Chart)
# ---------------------------------------------------------
# Keep only rows whose 'Medication 1' is among the ten most frequent values.
top_meds_list = df_bi['Medication 1'].value_counts().head(10).index
is_top_med = df_bi['Medication 1'].isin(top_meds_list)
df_top_meds = df_bi[is_top_med]

# One horizontal bar group per medication (most frequent first), split by
# severity category.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=df_top_meds,
    y='Medication 1',
    hue='Severity_Category',
    order=top_meds_list,
    palette='magma',
    ax=ax,
)
ax.set_title('Top 10 Medications by Outcome Severity')
ax.set_xlabel('Count of Events')
ax.set_ylabel('Medication')
ax.legend(title='Severity', loc='lower right')
fig.tight_layout()
plt.show()

# ---------------------------------------------------------
# 4. Pattern Flags vs. Severity (Severe outcome rate)
# ---------------------------------------------------------
# Binary target: 1 = Critical/Severe, 0 = Other
df_bi['Is_Critical'] = (df_bi['Severity_Category'] == 'Critical/Severe').astype(int)

# Case-insensitive keyword patterns used to rebuild a flag from
# 'Pattern Specifics' when the engineered column is not already present.
flag_patterns = {
    'Flag_Dosing_Error': r"dosing|max dose|volume|overdose|underdose",
    'Flag_Wrong_Med': r"wrong med|wrong medication|instead of|incorrect medication",
    'Flag_Protocol_Error': r"protocol|checklist|policy|procedure",
}

# Ensure engineered flags exist; if not, recreate simple versions.
# Existing columns are left untouched; na=False makes missing text count as 0.
for flag, pattern in flag_patterns.items():
    if flag not in df_bi.columns:
        df_bi[flag] = df_bi['Pattern Specifics'].str.contains(
            pattern,
            case=False, na=False
        ).astype(int)

flags = list(flag_patterns)

print("\n--- Severe Outcome Rate by Error Type ---")
for flag in flags:
    flagged_outcomes = df_bi.loc[df_bi[flag] == 1, 'Is_Critical']
    if flagged_outcomes.empty:
        # The mean of an empty selection is NaN, which would otherwise be
        # printed as a meaningless "nan%".
        print(f"{flag}: no flagged events")
        continue
    rate = flagged_outcomes.mean() * 100
    print(f"{flag}: {rate:.1f}% of flagged events are Critical/Severe")
