# Step 9 — Clinical Risk Model (Decision Tree)

This notebook builds a **clinical risk model** to predict which
medication error events are most likely to result in a
**Critical/Severe** outcome.

Goals:

- Define a binary target (`Is_Critical`) from the free-text Outcome.
- Use clinically meaningful features:
  - Certificate / Source
  - Branch (Air vs Ground)
  - Grouped high-risk medications (the 10 most frequent values of `Medication 1`; all others grouped as `Other`)
  - Pattern flags (dosing error, wrong med, protocol error)
- Train a **DecisionTreeClassifier** with class balancing.
- Report Accuracy, Recall for critical events, and Feature Importance.

This mirrors the modeling step from the loan assignment but focuses on
**patient harm risk** instead of product uptake.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

# ---------------------------------------------------------
# 1. Load the Medication data
# ---------------------------------------------------------
# Preferred source is the 'Medication' sheet of the Excel workbook; if the
# workbook file is not found, fall back to the CSV file of that sheet.
WORKBOOK_PATH = 'Krista 240726 Final.xlsx'
MEDICATION_CSV_PATH = 'Krista 240726 Final.xlsx - Medication.csv'

try:
    df_model = pd.read_excel(WORKBOOK_PATH, sheet_name='Medication')
except FileNotFoundError:
    df_model = pd.read_csv(MEDICATION_CSV_PATH)

print("Medication sheet shape:", df_model.shape)

# ---------------------------------------------------------
# 2. Define the target: Is_Critical (1) vs Non-Critical (0)
# ---------------------------------------------------------
def categorize_severity_binary(text):
    """Classify a free-text outcome as critical (1) or non-critical (0).

    The value is converted with ``str()`` and lower-cased, then scanned for
    any of a fixed list of severe-outcome keywords. Matching is by plain
    substring, so e.g. "seizures" matches the keyword "seizure". Values with
    no keyword (including missing values, which stringify to text such as
    "nan" or "none") are classified as non-critical.
    """
    lowered = str(text).lower()
    severe_keywords = (
        'died', 'death', 'expired', 'cpr',
        'arrest', 'hypoxia', 'intubated', 'seizure',
    )
    for keyword in severe_keywords:
        if keyword in lowered:
            return 1  # Critical outcome
    return 0  # No harm / stable / monitoring

# Label every event from its free-text 'Outcome' value: 1 = critical, 0 = not.
df_model['Is_Critical'] = df_model['Outcome'].map(categorize_severity_binary)

# Show how many events fall into each class.
print(
    "Target distribution (Is_Critical):",
    df_model['Is_Critical'].value_counts(),
    sep="\n",
)

# ---------------------------------------------------------
# 3. Ensure engineered flags exist (from earlier feature notebook)
# ---------------------------------------------------------
# Each flag is a 0/1 column recording whether 'Pattern Specifics' matches the
# given regex (case-insensitive; missing text counts as no match). A flag
# column that is already present is left untouched.
flag_patterns = {
    'Flag_Dosing_Error': r"dosing|max dose|volume|overdose|underdose",
    'Flag_Wrong_Med': r"wrong med|wrong medication|instead of|incorrect medication",
    'Flag_Protocol_Error': r"protocol|checklist|policy|procedure",
}

for flag_name, flag_regex in flag_patterns.items():
    if flag_name in df_model.columns:
        continue
    df_model[flag_name] = (
        df_model['Pattern Specifics']
        .str.contains(flag_regex, case=False, na=False)
        .astype(int)
    )

# ---------------------------------------------------------
# 4. Medication grouping and feature selection
# ---------------------------------------------------------
# Keep the ten most frequent values of 'Medication 1' as their own category
# and collapse every other value into 'Other'.
top_10_meds = df_model['Medication 1'].value_counts().nlargest(10).index


def _group_medication(med):
    """Return the medication unchanged if it is in the top 10, else 'Other'."""
    return med if med in top_10_meds else 'Other'


df_model['Med_Grouped'] = df_model['Medication 1'].apply(_group_medication)

# Categorical inputs:
#   Source      - Certificate (AEL, GFL, MTC, REACH, AMR, etc.)
#   Branch      - Air vs Ground
#   Med_Grouped - top-10 medication or 'Other'
categorical_cols = ['Source', 'Branch', 'Med_Grouped']
# Binary pattern flags (already 0/1, no encoding needed).
flag_cols = ['Flag_Dosing_Error', 'Flag_Wrong_Med', 'Flag_Protocol_Error']

feature_cols = categorical_cols + flag_cols

X = df_model[feature_cols].copy()
y = df_model['Is_Critical']

# One-hot encode the categorical features, dropping the first level of each.
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

print("Feature matrix shape after encoding:", X.shape)

# ---------------------------------------------------------
# 5. Train/test split (stratified on the rare critical events)
# ---------------------------------------------------------
# 70/30 split with a fixed seed; stratifying on y keeps the share of critical
# events the same in both partitions.
split_options = {
    'test_size': 0.30,
    'random_state': 42,
    'stratify': y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training Data: {X_train.shape[0]} events")
print(f"Testing Data:  {X_test.shape[0]} events")
print(f"Critical Events in Train Set: {y_train.sum()} ({y_train.mean():.1%} of train)")


In [None]:
# ---------------------------------------------------------
# 6. Train Decision Tree model with class balancing
# ---------------------------------------------------------
# class_weight='balanced' weights classes inversely to their frequency in the
# training labels; the fixed random_state keeps the fitted tree reproducible.
tree_params = {'random_state': 42, 'class_weight': 'balanced'}

# fit() returns the estimator itself, so construction and training can chain.
risk_model = DecisionTreeClassifier(**tree_params).fit(X_train, y_train)

# ---------------------------------------------------------
# 7. Evaluate model performance
# ---------------------------------------------------------
# Predict on the held-out test split; class 1 (critical) is the positive label.
y_pred = risk_model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
# zero_division=0 makes recall report 0 (without a warning) when the test
# split contains no critical events.
rec = recall_score(y_test, y_pred, zero_division=0)
# Pin the label order so the matrix is always 2x2 and lines up with the
# [[TN FP], [FN TP]] legend printed below, even when neither y_test nor
# y_pred contains a critical event (otherwise the matrix collapses to 1x1).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

print("--- Model Performance ---")
print(f"Overall Accuracy: {acc:.2%}")
print(f"Critical Event Capture Rate (Recall): {rec:.2%}")

print("\nConfusion Matrix [ [TN FP], [FN TP] ]:")
print(cm)

# ---------------------------------------------------------
# 8. Feature importance (drivers of risk)
# ---------------------------------------------------------
# Pair each importance score from the fitted tree with its encoded column
# name, ordered from most to least important.
feat_importances = (
    pd.Series(risk_model.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)

print("\n--- Top 10 Feature Importances ---")
print(feat_importances.head(10))
