In [1]:
%pip install --quiet pandas scikit-learn mlxtend matplotlib


Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

DATA_PATH = Path(r"C:\Users\sarve\Downloads\Final_Project_hospital_app\synthetic_health_risk_prediction\Health_Risk_Dataset.csv")

df = pd.read_csv(DATA_PATH)

# Standardize column names
df.columns = [c.strip().replace(" ", "_") for c in df.columns]

# Quick type fixes if present
if "On_Oxygen" in df.columns:
    df["On_Oxygen"] = df["On_Oxygen"].astype(str).str.strip().str.lower().map({"1":1,"0":0,"true":1,"false":0,"yes":1,"no":0}).fillna(df["On_Oxygen"]).astype("int", errors="ignore")
if "Consciousness" in df.columns:
    df["Consciousness"] = df["Consciousness"].astype(str).str.strip().str.upper()

# Clip impossible values (light sanity checks)
if "Oxygen_Saturation" in df.columns:
    df["Oxygen_Saturation"] = df["Oxygen_Saturation"].clip(lower=50, upper=100)
if "Temperature" in df.columns:
    df["Temperature"] = df["Temperature"].clip(lower=30, upper=43)

print(df.shape)
df.head()


(1000, 10)


Unnamed: 0,Patient_ID,Respiratory_Rate,Oxygen_Saturation,O2_Scale,Systolic_BP,Heart_Rate,Temperature,Consciousness,On_Oxygen,Risk_Level
0,P0522,25,96,1,97,107,37.5,A,0,Medium
1,P0738,28,92,2,116,151,38.5,P,1,High
2,P0741,29,91,1,79,135,38.4,A,0,High
3,P0661,24,96,1,95,92,37.3,A,0,Medium
4,P0412,20,96,1,97,97,37.4,A,0,Low


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
import numpy as np

assert "Risk_Level" in df.columns, "Expected a 'Risk_Level' column."

target_col = "Risk_Level"
X = df.drop(columns=[target_col])
y = df[target_col].astype(str)

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

pre = ColumnTransformer([
    ("num", Pipeline([("imp", SimpleImputer(strategy="median")), ("sc", StandardScaler())]), num_cols),
    ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                      ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))]), cat_cols)
])

# Strong baseline that handles nonlinearity well:
clf = Pipeline([
    ("pre", pre),
    ("hgb", HistGradientBoostingClassifier(random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc  = accuracy_score(y_test, y_pred)
mf1  = f1_score(y_test, y_pred, average="macro")
print(f"Multiclass Risk Stratification — Accuracy: {acc:.3f} | Macro-F1: {mf1:.3f}")
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, digits=3))

# OPTIONAL: High vs Not-High binary for triage
if set(y.unique()) >= {"High"}:
    y_bin = (y == "High").astype(int)
    Xtr, Xte, ytr, yte = train_test_split(X, y_bin, stratify=y_bin, test_size=0.2, random_state=42)
    bin_clf = Pipeline([
        ("pre", pre),
        ("lr", LogisticRegression(max_iter=2000, class_weight="balanced", solver="liblinear"))
    ])
    bin_clf.fit(Xtr, ytr)
    yhat = bin_clf.predict(Xte)
    acc_b = accuracy_score(yte, yhat)
    f1_b  = f1_score(yte, yhat)
    print(f"\nBinary (High vs Not) — Accuracy: {acc_b:.3f} | F1: {f1_b:.3f}")
    print("Confusion matrix (labels=[NotHigh(0), High(1)]):\n", confusion_matrix(yte, yhat))


Multiclass Risk Stratification — Accuracy: 0.975 | Macro-F1: 0.976

Confusion matrix:
 [[55  0  1  0]
 [ 0 50  1  0]
 [ 0  2 59  0]
 [ 0  1  0 31]]

Classification report:
               precision    recall  f1-score   support

        High      1.000     0.982     0.991        56
         Low      0.943     0.980     0.962        51
      Medium      0.967     0.967     0.967        61
      Normal      1.000     0.969     0.984        32

    accuracy                          0.975       200
   macro avg      0.978     0.975     0.976       200
weighted avg      0.976     0.975     0.975       200


Binary (High vs Not) — Accuracy: 0.995 | F1: 0.991
Confusion matrix (labels=[NotHigh(0), High(1)]):
 [[144   0]
 [  1  55]]


In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Choose numeric clinical features for clustering
clus_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c != "Risk_Level"]
Xc = df[clus_cols].copy()

# Impute simple
Xc = Xc.fillna(Xc.median(numeric_only=True))
Xc_sc = StandardScaler().fit_transform(Xc)

# Try k=3..5 and pick the best silhouette
best = {}
for k in [3,4,5]:
    km = KMeans(n_clusters=k, random_state=42, n_init="auto")
    labs = km.fit_predict(Xc_sc)
    sil = silhouette_score(Xc_sc, labs)
    best[k] = (sil, km, labs)
best_k = max(best, key=lambda k: best[k][0])
sil, km, labs = best[best_k]

print(f"Chosen k={best_k} with silhouette={sil:.3f}")
df["cluster_kmeans"] = labs

# Cluster profiles
profile = df.groupby("cluster_kmeans")[clus_cols].mean().round(2)
if "Risk_Level" in df.columns:
    profile["pct_High"] = (df.groupby("cluster_kmeans")["Risk_Level"]
                             .apply(lambda s: (s=="High").mean()*100)).round(1)
profile


Chosen k=3 with silhouette=0.449


Unnamed: 0_level_0,Respiratory_Rate,Oxygen_Saturation,O2_Scale,Systolic_BP,Heart_Rate,Temperature,On_Oxygen,pct_High
cluster_kmeans,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,18.78,94.71,1.0,115.05,89.21,37.42,0.11,0.8
1,24.75,89.75,2.0,94.15,109.85,38.69,0.57,51.6
2,28.26,87.53,1.0,85.07,121.05,39.07,0.61,100.0


In [4]:
from mlxtend.frequent_patterns import apriori, association_rules

work = df.copy()

# Discretize into clinically meaningful bins (change thresholds if needed)
cuts = {}

if "Oxygen_Saturation" in work.columns:
    work["SpO2_low"] = (work["Oxygen_Saturation"] < 92).astype(int)
if "Respiratory_Rate" in work.columns:
    work["RR_high"] = (work["Respiratory_Rate"] >= 24).astype(int)
if "Systolic_BP" in work.columns:
    work["SBP_low"] = (work["Systolic_BP"] < 100).astype(int)
if "Heart_Rate" in work.columns:
    work["HR_high"] = (work["Heart_Rate"] >= 110).astype(int)
if "Temperature" in work.columns:
    work["Temp_high"] = (work["Temperature"] >= 38.0).astype(int)
if "On_Oxygen" in work.columns:
    work["O2_yes"] = (work["On_Oxygen"] == 1).astype(int)
if "Consciousness" in work.columns:
    work["Consc_not_A"] = (~work["Consciousness"].isin(["A","ALERT"])).astype(int)

# Consequent: High risk (if available)
work["HighRisk"] = (work["Risk_Level"].astype(str) == "High").astype(int)

# Build one-hot transaction data (booleans)
bool_cols = [c for c in work.columns if work[c].dropna().isin([0,1]).all()]
basket = work[bool_cols].astype(bool)

# Frequent itemsets
freq = apriori(basket, min_support=0.05, use_colnames=True)
# Association rules with 'HighRisk' as consequent if present
rules = association_rules(freq, metric="lift", min_threshold=1.0)
if "HighRisk" in rules["consequents"].explode().unique():
    rules = rules[rules["consequents"].apply(lambda s: "HighRisk" in list(s))]

# Sort strongest rules
rules = (rules.sort_values(["lift","confidence","support"], ascending=False)
              .reset_index(drop=True))

# Clean display: turn frozensets into strings
rules["antecedents"] = rules["antecedents"].apply(lambda s: ", ".join(sorted(s)))
rules["consequents"] = rules["consequents"].apply(lambda s: ", ".join(sorted(s)))

print("Top rules (showing support, confidence, lift):")
rules[["antecedents","consequents","support","confidence","lift"]].head(10)


Top rules (showing support, confidence, lift):


Unnamed: 0,antecedents,consequents,support,confidence,lift
0,"HR_high, O2_yes, SpO2_low","HighRisk, On_Oxygen, RR_high, SBP_low",0.09,0.849057,5.937459
1,"HR_high, On_Oxygen, SpO2_low","HighRisk, O2_yes, RR_high, SBP_low",0.09,0.849057,5.937459
2,"O2_yes, RR_high, SBP_low","HR_high, HighRisk, On_Oxygen, SpO2_low",0.09,0.629371,5.937459
3,"On_Oxygen, RR_high, SBP_low","HR_high, HighRisk, O2_yes, SpO2_low",0.09,0.629371,5.937459
4,"O2_yes, RR_high, SBP_low","HR_high, HighRisk, On_Oxygen, SpO2_low, Temp_high",0.072,0.503497,5.923488
5,"On_Oxygen, RR_high, SBP_low","HR_high, HighRisk, O2_yes, SpO2_low, Temp_high",0.072,0.503497,5.923488
6,"HR_high, O2_yes, SpO2_low, Temp_high","HighRisk, On_Oxygen, RR_high, SBP_low",0.072,0.847059,5.923488
7,"HR_high, On_Oxygen, SpO2_low, Temp_high","HighRisk, O2_yes, RR_high, SBP_low",0.072,0.847059,5.923488
8,"HR_high, O2_yes, SpO2_low, Temp_high","HighRisk, On_Oxygen, SBP_low",0.079,0.929412,5.882353
9,"HR_high, On_Oxygen, SpO2_low, Temp_high","HighRisk, O2_yes, SBP_low",0.079,0.929412,5.882353
