In [4]:
import os

import pandas as pd

# Folder that receives one CSV per patient.
PATIENT_DIR = "Patients"
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing into it.
os.makedirs(PATIENT_DIR, exist_ok=True)

# load dataset
df = pd.read_csv("combined_patient_dataset_clean.csv")

# The patient identifier column is 'Patient_ID'. Note that groupby skips rows
# whose key is missing (pandas default dropna=True), so such rows are not
# written to any per-patient file.
for pid, group in df.groupby("Patient_ID"):
    group.to_csv(os.path.join(PATIENT_DIR, f"patient_{pid}.csv"), index=False)


In [1]:
!pip install pandas numpy xgboost shap streamlit matplotlib seaborn joblib

^C


In [50]:
# ---- JUPYTER NOTEBOOK SETUP ----

# Required packages are installed by the separate `pip install` cell (run it once per environment)


import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
import xgboost as xgb
import shap
import warnings
warnings.filterwarnings("ignore")

# ---------- CONFIG ----------
# Folder names, relative to the notebook's working directory.
DATA_FOLDER, MODEL_FOLDER = "Patients", "models"

# NOTE(review): presumably the label columns to model — confirm against the training code.
TARGETS = [
    "Kidney_Diagnosis",
    "Heart_Diagnosis",
    "Diabetes_Diagnosis",
]

# Create the model folder up front; a no-op when it already exists.
os.makedirs(MODEL_FOLDER, exist_ok=True)

# ---------- STREAMLIT APP CODE ----------
# Source of the Streamlit app, held as a string so it can be written to a .py file.
# The text lives inside a plain triple-double-quoted literal, so it must not
# contain a triple double-quote sequence or backslashes (helper docstrings
# therefore use triple single quotes).
streamlit_code = """
# ---- STREAMLIT APP: chronic care risk dashboard ----
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
import shap
import warnings
warnings.filterwarnings("ignore")

# ---------- CONFIG ----------
DATA_FOLDER = "Patients"
MODEL_FOLDER = "models"
os.makedirs(MODEL_FOLDER, exist_ok=True)

# Fixed y-axis ranges for specific columns (column names are matched lower-cased)
FIXED_RANGES = {
    "systolic_bp": (50, 200),
    "diastolic_bp": (50, 120),
    "heart_rate": (40, 180),
    "glucose": (50, 300),
    "creatinine": (0, 5),
    "weight": (30, 150),
    "steps": (0, 20000)
}


# -----------------------------
# Helpers (pure functions, no Streamlit calls)
# -----------------------------
def prepare_patient_frame(raw):
    '''Return a normalised copy of a patient frame, or None when it is unusable.

    A frame is unusable when it has no rows or no Patient_ID column.
    Patient_ID is cast to str, and a synthetic daily date column ending
    today is added when the data carries no date column. The input frame
    is not modified.
    '''
    if raw.empty or 'Patient_ID' not in raw.columns:
        return None
    frame = raw.copy()
    frame['Patient_ID'] = frame['Patient_ID'].astype(str)
    if 'date' not in frame.columns:
        frame['date'] = pd.date_range(end=pd.Timestamp.today(), periods=len(frame))
    return frame


def build_features(df):
    '''Collapse a patient history into a single-row feature frame.

    Numeric columns yield <col>_mean, <col>_std and <col>_last; other
    columns (except Patient_ID and date) yield <col>_last.
    '''
    feats = {}
    for col in df.select_dtypes(include=[np.number]).columns:
        mean = df[col].mean()
        std = df[col].std()
        feats[f"{col}_mean"] = mean
        # std is NaN for a single observation and 0 for a constant series;
        # in both cases fall back to 10% of the mean.
        feats[f"{col}_std"] = std if std > 0 else mean*0.1
        feats[f"{col}_last"] = df[col].iloc[-1]

    for col in df.select_dtypes(exclude=[np.number]).columns:
        if col in ('Patient_ID', 'date'):
            continue
        feats[f"{col}_last"] = str(df[col].iloc[-1])

    X = pd.DataFrame([feats])

    # Encode non-numeric features to integer codes.
    # NOTE(review): with a single row every code is 0, so this only makes the
    # column numeric; it does not reproduce any encoding used at training time.
    for col in X.select_dtypes(exclude=[np.number]).columns:
        X[col] = X[col].astype('category').cat.codes
    return X


def align_features(X_pred, expected_cols):
    '''Return X_pred restricted and ordered to expected_cols.

    Missing <base>_mean columns are filled from <base>_last, missing
    <base>_std columns from 10% of <base>_last, and anything else with 0.
    Suffixes are matched at the end of the name only. X_pred is not modified.
    Raises ValueError when expected_cols is None.
    '''
    if expected_cols is None:
        raise ValueError("model has no stored feature names; cannot align input columns")
    expected_cols = list(expected_cols)
    X = X_pred.copy()
    for c in expected_cols:
        if c in X.columns:
            continue
        if c.endswith("_mean"):
            base = c[:-len("_mean")] + "_last"
            X[c] = X[base] if base in X.columns else 0
        elif c.endswith("_std"):
            base = c[:-len("_std")] + "_last"
            X[c] = 0.1*X[base] if base in X.columns else 0
        else:
            X[c] = 0
    return X[expected_cols]


def y_limits(col, series):
    '''Return (ymin, ymax) for a trend plot of the given column.

    Uses the fixed range when one is configured for the lower-cased column
    name; otherwise pads the observed range by 20% (or by 5 when constant).
    '''
    fixed = FIXED_RANGES.get(col.lower())
    if fixed is not None:
        return fixed
    ymin, ymax = series.min(), series.max()
    margin = 5 if ymin == ymax else (ymax - ymin) * 0.2
    return ymin - margin, ymax + margin


# Automatically detect all model files in the folder (sorted for a stable display order)
TARGETS = sorted(os.path.splitext(f)[0] for f in os.listdir(MODEL_FOLDER) if f.endswith(".joblib"))

st.title("Chronic Care 90-Day Risk Prediction")

# -----------------------------
# Load models
# -----------------------------
models = {}
for target in TARGETS:
    model_path = os.path.join(MODEL_FOLDER, f"{target}.joblib")
    try:
        # NOTE: joblib.load unpickles arbitrary objects; keep only trusted files in MODEL_FOLDER.
        models[target] = joblib.load(model_path)
    except Exception as e:
        # One unreadable model file should not take the whole app down.
        st.warning(f"Could not load model {model_path}: {e}")

# -----------------------------
# Load patient CSVs from folder
# -----------------------------
patients = {}
if os.path.exists(DATA_FOLDER):
    for fname in sorted(os.listdir(DATA_FOLDER)):
        if not fname.endswith(".csv"):
            continue
        try:
            frame = prepare_patient_frame(pd.read_csv(os.path.join(DATA_FOLDER, fname)))
        except Exception as e:
            st.warning(f"Skipping {fname}: {e}")
            continue
        if frame is None:
            st.warning(f"Skipping {fname}: file is empty or has no Patient_ID column")
            continue
        patients[frame['Patient_ID'].iloc[0]] = frame
patient_ids = list(patients.keys())

# -----------------------------
# Patient selection or CSV upload
# -----------------------------
uploaded_file = st.file_uploader("Or upload a patient CSV", type="csv")

if uploaded_file is not None:
    df = prepare_patient_frame(pd.read_csv(uploaded_file))
    if df is None:
        st.error("Uploaded CSV is empty or has no Patient_ID column.")
        st.stop()
    st.write("### Uploaded Patient Data (last 10 rows)")
    st.dataframe(df.tail(10))
elif patient_ids:
    patient_id = st.selectbox("Select Patient from folder", patient_ids)
    df = patients[patient_id]
    st.write("### Raw Patient Data (last 10 rows)")
    st.dataframe(df.tail(10))
else:
    st.error("No patient data found in folder and no CSV uploaded!")
    st.stop()

# -----------------------------
# Feature engineering
# -----------------------------
X_pred = build_features(df)

# -----------------------------
# Risk Predictions
# -----------------------------
st.write("## Risk Predictions")
if not models:
    st.warning(f"No trained models found in '{MODEL_FOLDER}'.")
for target, model in models.items():
    try:
        # Align columns: fill missing *_mean/_std with *_last if available
        X_pred_ordered = align_features(X_pred, model.get_booster().feature_names)

        prob = model.predict_proba(X_pred_ordered)[0,1]
        st.metric(f"Predicted risk of {target.replace('_',' ')}", f"{prob*100:.1f}%")

        # SHAP explanations
        explainer = shap.TreeExplainer(model)
        sv = explainer.shap_values(X_pred_ordered)
        shap_series = pd.Series(sv[0], index=X_pred_ordered.columns)
        st.write("Top drivers:")
        for feat in shap_series.abs().sort_values(ascending=False).head(3).index:
            st.write("-", feat)
    except Exception as e:
        st.error(f"Prediction failed for {target}: {e}")


# -----------------------------
# Trend plots: 3 per row with fixed scales
# -----------------------------
st.write("## Trends Over Time")

# Plot against the full timestamp: a day-of-month axis wraps around at every
# month boundary and scrambles any series longer than one month.
dates = pd.to_datetime(df['date'])

cols_to_plot = [c for c in df.columns if c not in ('Patient_ID', 'date')]

for start in range(0, len(cols_to_plot), 3):
    row_cols = cols_to_plot[start:start + 3]
    fig, axs = plt.subplots(1, 3, figsize=(15, 3))
    for i, ax in enumerate(axs):
        if i >= len(row_cols):
            ax.axis('off')
            continue
        col = row_cols[i]
        series = pd.to_numeric(df[col], errors='coerce')
        if series.dropna().empty:  # Skip fully non-numeric columns
            ax.axis('off')
            continue
        ax.plot(dates, series, marker='o')
        ymin, ymax = y_limits(col, series)
        ax.set_ylim(ymin, ymax)
        ax.set_title(col)
        ax.set_xlabel("Date")
    fig.autofmt_xdate()
    st.pyplot(fig)
    # Streamlit reruns the script on every interaction; close the figure so
    # matplotlib does not accumulate open figures.
    plt.close(fig)
"""

# Save the app source to a .py file next to the notebook.
app_path = "risk_app_jupyter.py"
# Write as UTF-8 explicitly: Python reads source files as UTF-8 by default,
# while open() without an encoding falls back to the platform locale.
with open(app_path, "w", encoding="utf-8") as f:
    f.write(streamlit_code)

print("Streamlit app saved as:", app_path)


Streamlit app saved as: risk_app_jupyter.py


In [8]:
# Launch the generated Streamlit app from the notebook.
# NOTE(review): this appears to hold the cell until it is interrupted (the
# captured output is just ^C) — consider running it from a terminal instead.
!streamlit run risk_app_jupyter.py


^C
