<a href="https://colab.research.google.com/github/Nity05/Health/blob/main/hackwell.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import shap
import matplotlib.pyplot as plt
import os

# Seed NumPy's legacy global RNG; the estimators below additionally receive
# an explicit random_state=42.
np.random.seed(42)

# === Load dataset ===
# Single source of truth for the dataset path so the error message can never
# drift from the file that is actually read.
DATASET_PATH = 'chronic_disease_risk_dataset.csv'

try:
    data = pd.read_csv(DATASET_PATH)
except FileNotFoundError as err:
    # Raise instead of calling exit(1): exit() is an interactive-shell helper,
    # whereas an exception stops the cell with a clear traceback.
    raise FileNotFoundError(
        f"Dataset file not found. Please ensure '{DATASET_PATH}' exists."
    ) from err

# Drop the patient_id column when present (presumably a row identifier rather
# than a predictor -- confirm against the dataset description).
data = data.drop(columns=['patient_id'], errors='ignore')

# === Feature groups ===
# Maps each group name to the raw dataset columns used as inputs for that
# group's sub-model. The group name is also used to build the per-group target
# column name, "risk_<group>".
feature_groups = {
    'demographics': ['age', 'sex', 'BMI', 'waist_circumference', 'chronic_condition',
                     'family_history', 'duration_of_condition', 'smoking_status',
                     'alcohol_use', 'vaccination_status'],
    'vitals': ['mean_systolic_bp', 'mean_diastolic_bp', 'bp_variability', 'mean_heart_rate',
               'resting_hr', 'hr_variability_index', 'mean_resp_rate', 'mean_spo2', 'min_spo2',
               'weight_change_30d', 'edema_present'],
    'labs': ['latest_hba1c', 'hba1c_trend', 'fasting_glucose', 'postprandial_glucose',
             'ldl_cholesterol', 'hdl_cholesterol', 'triglycerides', 'creatinine', 'egfr',
             'urine_albumin', 'c_reactive_protein', 'bnp_level', 'hemoglobin'],
    'medication': ['med_adherence_rate', 'missed_doses', 'refill_gap_days', 'insulin_use',
                   'insulin_dose_change', 'antihypertensive_use', 'diuretic_use', 'statin_use',
                   'medication_burden'],
    'lifestyle': ['avg_steps_per_day', 'sedentary_hours_avg', 'physical_activity_days',
                  'sleep_hours_avg', 'sleep_quality_index', 'diet_quality_index',
                  'fluid_intake_liters', 'stress_level_index', 'depression_score',
                  'anxiety_score', 'social_support_score'],
    'healthcare': ['outpatient_visits_last90', 'hospital_admissions_last180',
                   'emergency_visits_last180', 'icu_admissions_last180',
                   'missed_clinic_appointments', 'telehealth_usage']
}

# === Categorical columns ===
# Per group, the subset of that group's features that is one-hot encoded;
# every feature of the group not listed here is treated as numeric
# (mean-imputed and standardised). Keys mirror those of feature_groups.
categorical_columns = {
    'demographics': ['sex', 'chronic_condition', 'family_history', 'smoking_status', 'alcohol_use', 'vaccination_status'],
    'vitals': ['edema_present'],
    'labs': [],
    'medication': ['insulin_use', 'antihypertensive_use', 'diuretic_use', 'statin_use'],
    'lifestyle': [],
    'healthcare': []
}

# Validate columns
# Every input feature, the global target and one "risk_<group>" target per
# feature group must be present before any model is trained.
all_features = [f for group in feature_groups.values() for f in group]
required_cols = (
    all_features
    + ['deterioration_within_90d']
    + [f"risk_{group}" for group in feature_groups]
)
missing_cols = [col for col in required_cols if col not in data.columns]
if missing_cols:
    # Raise rather than exit(1) so the cell stops here with a traceback that
    # names the missing columns.
    raise KeyError(f"Missing columns in dataset: {missing_cols}")

# === Preprocessing ===
def preprocess_features(data, features, categorical_cols):
    """Fit a preprocessing ColumnTransformer on ``data[features]``.

    Numeric columns (every feature not listed in ``categorical_cols``) are
    mean-imputed and standardised; categorical columns are imputed with the
    most frequent value and one-hot encoded with the first level dropped.

    Args:
        data: DataFrame containing at least the columns in ``features``.
        features: Column names to preprocess.
        categorical_cols: Subset of ``features`` to one-hot encode.

    Returns:
        Tuple ``(preprocessor, X_transformed, feature_names)``: the fitted
        transformer, the transformed matrix, and output column names
        (numeric names first, then the one-hot names).

    Raises:
        Exception: any failure is printed and re-raised unchanged.
    """
    try:
        print(f"Preprocessing features: {features}")
        numeric_cols = [name for name in features if name not in categorical_cols]

        numeric_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ])
        categorical_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
        ])
        preprocessor = ColumnTransformer([
            ('num', numeric_pipeline, numeric_cols),
            ('cat', categorical_pipeline, categorical_cols),
        ])

        X_transformed = preprocessor.fit_transform(data[features])

        # Output order follows the transformer order: numeric block, then the
        # expanded one-hot block.
        feature_names = list(numeric_cols)
        if categorical_cols:
            encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
            feature_names.extend(encoder.get_feature_names_out(categorical_cols))

        print(f"X_transformed shape: {X_transformed.shape}, Feature names: {feature_names}")
        return preprocessor, X_transformed, feature_names
    except Exception as e:
        print(f"Error in preprocess_features: {e}")
        raise

# === Train sub-model ===
def train_sub_model(X, y, model_type, categorical_cols, features):
    """Fit the preprocessing pipeline and one classifier for a feature group.

    Args:
        X: DataFrame holding at least the columns in ``features``.
        y: Class labels for the rows of ``X``; cast to ``int`` before fitting,
            so string labels such as ``'0'``..``'3'`` are accepted.
        model_type: One of ``'logistic'``, ``'random_forest'``, ``'xgboost'``
            or ``'lightgbm'``.
        categorical_cols: Subset of ``features`` to one-hot encode.
        features: Column names used as model inputs.

    Returns:
        Tuple ``(model, preprocessor, y_pred_proba, feature_names)``.
        ``y_pred_proba`` holds the fitted model's class probabilities for the
        same rows it was trained on (in-sample predictions).

    Raises:
        ValueError: if ``model_type`` is not one of the supported values.
        Exception: any other failure is printed and re-raised unchanged.
    """
    try:
        print(f"Training model with type: {model_type}, features: {features}, target unique values: {np.unique(y)}")

        # Build the estimator first so an unsupported model_type fails fast
        # with a clear message instead of an UnboundLocalError at fit time.
        if model_type == 'logistic':
            model = LogisticRegression(random_state=42, max_iter=2000)
        elif model_type == 'random_forest':
            model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=5)
        elif model_type == 'xgboost':
            # use_label_encoder is no longer passed: the installed XGBoost
            # reports it as an unused parameter.
            model = XGBClassifier(random_state=42, eval_metric='mlogloss', max_depth=4, learning_rate=0.05)
        elif model_type == 'lightgbm':
            model = LGBMClassifier(random_state=42, max_depth=4, learning_rate=0.05, n_estimators=150, verbose=-1)
        else:
            raise ValueError(
                f"Unsupported model_type {model_type!r}; expected one of "
                "'logistic', 'random_forest', 'xgboost', 'lightgbm'"
            )

        preprocessor, X_transformed, feature_names = preprocess_features(X, features, categorical_cols)

        # Convert string labels to integers for models that require it
        y_numeric = y.astype(int)

        model.fit(X_transformed, y_numeric)
        y_pred_proba = model.predict_proba(X_transformed)
        print(f"y_pred_proba shape: {y_pred_proba.shape}")
        return model, preprocessor, y_pred_proba, feature_names
    except Exception as e:
        print(f"Error in train_sub_model: {e}")
        raise

# === Metrics ===
def evaluate_model(y_true, y_pred_proba, title, class_labels=None):
    """Print a confusion matrix and classification report for predictions.

    Labels are compared as strings. Binary probabilities are thresholded at
    0.5; multiclass probabilities are decoded with ``argmax``.

    Args:
        y_true: True labels (array-like or pandas Series).
        y_pred_proba: Predicted probabilities. Accepted shapes are ``(n,)`` or
            ``(n, 1)`` (probability of the positive class), ``(n, 2)``
            (binary) and ``(n, k)`` with ``k > 2`` (multiclass).
        title: Name printed in the log lines and report header.
        class_labels: Optional class values in the column order of
            ``y_pred_proba`` (for example ``model.classes_``). When omitted,
            binary predictions are labelled ``'0'``/``'1'`` and multiclass
            columns are matched to the sorted unique values of ``y_true``.

    Raises:
        ValueError: if the number of class labels does not match the number of
            probability columns, which would otherwise mislabel predictions.
        Exception: any other failure is printed and re-raised unchanged.
    """
    try:
        y_pred_proba = np.asarray(y_pred_proba)
        print(f"Evaluating {title}, y_true unique: {np.unique(y_true)}, y_pred_proba shape: {y_pred_proba.shape}")
        y_true_str = np.asarray(y_true).astype(str)

        n_columns = 1 if y_pred_proba.ndim == 1 else y_pred_proba.shape[1]
        is_binary = n_columns <= 2

        labels = None
        if class_labels is not None:
            labels = [str(cls) for cls in class_labels]
            expected = 2 if is_binary else n_columns
            if len(labels) != expected:
                raise ValueError(
                    f"class_labels has {len(labels)} entries but {expected} classes are implied by "
                    f"y_pred_proba with shape {y_pred_proba.shape}"
                )

        if is_binary:
            if n_columns == 1:
                print(f"Warning: y_pred_proba has shape {y_pred_proba.shape}, assuming binary classification with single probability")
                # Flatten so an (n, 1) input yields 1-D predictions.
                positive_proba = y_pred_proba.reshape(-1)
            else:
                positive_proba = y_pred_proba[:, 1]
            class_indices = (positive_proba >= 0.5).astype(int)
            if labels is None:
                y_pred_str = class_indices.astype(str)
                # Report on every label seen in truth or prediction so a class
                # absent from y_true is not silently dropped from the matrix.
                labels = sorted({str(v) for v in y_true_str} | {str(v) for v in y_pred_str})
            else:
                y_pred_str = np.array(labels)[class_indices]
        else:
            class_indices = np.argmax(y_pred_proba, axis=1)
            if labels is None:
                labels = [str(cls) for cls in sorted(np.unique(y_true))]
                if len(labels) != n_columns:
                    # Column i can only be mapped to the i-th sorted label when
                    # every class occurs in y_true; otherwise the mapping is wrong.
                    raise ValueError(
                        f"y_true contains {len(labels)} classes but y_pred_proba has {n_columns} columns; "
                        "pass class_labels (e.g. model.classes_) to map columns to labels"
                    )
            y_pred_str = np.array(labels)[class_indices]

        conf_matrix = confusion_matrix(y_true_str, y_pred_str, labels=labels)
        class_report = classification_report(y_true_str, y_pred_str, labels=labels)

        print(f"\n=== {title} ===")
        print(f"Confusion Matrix:\n{conf_matrix}")
        print(f"Classification Report:\n{class_report}")
    except Exception as e:
        print(f"Error in evaluate_model: {e}")
        raise

# === Robust SHAP ===
def shap_summary_plot(model, X, feature_names, title):
    """Compute SHAP values for ``model`` on ``X`` and save a summary plot.

    The plot is written to ``shap_plots/shap_summary_<title>.png`` and the
    feature with the largest mean absolute SHAP value is printed.

    Args:
        model: Fitted estimator to explain.
        X: Preprocessed feature matrix passed to the explainer.
        feature_names: Names for the columns of ``X``. If empty or of the
            wrong length, generic ``feature_<i>`` names are used instead.
        title: Label used in the plot title, log lines and output file name.

    Returns:
        Tuple ``(shap_values, feature_names_safe)``.

    Raises:
        Exception: any failure is printed and re-raised unchanged.
    """
    try:
        print(f"Generating SHAP plot for {title}, X shape: {X.shape}, feature_names: {feature_names}")

        # Any estimator exposing predict_proba and fit gets TreeExplainer first.
        # That check also matches non-tree models (e.g. LogisticRegression);
        # for those TreeExplainer is expected to raise and the generic
        # shap.Explainer is used instead.
        if hasattr(model, "predict_proba") and hasattr(model, "fit"):
            try:
                explainer = shap.TreeExplainer(model)
                shap_values = explainer(X, check_additivity=False)  # additivity check disabled
            except Exception:
                explainer = shap.Explainer(model, X)
                shap_values = explainer(X)
        else:
            explainer = shap.Explainer(model, X)
            shap_values = explainer(X)

        # Safe feature names
        n_features = shap_values.values.shape[1]
        if feature_names and len(feature_names) == n_features:
            feature_names_safe = feature_names
        else:
            feature_names_safe = [f"feature_{i}" for i in range(n_features)]

        print(f"SHAP values shape: {shap_values.values.shape}, feature_names_safe: {feature_names_safe}")
        fig = plt.figure()
        try:
            shap.summary_plot(shap_values, X, feature_names=feature_names_safe, show=False)
            plt.title(f'SHAP Summary - {title}')
            plt.tight_layout()
            os.makedirs('shap_plots', exist_ok=True)
            plt.savefig(f'shap_plots/shap_summary_{title}.png')
        finally:
            # Close the figure that is current after plotting and, in case SHAP
            # drew on a different figure, the one opened above. Closing an
            # already-closed figure is a no-op. This also runs on errors, so no
            # empty figure is left for the notebook to render.
            plt.close()
            plt.close(fig)

        # Aggregate for multiclass: a 3-D array is averaged over its first and
        # last axes, leaving one importance value per feature.
        if shap_values.values.ndim == 3:
            shap_abs = np.abs(shap_values.values).mean(axis=(0, 2))
        else:
            shap_abs = np.abs(shap_values.values).mean(axis=0)

        most_idx = np.argmax(shap_abs)
        print(f"Most contributing feature index: {most_idx}, feature: {feature_names_safe[most_idx]}")
        return shap_values, feature_names_safe
    except Exception as e:
        print(f"Error in shap_summary_plot: {e}")
        raise


# === Run pipeline ===
# Ascending bucket edges per risk column, keyed by bucket count (2 or 4).
# A score falls in bucket i when exactly i edges are <= the score.
_RISK_BUCKET_EDGES = {
    "risk_demographics": {2: (7,), 4: (5, 7, 9)},
    "risk_vitals": {2: (0.2,), 4: (0.1, 0.5, 1.0)},
    "risk_labs": {2: (6.5,), 4: (5, 6.5, 8)},
    "risk_medication": {2: (1,), 4: (0.5, 1.0, 1.5)},
    "risk_lifestyle": {2: (0.05,), 4: (0.01, 0.1, 0.5)},
    "risk_healthcare": {2: (4,), 4: (2, 4, 6)},
}


def _bucketize_risk(x, colname, n_buckets=4):
    """Map a raw risk score to a bucket label string ('0', '1', ...).

    Values that cannot be converted to float are reported and mapped to '0',
    as are scores for column names without configured edges. Any ``n_buckets``
    other than 2 selects the 4-bucket edges.
    """
    try:
        value = float(x)
    except (ValueError, TypeError):
        print(f"Invalid value in bucketize_risk for {colname}: {x}")
        return '0'

    edges_by_count = _RISK_BUCKET_EDGES.get(colname)
    if edges_by_count is None:
        return '0'

    edges = edges_by_count[2 if n_buckets == 2 else 4]
    # "not value < edge" (rather than "value >= edge") keeps the behaviour of
    # the original if/elif ladder for NaN: every comparison is False, so NaN
    # lands in the highest bucket.
    # NOTE(review): confirm that is the intended handling of missing scores.
    return str(sum(1 for edge in edges if not value < edge))


def run_pipeline(data, feature_groups, categorical_columns):
    """Train one sub-model per feature group and a fusion model on their outputs.

    The data is split into train/validation/test partitions stratified on the
    global target. For each feature group the ``risk_<group>`` column is
    bucketised into class labels, a sub-model is trained on the group's
    features and evaluated on the test split, and a SHAP summary plot is
    saved. Each sub-model's predicted class index becomes one input column of
    an XGBoost fusion model that predicts ``deterioration_within_90d``.

    Args:
        data: DataFrame with all feature columns, the ``risk_<group>`` columns
            and the ``deterioration_within_90d`` target.
        feature_groups: Mapping of group name to its feature column names.
        categorical_columns: Mapping of group name to the categorical subset
            of that group's features.

    Returns:
        Tuple ``(sub_models_dict, fusion_model, risk_scores_test, y_test)``.
        ``sub_models_dict`` maps each group to a dict with keys ``'model'``,
        ``'preprocessor'`` and ``'feature_names'``.

    Raises:
        KeyError: if a ``risk_<group>`` column is missing.
        ValueError: if a group has no usable feature columns.
        Exception: any failure is printed and re-raised unchanged.
    """
    try:
        global_target = 'deterioration_within_90d'

        print("Splitting data...")
        X = data.drop(columns=[global_target])
        y_global = data[global_target]
        print(f"X shape: {X.shape}, y_global unique: {np.unique(y_global)}")

        # 15% held out for test; 17.65% of the remaining 85% (about 15% of the
        # total) becomes the validation split.
        X_temp, X_test, y_temp, y_test = train_test_split(X, y_global, test_size=0.15, stratify=y_global, random_state=42)
        X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, stratify=y_temp, random_state=42)
        print(f"Train shape: {X_train.shape}, Val shape: {X_val.shape}, Test shape: {X_test.shape}")

        model_types = {
            'demographics': 'logistic',
            'vitals': 'random_forest',
            'labs': 'xgboost',
            'medication': 'lightgbm',
            'lifestyle': 'random_forest',
            'healthcare': 'logistic'
        }
        # Groups whose risk score is split into two buckets instead of four.
        two_bucket_groups = ('vitals', 'lifestyle')

        sub_models_dict = {}
        risk_scores_train = pd.DataFrame(index=X_train.index)
        # NOTE(review): validation scores are computed but not consumed below
        # and are not part of the return value.
        risk_scores_val = pd.DataFrame(index=X_val.index)
        risk_scores_test = pd.DataFrame(index=X_test.index)

        for group, features in feature_groups.items():
            print(f"\nTraining {group} sub-model...")
            target_col = f"risk_{group}"
            n_buckets = 2 if group in two_bucket_groups else 4
            if target_col not in X_train.columns:
                print(f"Error: Risk column {target_col} not found in dataset")
                raise KeyError(f"Risk column {target_col} missing")

            y_train_group = X_train[target_col].apply(lambda v: _bucketize_risk(v, target_col, n_buckets))
            y_test_group = X_test[target_col].apply(lambda v: _bucketize_risk(v, target_col, n_buckets))
            print(f"y_train_group unique: {np.unique(y_train_group)}")

            group_features = [f for f in features if f != target_col and f in data.columns]
            if not group_features:
                print(f"Error: No valid features for group {group}")
                raise ValueError(f"No valid features for group {group}")

            model, preprocessor, y_pred_train, feature_names = train_sub_model(
                X_train, y_train_group, model_types[group], categorical_columns[group], group_features
            )
            sub_models_dict[group] = {'model': model, 'preprocessor': preprocessor, 'feature_names': feature_names}

            X_val_transformed = preprocessor.transform(X_val[group_features])
            X_test_transformed = preprocessor.transform(X_test[group_features])
            print(f"X_val_transformed shape: {X_val_transformed.shape}, X_test_transformed shape: {X_test_transformed.shape}")

            # Predict the test split once and reuse it for both the fusion
            # input and the sub-model evaluation.
            test_proba = model.predict_proba(X_test_transformed)

            # NOTE(review): the training scores are in-sample predictions of a
            # model fitted on the same rows, so the fusion model trains on
            # more optimistic inputs than it sees at test time.
            risk_scores_train[group] = np.argmax(y_pred_train, axis=1)
            risk_scores_val[group] = np.argmax(model.predict_proba(X_val_transformed), axis=1)
            risk_scores_test[group] = np.argmax(test_proba, axis=1)

            evaluate_model(y_test_group, test_proba, f"{group} sub-model")

            shap_values, safe_feature_names = shap_summary_plot(model, X_test_transformed, feature_names, group)
            shap_abs = np.abs(shap_values.values).mean(axis=(0, 2)) if len(shap_values.values.shape) == 3 else np.abs(shap_values.values).mean(axis=0)
            most_idx = np.argmax(shap_abs)
            print(f"Most contributing feature ({group}): {safe_feature_names[most_idx]}")

        print("\nTraining fusion model...")
        # use_label_encoder is no longer passed: the installed XGBoost reports
        # it as an unused parameter.
        fusion_model = XGBClassifier(random_state=42, eval_metric='logloss', max_depth=8, learning_rate=0.05)
        fusion_model.fit(risk_scores_train, y_train)  # Use y_train directly (binary: 0, 1)

        evaluate_model(y_test, fusion_model.predict_proba(risk_scores_test), "Fusion Model")

        return sub_models_dict, fusion_model, risk_scores_test, y_test
    except Exception as e:
        print(f"Error in run_pipeline: {e}")
        raise

# === Execute pipeline ===
try:
    sub_models, fusion_model, risk_scores_test, y_test = run_pipeline(data, feature_groups, categorical_columns)
except Exception as e:
    print(f"Pipeline failed: {e}")
    # Re-raise so the cell stops with the real traceback. Swallowing the error
    # would leave sub_models / fusion_model undefined and make later cells
    # fail with an unrelated NameError.
    raise

Splitting data...
X shape: (30000, 66), y_global unique: [0 1]
Train shape: (20999, 66), Val shape: (4501, 66), Test shape: (4500, 66)

Training demographics sub-model...
y_train_group unique: ['0' '1' '2' '3']
Training model with type: logistic, features: ['age', 'sex', 'BMI', 'waist_circumference', 'chronic_condition', 'family_history', 'duration_of_condition', 'smoking_status', 'alcohol_use', 'vaccination_status'], target unique values: ['0' '1' '2' '3']
Preprocessing features: ['age', 'sex', 'BMI', 'waist_circumference', 'chronic_condition', 'family_history', 'duration_of_condition', 'smoking_status', 'alcohol_use', 'vaccination_status']
X_transformed shape: (20999, 14), Feature names: ['age', 'BMI', 'waist_circumference', 'duration_of_condition', 'sex_1', 'chronic_condition_1', 'chronic_condition_2', 'chronic_condition_3', 'family_history_1', 'smoking_status_1', 'smoking_status_2', 'alcohol_use_1', 'alcohol_use_2', 'vaccination_status_1']
y_pred_proba shape: (20999, 4)
X_val_trans

  shap.summary_plot(shap_values, X, feature_names=feature_names_safe, show=False)


Most contributing feature index: 6, feature: chronic_condition_2
Most contributing feature (demographics): chronic_condition_2

Training vitals sub-model...
y_train_group unique: ['0' '1']
Training model with type: random_forest, features: ['mean_systolic_bp', 'mean_diastolic_bp', 'bp_variability', 'mean_heart_rate', 'resting_hr', 'hr_variability_index', 'mean_resp_rate', 'mean_spo2', 'min_spo2', 'weight_change_30d', 'edema_present'], target unique values: ['0' '1']
Preprocessing features: ['mean_systolic_bp', 'mean_diastolic_bp', 'bp_variability', 'mean_heart_rate', 'resting_hr', 'hr_variability_index', 'mean_resp_rate', 'mean_spo2', 'min_spo2', 'weight_change_30d', 'edema_present']
X_transformed shape: (20999, 11), Feature names: ['mean_systolic_bp', 'mean_diastolic_bp', 'bp_variability', 'mean_heart_rate', 'resting_hr', 'hr_variability_index', 'mean_resp_rate', 'mean_spo2', 'min_spo2', 'weight_change_30d', 'edema_present_1']
y_pred_proba shape: (20999, 2)
X_val_transformed shape: (4

  shap.summary_plot(shap_values, X, feature_names=feature_names_safe, show=False)
  summary_legacy(
  summary_legacy(


Most contributing feature index: 0, feature: mean_systolic_bp
Most contributing feature (vitals): mean_systolic_bp

Training labs sub-model...
y_train_group unique: ['0' '1' '2' '3']
Training model with type: xgboost, features: ['latest_hba1c', 'hba1c_trend', 'fasting_glucose', 'postprandial_glucose', 'ldl_cholesterol', 'hdl_cholesterol', 'triglycerides', 'creatinine', 'egfr', 'urine_albumin', 'c_reactive_protein', 'bnp_level', 'hemoglobin'], target unique values: ['0' '1' '2' '3']
Preprocessing features: ['latest_hba1c', 'hba1c_trend', 'fasting_glucose', 'postprandial_glucose', 'ldl_cholesterol', 'hdl_cholesterol', 'triglycerides', 'creatinine', 'egfr', 'urine_albumin', 'c_reactive_protein', 'bnp_level', 'hemoglobin']
X_transformed shape: (20999, 13), Feature names: ['latest_hba1c', 'hba1c_trend', 'fasting_glucose', 'postprandial_glucose', 'ldl_cholesterol', 'hdl_cholesterol', 'triglycerides', 'creatinine', 'egfr', 'urine_albumin', 'c_reactive_protein', 'bnp_level', 'hemoglobin']


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


y_pred_proba shape: (20999, 4)
X_val_transformed shape: (4501, 13), X_test_transformed shape: (4500, 13)
Evaluating labs sub-model, y_true unique: ['0' '1' '2' '3'], y_pred_proba shape: (4500, 4)

=== labs sub-model ===
Confusion Matrix:
[[ 334   22    0    0]
 [  14 1769   40    0]
 [   0   50 1688   13]
 [   0    0   39  531]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       356
           1       0.96      0.97      0.97      1823
           2       0.96      0.96      0.96      1751
           3       0.98      0.93      0.95       570

    accuracy                           0.96      4500
   macro avg       0.96      0.95      0.96      4500
weighted avg       0.96      0.96      0.96      4500

Generating SHAP plot for labs, X shape: (4500, 13), feature_names: ['latest_hba1c', 'hba1c_trend', 'fasting_glucose', 'postprandial_glucose', 'ldl_cholesterol', 'hdl_cholesterol', 'triglycerides', 'creatinine', 

  shap.summary_plot(shap_values, X, feature_names=feature_names_safe, show=False)


Most contributing feature index: 6, feature: triglycerides
Most contributing feature (labs): triglycerides

Training medication sub-model...
y_train_group unique: ['0' '1' '2' '3']
Training model with type: lightgbm, features: ['med_adherence_rate', 'missed_doses', 'refill_gap_days', 'insulin_use', 'insulin_dose_change', 'antihypertensive_use', 'diuretic_use', 'statin_use', 'medication_burden'], target unique values: ['0' '1' '2' '3']
Preprocessing features: ['med_adherence_rate', 'missed_doses', 'refill_gap_days', 'insulin_use', 'insulin_dose_change', 'antihypertensive_use', 'diuretic_use', 'statin_use', 'medication_burden']
X_transformed shape: (20999, 9), Feature names: ['med_adherence_rate', 'missed_doses', 'refill_gap_days', 'insulin_dose_change', 'medication_burden', 'insulin_use_1', 'antihypertensive_use_1', 'diuretic_use_1', 'statin_use_1']




y_pred_proba shape: (20999, 4)
X_val_transformed shape: (4501, 9), X_test_transformed shape: (4500, 9)




Evaluating medication sub-model, y_true unique: ['0' '1' '2' '3'], y_pred_proba shape: (4500, 4)

=== medication sub-model ===
Confusion Matrix:
[[ 631    4    0    0]
 [   0 1427   16    0]
 [   0    1 1490   13]
 [   0    0    1  917]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       635
           1       1.00      0.99      0.99      1443
           2       0.99      0.99      0.99      1504
           3       0.99      1.00      0.99       918

    accuracy                           0.99      4500
   macro avg       0.99      0.99      0.99      4500
weighted avg       0.99      0.99      0.99      4500

Generating SHAP plot for medication, X shape: (4500, 9), feature_names: ['med_adherence_rate', 'missed_doses', 'refill_gap_days', 'insulin_dose_change', 'medication_burden', 'insulin_use_1', 'antihypertensive_use_1', 'diuretic_use_1', 'statin_use_1']
SHAP values shape: (4500, 9, 4), feature_names_safe: 

  shap.summary_plot(shap_values, X, feature_names=feature_names_safe, show=False)


Most contributing feature index: 4, feature: medication_burden
Most contributing feature (medication): medication_burden

Training lifestyle sub-model...
y_train_group unique: ['0' '1']
Training model with type: random_forest, features: ['avg_steps_per_day', 'sedentary_hours_avg', 'physical_activity_days', 'sleep_hours_avg', 'sleep_quality_index', 'diet_quality_index', 'fluid_intake_liters', 'stress_level_index', 'depression_score', 'anxiety_score', 'social_support_score'], target unique values: ['0' '1']
Preprocessing features: ['avg_steps_per_day', 'sedentary_hours_avg', 'physical_activity_days', 'sleep_hours_avg', 'sleep_quality_index', 'diet_quality_index', 'fluid_intake_liters', 'stress_level_index', 'depression_score', 'anxiety_score', 'social_support_score']
X_transformed shape: (20999, 11), Feature names: ['avg_steps_per_day', 'sedentary_hours_avg', 'physical_activity_days', 'sleep_hours_avg', 'sleep_quality_index', 'diet_quality_index', 'fluid_intake_liters', 'stress_level_ind

  shap.summary_plot(shap_values, X, feature_names=feature_names_safe, show=False)
  summary_legacy(
  summary_legacy(


Most contributing feature index: 10, feature: social_support_score
Most contributing feature (lifestyle): social_support_score

Training healthcare sub-model...
y_train_group unique: ['0' '1' '2' '3']
Training model with type: logistic, features: ['outpatient_visits_last90', 'hospital_admissions_last180', 'emergency_visits_last180', 'icu_admissions_last180', 'missed_clinic_appointments', 'telehealth_usage'], target unique values: ['0' '1' '2' '3']
Preprocessing features: ['outpatient_visits_last90', 'hospital_admissions_last180', 'emergency_visits_last180', 'icu_admissions_last180', 'missed_clinic_appointments', 'telehealth_usage']
X_transformed shape: (20999, 6), Feature names: ['outpatient_visits_last90', 'hospital_admissions_last180', 'emergency_visits_last180', 'icu_admissions_last180', 'missed_clinic_appointments', 'telehealth_usage']
y_pred_proba shape: (20999, 4)
X_val_transformed shape: (4501, 6), X_test_transformed shape: (4500, 6)
Evaluating healthcare sub-model, y_true uniqu

  shap.summary_plot(shap_values, X, feature_names=feature_names_safe, show=False)


Most contributing feature index: 3, feature: icu_admissions_last180
Most contributing feature (healthcare): icu_admissions_last180

Training fusion model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating Fusion Model, y_true unique: [0 1], y_pred_proba shape: (4500, 2)

=== Fusion Model ===
Confusion Matrix:
[[ 139 1763]
 [ 192 2406]]
Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.07      0.12      1902
           1       0.58      0.93      0.71      2598

    accuracy                           0.57      4500
   macro avg       0.50      0.50      0.42      4500
weighted avg       0.51      0.57      0.46      4500



<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

In [None]:
# Print the version of the NumPy package importable in this runtime.
import numpy
print(numpy.__version__)


2.0.2


In [None]:
import joblib

# Persist the per-group sub-model bundle and the fusion model as joblib files
# in the current working directory. The final call is left as the cell's last
# expression, so its return value is shown as the cell output.
joblib.dump(sub_models, "sub_models.joblib")
joblib.dump(fusion_model, "fusion_model.joblib")


['fusion_model.joblib']

In [None]:
from google.colab import files

# Trigger a browser download for each saved joblib artifact, sub-models first.
for artifact_name in ("sub_models.joblib", "fusion_model.joblib"):
    files.download(artifact_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd

# Read the sample patient CSV from the working directory into `df`.
df = pd.read_csv("sample_patient.csv")

# Show only the first row as a quick sanity check of the loaded columns.
print(df.head(1))



In [None]:
import pickle

# Serialise the fitted fusion model to model.pkl; the with-block closes the
# file handle even if pickling fails.
# NOTE(review): the same model is also written with joblib in another cell --
# confirm both formats are needed.
with open("model.pkl", "wb") as f:
    pickle.dump(fusion_model, f)

In [None]:
from google.colab import files
# Send the pickled fusion model to the browser as a download (Colab only).
files.download("model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pickle

# Serialise the sub-model dictionary (one entry per feature group) to
# sub_models.pkl; the with-block closes the file handle on exit.
with open("sub_models.pkl", "wb") as f:
    pickle.dump(sub_models, f)

In [None]:
# Import here so the cell runs on its own instead of relying on `files`
# having been imported by an earlier cell.
from google.colab import files

# Send the pickled sub-model dictionary to the browser as a download.
files.download("sub_models.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

# Offer both joblib files for download again, in the same order as before.
joblib_artifacts = ["sub_models.joblib", "fusion_model.joblib"]
for joblib_path in joblib_artifacts:
    files.download(joblib_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>