In [None]:
import sys, warnings, joblib, numpy as np, pandas as pd
from pathlib import Path
from urllib.request import urlretrieve
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Silence UserWarning-category warnings for the rest of the process so the
# training output printed below stays readable.
warnings.filterwarnings("ignore", category=UserWarning)

# imbalanced-learn is a hard requirement: SMOTETomek does the resampling and
# ImbPipeline is the pipeline class used to chain it with the preprocessing
# and classifier steps.  Abort with an install hint when it is unavailable.
try:
    from imblearn.combine import SMOTETomek
    from imblearn.pipeline import Pipeline as ImbPipeline
except ImportError:
    sys.exit("❌  pip install -U imbalanced-learn")

# Per-dataset configuration, keyed by the short disease name that is also used
# for the cached CSV (``<name>.csv``) and the saved model
# (``<name>_pipeline.joblib``).
#
# Each entry provides:
#   url            - where the raw CSV is downloaded from
#   target_aliases - candidate names of the label column, tried in order
#   categorical    - feature columns to one-hot encode; every other feature
#                    column is handled as numeric by the training code
#   extra_feats    - callable DataFrame -> DataFrame applied to the raw frame
#                    both at training time and at prediction time
DATASETS = {
    "heart": {
        "url": "https://raw.githubusercontent.com/sharmaroshan/Heart-UCI-Dataset/master/heart.csv",
        "target_aliases": ["target"],
        "categorical": ["sex", "cp", "fbs", "restecg",
                        "exang", "slope", "ca", "thal"],
        # Engineered features derived from the raw heart columns.
        # NOTE(review): `pulse_pressure` is computed from trestbps and
        # oldpeak, which does not match the usual meaning of the name -
        # confirm the intended formula.  A zero `chol` would make
        # `age_chol_ratio` infinite.
        "extra_feats": lambda d: d.assign(
            age_chol_ratio=d.age / d.chol,
            bp_chol_product=d.trestbps * d.chol,
            pulse_pressure=d.trestbps - d.oldpeak * 10,
            cardiac_stress_indx=d.thalach / (1 + d.oldpeak),
        ),
    },
    "diabetes": {
        "url": "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv",
        "target_aliases": ["Outcome"],
        "categorical": [],
        # No engineered features for this dataset.
        "extra_feats": lambda d: d,
    },
    "stroke": {
        "url": ("https://gist.githubusercontent.com/aishwarya8615/"
                "d2107f828d3f904839cbcb7eaa85bd04/raw/"
                "healthcare-dataset-stroke-data.csv"),
        "target_aliases": ["stroke", "Stroke"],
        "categorical": ["gender", "ever_married", "work_type",
                        "Residence_type", "smoking_status"],
        # Drop the per-row identifier (the stroke CSV is expected to carry an
        # `id` column - TODO confirm).  Left in place it would be scaled and
        # fed to the model as a numeric feature even though it carries no
        # signal and is never supplied at prediction time.
        # `errors="ignore"` keeps this a no-op when the column is absent.
        "extra_feats": lambda d: d.drop(columns=["id"], errors="ignore"),
    },
}

# Model zoo: label -> (unfitted estimator, hyper-parameter grid).
# Grid keys carry the ``clf__`` prefix because each estimator is installed as
# the pipeline step named "clf" before the grid search runs.
CANDIDATES = dict(
    logreg=(
        LogisticRegression(class_weight="balanced", max_iter=300),
        dict(clf__C=[0.1, 1.0, 10]),
    ),
    rf=(
        RandomForestClassifier(
            class_weight="balanced",
            random_state=42,
            n_jobs=-1,
        ),
        dict(
            clf__n_estimators=[200, 400],
            clf__max_depth=[None, 10],
        ),
    ),
    gbc=(
        GradientBoostingClassifier(random_state=42),
        dict(
            clf__n_estimators=[200, 400],
            clf__learning_rate=[0.05, 0.1],
        ),
    ),
)

def pick_target(df, aliases):
    """Return the first name in *aliases* that is a column of *df*.

    Aliases are tried in the order given, so earlier entries take priority
    when several are present.

    Raises:
        KeyError: if none of the aliases is a column of *df*.
    """
    present = [alias for alias in aliases if alias in df.columns]
    if not present:
        raise KeyError(f"No target column found among {aliases}")
    return present[0]

def _ensure_csv(name, url):
    """Return the path of ``<name>.csv``, downloading it from *url* if absent."""
    fp = Path(f"{name}.csv")
    if fp.exists():
        return fp
    # Download to a side file and rename only on success, so an interrupted
    # transfer never leaves a truncated CSV that later runs would pick up.
    tmp = Path(f"{name}.csv.part")
    try:
        urlretrieve(url, tmp)
        tmp.replace(fp)
    finally:
        if tmp.exists():
            tmp.unlink()
    return fp


def _split_columns(X, declared_cats):
    """Partition the columns of *X* into ``(numeric, categorical)`` name lists."""
    absent = [c for c in declared_cats if c not in X.columns]
    if absent:
        raise ValueError(f"Categorical columns missing from data: {absent}")
    cats = list(declared_cats)
    # A non-numeric column that was not declared categorical cannot go through
    # the median imputer / scaler, so route it to the categorical branch.
    cats += [c for c in X.columns
             if c not in cats and not pd.api.types.is_numeric_dtype(X[c])]
    nums = [c for c in X.columns if c not in cats]
    return nums, cats


def train_best(name, cfg):
    """Train, select, calibrate and persist a classifier for one dataset.

    Steps:
      1. Load ``<name>.csv`` (downloaded from ``cfg["url"]`` on first use) and
         apply ``cfg["extra_feats"]``.
      2. Hold out a stratified 20% validation split.
      3. For every entry of ``CANDIDATES``, grid-search a
         preprocess -> SMOTETomek -> classifier pipeline with 5-fold CV
         (ROC-AUC) on the training split.
      4. Keep the candidate with the highest ROC-AUC on the validation split.
      5. Wrap it in an isotonic ``CalibratedClassifierCV`` (cv=5), fit that on
         the training split and dump it to ``<name>_pipeline.joblib``.

    Args:
        name: Dataset key; also the stem of the CSV and model file names.
        cfg: Dataset configuration with the keys ``url``, ``target_aliases``,
            ``categorical`` and ``extra_feats``.

    Returns:
        The fitted ``CalibratedClassifierCV`` that was written to disk.

    Raises:
        KeyError: if no target column is found (from ``pick_target``).
        ValueError: if a declared categorical column is not in the data.
        RuntimeError: if no candidate produced a usable model.
    """
    print(f"\n📦  Training {name.upper()}")
    fp = _ensure_csv(name, cfg["url"])

    df = cfg["extra_feats"](pd.read_csv(fp))
    tgt = pick_target(df, cfg["target_aliases"])
    X, y = df.drop(columns=[tgt]), df[tgt]

    nums, cats = _split_columns(X, cfg["categorical"])

    prep = ColumnTransformer([
        ("num", Pipeline([("imp", SimpleImputer(strategy="median")),
                          ("sc", StandardScaler())]), nums),
        ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                          ("oh", OneHotEncoder(handle_unknown="ignore"))]), cats)
    ])

    X_tr, X_val, y_tr, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42)

    # NOTE: the winner is chosen by its validation AUC, so the "Best" figure
    # printed below is an optimistic estimate rather than an unbiased one.
    best_auc, best_pipe = -1, None
    for label, (est, grid) in CANDIDATES.items():
        pipe = ImbPipeline([("prep", prep),
                            ("bal", SMOTETomek(random_state=42)),
                            ("clf", est)])
        gs = GridSearchCV(pipe, grid, cv=5, scoring="roc_auc",
                          n_jobs=-1, verbose=0)
        gs.fit(X_tr, y_tr)
        auc = roc_auc_score(y_val, gs.predict_proba(X_val)[:, 1])
        print(f"  • {label.upper():5s} AUC={auc:.3f}")
        if auc > best_auc:
            best_auc, best_pipe = auc, gs.best_estimator_

    if best_pipe is None:
        # Passing None on would make CalibratedClassifierCV silently fall back
        # to its own default estimator instead of one of our candidates.
        raise RuntimeError(f"No candidate model could be selected for {name!r}")

    print(f"🔝  Best = {best_auc:.3f}")
    # Calibration refits the winning pipeline inside its own 5-fold CV on the
    # training split; the validation split is not used for fitting.
    calibrated = CalibratedClassifierCV(best_pipe, method="isotonic", cv=5)
    calibrated.fit(X_tr, y_tr)
    joblib.dump(calibrated, f"{name}_pipeline.joblib")
    return calibrated

# Train and persist one calibrated model per configured dataset.  This runs
# whenever the module body executes, before any prediction is attempted.
for name in DATASETS:
    cfg = DATASETS[name]
    train_best(name, cfg)

def predict_combo(record: dict) -> dict:
    """Predict heart disease, diabetes and stroke for one patient record.

    For every dataset in ``DATASETS`` the saved ``<disease>_pipeline.joblib``
    model is loaded, the record is turned into a single-row frame with the
    dataset's ``extra_feats`` applied, and the model's class prediction is
    stored as an int.

    Args:
        record: Mapping of raw feature name to value.  Features a model
            expects but the record lacks are passed as NaN and left to the
            model's imputers.

    Returns:
        Dict with one 0/1 entry per disease plus ``"combo_label"``, a
        human-readable summary of the heart/diabetes/stroke combination.
    """
    base = pd.DataFrame([record])
    out = {}
    for disease, cfg in DATASETS.items():
        # NOTE: joblib.load unpickles arbitrary objects; only load model files
        # produced by this script, never files from an untrusted source.
        clf = joblib.load(f"{disease}_pipeline.joblib")
        expected = list(clf.feature_names_in_)

        # Pad absent inputs with NaN *before* feature engineering: the
        # engineered features read raw columns by name and would otherwise
        # fail on a record that omits one of them.
        row = base.copy()
        for col in expected:
            if col not in row.columns:
                row[col] = np.nan
        row = cfg["extra_feats"](row)

        # Hand the model exactly the columns it was fitted on, in fit order,
        # dropping fields that belong to the other datasets.
        row = row.reindex(columns=expected)

        out[disease] = int(clf.predict(row)[0])

    bits = "".join(str(out[d]) for d in ("heart", "diabetes", "stroke"))
    out["combo_label"] = {
        "000": "None",
        "100": "Heart only",
        "010": "Diabetes only",
        "001": "Stroke only",
        "110": "Heart + Diabetes",
        "101": "Heart + Stroke",
        "011": "Diabetes + Stroke",
        "111": "All three diseases",
    }[bits]
    return out

if __name__ == "__main__":
    # One demo patient whose fields span all three datasets; anything a model
    # expects that is not listed here is filled in by predict_combo.
    sample = {
        # heart dataset inputs
        "age": 50,
        "sex": 1,
        "cp": 0,
        "trestbps": 120,
        "chol": 225,
        "fbs": 0,
        "restecg": 1,
        "thalach": 140,
        "exang": 0,
        "oldpeak": 1.0,
        "slope": 2,
        "ca": 0,
        "thal": 2,
        # diabetes dataset inputs
        "Pregnancies": 1,
        "Glucose": 130,
        "BloodPressure": 80,
        "SkinThickness": 25,
        "Insulin": 100,
        "BMI": 28.0,
        "DiabetesPedigreeFunction": 0.3,
        # stroke dataset inputs
        "gender": "Male",
        "ever_married": "Yes",
        "work_type": "Private",
        "Residence_type": "Urban",
        "smoking_status": "never smoked",
    }
    result = predict_combo(sample)
    print("\n🩺  Combo prediction:", result)


📦  Training HEART
  • LOGREG AUC=0.883
  • RF    AUC=0.903
  • GBC   AUC=0.854
🔝  Best = 0.903

📦  Training DIABETES
  • LOGREG AUC=0.818
  • RF    AUC=0.822
  • GBC   AUC=0.825
🔝  Best = 0.825

📦  Training STROKE
  • LOGREG AUC=0.842
  • RF    AUC=0.780
  • GBC   AUC=0.807
🔝  Best = 0.842

🩺  Combo prediction: {'heart': 1, 'diabetes': 0, 'stroke': 0, 'combo_label': 'Heart only'}
