In [1]:
# Step 1: Install + Import dependencies
!pip install scikit-learn pandas joblib

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import joblib




Load Dataset

In [2]:
# Open Colab's upload dialog. The shell commands below expect the uploaded
# file to be named kaggle.json (presumably the Kaggle API token - it is copied
# into ~/.kaggle before the kaggle CLI is used in the next cell).
from  google.colab import files
uploaded = files.upload()

# Create ~/.kaggle, copy the uploaded file into it and restrict the copy to
# owner read/write only (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [3]:
!kaggle datasets download -d redwankarimsony/heart-disease-data -p /content
!unzip -q /content/heart-disease-data.zip -d /content/heart-disease-data
!ls -ls /content/heart-disease-data

Dataset URL: https://www.kaggle.com/datasets/redwankarimsony/heart-disease-data
License(s): copyright-authors
Downloading heart-disease-data.zip to /content
  0% 0.00/12.4k [00:00<?, ?B/s]
100% 12.4k/12.4k [00:00<00:00, 55.2MB/s]
total 80
80 -rw-r--r-- 1 root root 79346 Sep 23  2020 heart_disease_uci.csv


In [4]:
# Read the extracted CSV into the DataFrame that the cells below work on.
df = pd.read_csv(filepath_or_buffer="/content/heart-disease-data/heart_disease_uci.csv")

In [7]:
# Sanity-check the load: report (rows, columns), then preview the first five rows.
print("Dataset shape:", df.shape)
df.head(n=5)

Dataset shape: (920, 16)


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


**Feature selection**

In [16]:
# Convert sex to numeric: Male=1, Female=0
# Series.map turns every value that is not a key of the mapping into NaN, so an
# unguarded second run of this cell would wipe the already-encoded column.
# Only apply the mapping while the raw labels are still present, which makes
# the cell safe to re-run.
sex_codes = {"Male": 1, "Female": 0}
if df["sex"].isin(list(sex_codes)).any():
    df["sex"] = df["sex"].map(sex_codes)

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Choose a richer feature set
features = ["age", "sex", "trestbps", "chol", "thalch", "exang", "oldpeak", "ca"]
target = "num"

# Fail fast on stale kernel state. "sex" must already hold the 0/1 encoding:
# - raw "Male"/"Female" strings would be rejected by the median imputer;
# - an all-NaN column (mapping applied twice) would be discarded by
#   SimpleImputer, silently training the model on seven features instead of eight.
assert df["sex"].notna().any() and df["sex"].dropna().isin([0, 1]).all(), (
    'df["sex"] must be encoded as 0/1 - reload the data and re-run the sex-encoding cell'
)

X = df[features]
# Binarize the target: any num > 0 becomes 1, num == 0 stays 0.
y = (df[target] > 0).astype(int)

# Stratified 80/20 split with a fixed seed so the class balance and the
# reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Random Forest pipeline. The median imputer and the scaler run on every
# feature column (including the already-numeric sex column) before the forest.
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("model", RandomForestClassifier(n_estimators=200, random_state=42))
])

pipeline.fit(X_train, y_train)

# Evaluate on the held-out split; column 1 of predict_proba is the probability
# of class 1, which is what ROC-AUC needs.
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("\nClassification Report:\n", classification_report(y_test, y_pred))




Accuracy: 0.782608695652174
ROC-AUC: 0.8492945958871353

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.76      0.76        82
           1       0.80      0.80      0.80       102

    accuracy                           0.78       184
   macro avg       0.78      0.78      0.78       184
weighted avg       0.78      0.78      0.78       184





In [26]:
import joblib

# Serialize the fitted pipeline (imputer + scaler + model) into a single file.
joblib.dump(value=pipeline, filename="heart_model_pipeline.joblib")
print("Model saved as heart_model_pipeline.joblib")


Model saved as heart_model_pipeline.joblib


In [29]:
import joblib

# Writes the same object to the same file name as the previous cell's save.
# NOTE(review): presumably kept so this cell can be run on its own - one of the
# two saves could be dropped.
joblib.dump(value=pipeline, filename="heart_model_pipeline.joblib")
print("Model saved as heart_model_pipeline.joblib")

def predict_from_payload(payload, model_path="heart_model_pipeline.joblib", model=None):
    """Score a single patient payload with the saved heart-disease pipeline.

    Parameters
    ----------
    payload : dict
        Must contain "age", "sex", "bp_systolic" and "cholesterol".
        "sex" may be the numeric code (Male=1, Female=0) or the raw label
        "Male"/"Female". The optional keys "thalch", "exang", "oldpeak" and
        "ca" override the hard-coded fallback values; every other key is
        ignored.
    model_path : str
        File the pipeline is loaded from with joblib when ``model`` is None.
    model : object, optional
        An already-loaded estimator exposing ``predict`` and
        ``predict_proba``. Passing it avoids re-reading the file on every
        call.

    Returns
    -------
    dict
        ``{"prediction": int, "probability": float}``; ``probability`` is the
        second column of ``predict_proba`` (class 1 for a 0/1 target).

    Raises
    ------
    KeyError
        If one or more required fields are missing; the message lists all
        of them.
    """
    if model is None:
        # joblib.load unpickles the file, which can execute arbitrary code:
        # only point model_path at artifacts you produced yourself.
        model = joblib.load(model_path)

    required = ("age", "sex", "bp_systolic", "cholesterol")
    missing = [key for key in required if key not in payload]
    if missing:
        raise KeyError(f"payload is missing required field(s): {', '.join(missing)}")

    # Accept the dataset's raw labels as well as the numeric code; any other
    # value is passed through unchanged.
    sex = payload["sex"]
    if isinstance(sex, str):
        sex = {"Male": 1, "Female": 0}.get(sex, sex)

    # Map input JSON -> feature row. Key order matches the column order used
    # when the pipeline was fitted.
    row = {
        "age": payload["age"],
        "sex": sex,
        "trestbps": payload["bp_systolic"],
        "chol": payload["cholesterol"],
        # Fallbacks for features the payload does not carry.
        # NOTE(review): these are fixed guesses, not values derived from the
        # training data - confirm they are acceptable for the use case.
        "thalch": payload.get("thalch", 150),
        "exang": payload.get("exang", 0),
        "oldpeak": payload.get("oldpeak", 1.0),
        "ca": payload.get("ca", 0),
    }

    df_row = pd.DataFrame([row])
    proba = model.predict_proba(df_row)[0][1]
    pred = model.predict(df_row)[0]

    return {"prediction": int(pred), "probability": float(proba)}

# Example request. predict_from_payload reads only age, sex, bp_systolic and
# cholesterol from it (plus thalch/exang/oldpeak/ca when supplied); the
# remaining lifestyle fields are not looked up, so they do not affect the result.
payload = dict(
    age=45,
    sex=1,
    bp_systolic=138,
    cholesterol=220,
    bmi=29.4,
    smoking=1,
    alcohol_level=1,
    exercise_freq=1,
    sleep_hours=5.5,
    stress_score=7,
)

print(predict_from_payload(payload))


Model saved as heart_model_pipeline.joblib
{'prediction': 0, 'probability': 0.185}


