# AI-Powered Cardiovascular Risk Prediction System

This project predicts cardiovascular disease risk using machine learning.
It evaluates multiple models using:
- K-Fold Cross Validation
- Hyperparameter Tuning
- Accuracy and ROC-AUC metrics

The best-performing model is saved for deployment
and integrated with a React + FastAPI frontend.


# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

import joblib
import json

# Load Dataset

In [3]:
df = pd.read_csv("cleaned_cardio.csv")
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0,21.96712
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1,34.927679
2,2,51,1,165,64.0,130,70,3,1,0,0,0,1,23.507805
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1,28.710479
4,4,47,1,156,56.0,100,60,1,1,0,0,0,0,23.011177


# Data Exploration

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68325 entries, 0 to 68324
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           68325 non-null  int64  
 1   age          68325 non-null  int64  
 2   gender       68325 non-null  int64  
 3   height       68325 non-null  int64  
 4   weight       68325 non-null  float64
 5   ap_hi        68325 non-null  int64  
 6   ap_lo        68325 non-null  int64  
 7   cholesterol  68325 non-null  int64  
 8   gluc         68325 non-null  int64  
 9   smoke        68325 non-null  int64  
 10  alco         68325 non-null  int64  
 11  active       68325 non-null  int64  
 12  cardio       68325 non-null  int64  
 13  BMI          68325 non-null  float64
dtypes: float64(2), int64(12)
memory usage: 7.3 MB


In [7]:
df.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
count,68325.0,68325.0,68325.0,68325.0,68325.0,68325.0,68325.0,68325.0,68325.0,68325.0,68325.0,68325.0,68325.0,68325.0
mean,49976.734519,52.825174,1.348891,164.443264,73.961404,126.557263,81.242473,1.363586,1.224691,0.088006,0.053319,0.803352,0.493846,27.381074
std,28845.381376,6.770346,0.476623,7.845831,13.919499,16.430065,9.29912,0.678155,0.57054,0.283305,0.22467,0.397467,0.499966,5.035133
min,0.0,29.0,1.0,120.0,28.0,80.0,40.0,1.0,1.0,0.0,0.0,0.0,0.0,15.012197
25%,25009.0,48.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,23.875115
50%,50021.0,53.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,26.298488
75%,74873.0,58.0,2.0,170.0,82.0,140.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0,30.110991
max,99999.0,64.0,2.0,207.0,180.0,200.0,120.0,3.0,3.0,1.0,1.0,1.0,1.0,50.0


# Feature Target

In [9]:
# Create BMI
df["BMI"] = df["weight"] / ((df["height"] / 100) ** 2)

# Encode gender (1=female, 2=male â†’ 0/1)
df["gender"] = df["gender"].map({1: 0, 2: 1})

# Drop any missing values
df = df.dropna()

# Features
X = df[[
    "age",
    "ap_hi",
    "ap_lo",
    "BMI",
    "cholesterol",
    "gluc",
    "smoke",
    "alco",
    "active",
    "gender"
]]

# Target
y = df["cardio"]

# Train-Test Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# K-Fold

In [13]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model Pipeline & HyperParameter Tuning

In [15]:
models = {
    "Logistic Regression": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", LogisticRegression(max_iter=1000))
        ]),
        "params": {
            "model__C": [0.01, 0.1, 1, 10]
        }
    },

    "KNN": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", KNeighborsClassifier())
        ]),
        "params": {
            "model__n_neighbors": [3, 5, 7, 9]
        }
    },

    "Decision Tree": {
        "pipeline": Pipeline([
            ("model", DecisionTreeClassifier(random_state=42))
        ]),
        "params": {
            "model__max_depth": [None, 5, 10, 20]
        }
    },

    "Random Forest": {
        "pipeline": Pipeline([
            ("model", RandomForestClassifier(random_state=42))
        ]),
        "params": {
            "model__n_estimators": [100, 200],
            "model__max_depth": [None, 10, 20]
        }
    }
}

# Model Training & Cross Validation

In [18]:
results = []
best_models = {}

for name, config in models.items():
    print(f"\nTraining {name}...")

    grid = GridSearchCV(
        config["pipeline"],
        config["params"],
        cv=cv,
        scoring="roc_auc",
        n_jobs=-1
    )

    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)

    results.append({
        "Model": name,
        "Accuracy": accuracy,
        "ROC_AUC": roc_auc
    })

    best_models[name] = best_model



Training Logistic Regression...

Training KNN...

Training Decision Tree...

Training Random Forest...


# Model Comparision

In [19]:
results_df = pd.DataFrame(results).sort_values(
    by=["ROC_AUC", "Accuracy"], ascending=False
)

results_df

Unnamed: 0,Model,Accuracy,ROC_AUC
3,Random Forest,0.736919,0.802793
2,Decision Tree,0.733992,0.793966
0,Logistic Regression,0.726894,0.792472
1,KNN,0.715185,0.768459


# Select Best Model

In [22]:
best_model_name = results_df.iloc[0]["Model"]
best_model = best_models[best_model_name]

print("Best Model Selected:", best_model_name)

Best Model Selected: Random Forest


# Final Evaluation 

In [24]:
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7369191364800586
ROC-AUC: 0.8027932435219911

Classification Report:

              precision    recall  f1-score   support

           0       0.72      0.79      0.75      6917
           1       0.76      0.68      0.72      6748

    accuracy                           0.74     13665
   macro avg       0.74      0.74      0.74     13665
weighted avg       0.74      0.74      0.74     13665


Confusion Matrix:

[[5460 1457]
 [2138 4610]]


# Risk Category

In [26]:
def risk_category(prob):
    if prob < 0.30:
        return "Low Risk"
    elif prob < 0.70:
        return "Medium Risk"
    else:
        return "High Risk"

# Save Model

In [28]:
joblib.dump(best_model, "best_heart_risk_model.pkl")
print("Best model saved successfully!")

Best model saved successfully!


# Model MetaData for Frontend

In [30]:
model_metadata = {
    "model_name": best_model_name,
    "accuracy": float(results_df.iloc[0]["Accuracy"]),
    "roc_auc": float(results_df.iloc[0]["ROC_AUC"]),
    "features": list(X.columns)
}

with open("model_metadata.json", "w") as f:
    json.dump(model_metadata, f, indent=4)

print("Model metadata saved!")

Model metadata saved!
