In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import joblib

In [3]:
# Read the synthetic and the real cardio CSV files (in that order) from the
# working directory into two separate DataFrames.
syn_data, data = (
    pd.read_csv(csv_path)
    for csv_path in ('cardio_data_syn.csv', 'cardio_data_real.csv')
)

In [4]:
def train_and_eval(df, dataset_name, target='Disease', test_size=0.2, random_state=42):
    """Train an XGBoost classifier on ``df`` and print hold-out metrics.

    The frame is split into train/test partitions, the features are
    standardised with statistics learned from the training partition only,
    and an ``XGBClassifier`` is fitted and evaluated on the scaled features.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the feature columns plus the ``target`` column.
    dataset_name : str
        Label used in the printed header.
    target : str, default 'Disease'
        Name of the label column.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed shared by the split and the classifier.

    Returns
    -------
    None
        Accuracy, classification report and confusion matrix are printed.
    """
    print(f"Training and evaluating on {dataset_name} dataset...")

    X = df.drop(columns=[target])
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training partition only so that no test-set
    # statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train and predict on the scaled features. Previously the scaled
    # matrices were computed but the raw frames were passed to the model,
    # unlike the standalone pipeline later in the notebook.
    model = XGBClassifier(random_state=random_state)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Classification Report:\n', classification_report(y_test, y_pred))
    print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
    print("------------------------------------------------------")

In [5]:
# Evaluate the same training recipe on each dataset, synthetic first.
for frame, label in ((syn_data, "Synthetic"), (data, "Real")):
    train_and_eval(frame, label)

Training and evaluating on Synthetic dataset...
Accuracy: 0.8765
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.66      0.69       409
           1       0.91      0.93      0.92      1591

    accuracy                           0.88      2000
   macro avg       0.81      0.80      0.81      2000
weighted avg       0.87      0.88      0.87      2000

Confusion Matrix:
 [[ 271  138]
 [ 109 1482]]
------------------------------------------------------
Training and evaluating on Real dataset...
Accuracy: 0.9929577464788732
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98       119
           1       1.00      0.99      1.00       449

    accuracy                           0.99       568
   macro avg       0.99      0.99      0.99       568
weighted avg       0.99      0.99      0.99       568

Confusion Matrix:
 [[118   1]
 [  3 446]]
------------------

In [6]:
# Load the dataset that the persisted model further down is trained on.
# NOTE(review): the row count looks like the synthetic and real sets
# combined — confirm how cardio_data.csv was produced.
df = pd.read_csv('cardio_data.csv')

In [7]:
# Sanity check: display the (rows, columns) dimensions of the loaded frame.
df.shape

(12837, 10)

In [8]:
X = df.drop('Disease', axis=1)
y = df['Disease']

In [9]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
partitions = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = partitions

# Learn the scaling statistics from the training partition only, then apply
# the same transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Fit the classifier on the standardised training features; the trailing
# expression displays the fitted estimator, as the bare fit() call did.
model = XGBClassifier(random_state=42).fit(X_train_scaled, y_train)
model

In [11]:
# Predict labels for the held-out rows, using the features scaled with the
# training-set statistics.
y_pred = model.predict(X_test_scaled)

In [13]:
# Compute the hold-out metrics first, then print them in a fixed order.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print('Accuracy:', test_accuracy)
print('Classification Report:\n', test_report)
print('Confusion Matrix:\n', test_confusion)

Accuracy: 0.9088785046728972
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.72      0.77       530
           1       0.93      0.96      0.94      2038

    accuracy                           0.91      2568
   macro avg       0.87      0.84      0.85      2568
weighted avg       0.91      0.91      0.91      2568

Confusion Matrix:
 [[ 383  147]
 [  87 1951]]


In [14]:
# NOTE(review): joblib is already imported in the first cell, so on a fresh
# kernel this install has to run before that import; consider moving it to the
# top of the notebook and pinning a version.
%pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [16]:
# Persist the fitted classifier and its scaler; predict_disease() loads both
# files, so they must be saved as a matching pair from the same training run.
joblib.dump(model, 'cardio_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [18]:
def predict_disease(input_data, model_path='cardio_model.pkl', scaler_path='scaler.pkl'):
    """Classify one set of measurements with the persisted model.

    Parameters
    ----------
    input_data : sequence of numbers
        One value per feature, in the column order the scaler was fitted on.
    model_path : str, default 'cardio_model.pkl'
        joblib file holding the fitted classifier.
    scaler_path : str, default 'scaler.pkl'
        joblib file holding the scaler fitted alongside the classifier.

    Returns
    -------
    str
        'Something is wrong' when the model predicts class 1, otherwise
        'You are healthy'. The function never raises: on any failure it
        returns a message starting with 'Error: ' so that callers can tell
        a failure apart from a prediction.
    """
    try:
        features = np.asarray(input_data, dtype=float)
        if features.size == 0:
            raise ValueError('input_data is empty')
        features = features.reshape(1, -1)

        # NOTE: joblib.load unpickles arbitrary objects; only point these
        # paths at files produced by this notebook.
        model = joblib.load(model_path)
        scaler = joblib.load(scaler_path)

        # Check the feature count up front for a clearer message than the
        # one the scaler would produce.
        expected = getattr(scaler, 'n_features_in_', None)
        if expected is not None and features.shape[1] != expected:
            raise ValueError(
                f'expected {expected} feature values, got {features.shape[1]}'
            )

        # The scaler was fitted on a DataFrame; rebuilding a named frame keeps
        # the name/order check active and avoids the feature-name warning.
        names = getattr(scaler, 'feature_names_in_', None)
        if names is not None:
            features = pd.DataFrame(features, columns=names)

        prediction = model.predict(scaler.transform(features))
        return 'Something is wrong' if prediction[0] == 1 else 'You are healthy'
    except Exception as e:
        # Best-effort contract: report the failure as text instead of raising.
        return f'Error: {e}'
        

In [36]:
# Hand-written sample inputs for predict_disease().
# NOTE(review): example_healthy_1..3 are defined but never scored in this
# cell; presumably they follow the same feature order as healthiest_person.
example_healthy_1 = [180, 55, 100, 140, 118, 76, 72, 23.5, 0.8]
example_healthy_2 = [190, 60, 90, 130, 115, 74, 68, 22.0, 0.5]
example_healthy_3 = [170, 50, 95, 125, 110, 70, 65, 24.0, 0.9]
healthiest_person = [
    150,  # Cholesterol (mg/dL) - Ideal: < 170
    65,   # HDL Cholesterol (mg/dL) - Ideal: > 60
    80,   # LDL Cholesterol (mg/dL) - Ideal: < 100
    100,  # Triglycerides (mg/dL) - Ideal: < 150
    110,  # Systolic Blood Pressure (mm Hg) - Ideal: 90-120
    70,   # Diastolic Blood Pressure (mm Hg) - Ideal: 60-80
    60,   # Heart Rate (bpm) - Ideal: 60-100
    21.5, # BMI (kg/m²) - Ideal: 18.5-24.9
    0.3   # C-reactive Protein (mg/L) - Ideal: < 1.0
]
# NOTE(review): the recorded run printed 'Something is wrong' for this
# profile — confirm which value of the 'Disease' label means "diseased".
print(predict_disease(healthiest_person))

Something is wrong




In [21]:
# Display the feature names in column order; inputs to predict_disease() are
# matched to the scaler's columns by position, so they must follow this order.
X.columns

Index(['Cholesterol', 'HDL Cholesterol', 'LDL Cholesterol', 'Triglycerides',
       'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Heart Rate',
       'BMI', 'C-reactive Protein'],
      dtype='object')

In [23]:
# Preview the first five feature rows to see typical value ranges.
X.head()

Unnamed: 0,Cholesterol,HDL Cholesterol,LDL Cholesterol,Triglycerides,Systolic Blood Pressure,Diastolic Blood Pressure,Heart Rate,BMI,C-reactive Protein
0,173.76,50.26,82.93,115.35,95.57,61.43,97.58,18.96,2.31
1,126.73,42.14,109.15,97.77,107.43,71.12,86.65,23.87,1.2
2,133.71,48.44,93.24,58.89,91.37,73.27,77.27,22.2,2.34
3,126.17,56.53,72.41,90.65,94.18,63.21,69.03,19.58,1.06
4,181.42,44.43,78.78,91.44,93.55,60.11,93.66,22.21,0.28
