### Install Dependencies
Installs the `catboost` library, which is required for the CatBoost classifier.

In [3]:
!pip3 install catboost

Defaulting to user installation because normal site-packages is not writeable


### Import Libraries
Imports necessary libraries for data manipulation, model training, evaluation, and visualization.

In [4]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report, confusion_matrix, precision_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

### Load Dataset
Loads the preprocessed cardiovascular disease dataset from a remote CSV file.

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/Tejeswar001/Heart-ML/main/Data/cvd_preprocessed_data.csv')
df.head()

Unnamed: 0,Sex,Age,Weight (kg),Height (m),BMI,Abdominal Circumference (cm),Total Cholesterol (mg/dL),HDL (mg/dL),Fasting Blood Sugar (mg/dL),Smoking Status,...,Family History of CVD,CVD Risk Level,Height (cm),Waist-to-Height Ratio,Systolic BP,Diastolic BP,Blood Pressure Category,Estimated LDL (mg/dL),CVD Risk Score,BMI_calculated
0,0,-1.241797,-0.818103,-0.387342,-0.609472,-0.430438,0.819469,1.469608,-0.207264,0,...,0,1,-0.389697,-0.264199,-0.072148,-0.208656,1,0.420413,0.346382,-0.605314
1,0,0.690519,1.597125,-0.569215,1.731304,-0.720585,-0.786864,-0.344476,0.604483,1,...,1,2,-0.573158,-0.46269,0.574299,-0.854737,2,-0.673867,1.431275,1.726151
2,1,-0.233632,1.090706,0.431085,0.664951,0.385112,-1.309857,-0.603631,-0.88372,0,...,1,1,0.040886,0.14519,0.620474,0.07849,2,-1.126673,-0.313805,0.668382
3,0,-1.241797,0.662198,0.976704,0.066752,0.863464,-1.085717,0.562566,0.807419,1,...,0,1,0.98626,0.331275,0.805173,0.07849,2,-1.239875,0.325357,0.061701
4,0,0.94256,1.55817,1.06764,0.703964,-0.806845,-0.898934,0.756932,-1.188125,1,...,1,2,1.07799,-1.120193,0.712824,0.580997,3,-1.107806,-0.061505,0.706048


### Check Target Distribution
Displays the count of each class in the 'CVD Risk Level' column to understand the class balance.

In [6]:
df['CVD Risk Level'].value_counts()

CVD Risk Level
2    667
1    504
0    189
Name: count, dtype: int64

### Binarize Target Variable
Converts the 'CVD Risk Level' to a binary format: 1 for high risk (level 2) and 0 for others.

In [7]:
df['CVD Risk Level'] = df['CVD Risk Level'].apply(lambda x: 1 if x == 2 else 0)

### Verify Target Distribution
Checks the class counts again to confirm the binarization process.

In [8]:
df['CVD Risk Level'].value_counts()

CVD Risk Level
0    693
1    667
Name: count, dtype: int64

### Define Features and Target
Separates the features (X) from the target variable (y). Drops 'CVD Risk Level' and 'CVD Risk Score' from features.

In [9]:
X = df.drop(['CVD Risk Level','CVD Risk Score'], axis=1)
y = df['CVD Risk Level']

### Initialize Results Dictionary
Creates a dictionary to store and compare performance metrics of different models.

In [10]:
result = {'Model': [], 'Accuracy': [], 'F1 Score': [], 'Recall': [], 'Precision': [], 'ROC AUC': []}

### Define VIF Calculation Function
Defines a function to calculate Variance Inflation Factor (VIF) for detecting multicollinearity.

In [11]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(X):
    vif = pd.DataFrame()
    vif["Variable"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif

ModuleNotFoundError: No module named 'statsmodels'

### Calculate VIF
Calculates VIF for the features to identify high multicollinearity.

In [None]:
vif = calculate_vif(X)
vif[vif['VIF'] > 10]

Unnamed: 0,Variable,VIF
2,Weight (kg),56.018195
3,Height (m),30.658927
4,BMI,69021.632298
5,Abdominal Circumference (cm),104.571303
6,Total Cholesterol (mg/dL),10.003334
13,Height (cm),16.349543
14,Waist-to-Height Ratio,126.098306
19,BMI_calculated,69150.966625


### Drop High VIF Features
Removes features with high VIF to reduce multicollinearity.

In [None]:
drop_column = ['BMI',
'BMI_calculated',
'Height (m)',
'Waist-to-Height Ratio'
]
X = X.drop(drop_column, axis=1)

### Re-calculate VIF
Checks VIF again to ensure multicollinearity is reduced.

In [None]:
calculate_vif(X)

Unnamed: 0,Variable,VIF
0,Sex,1.811226
1,Age,1.025097
2,Weight (kg),1.006613
3,Abdominal Circumference (cm),1.01659
4,Total Cholesterol (mg/dL),9.939403
5,HDL (mg/dL),1.659793
6,Fasting Blood Sugar (mg/dL),1.032301
7,Smoking Status,1.944258
8,Diabetes Status,1.906088
9,Physical Activity Level,2.268204


### Split Data into Train and Test Sets
Splits the dataset into training and testing sets with an 80-20 split, stratifying by the target variable.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Define Evaluation Metric Function
Creates a helper function `eval_metric` to calculate and print Accuracy, F1 Score, and Recall.

In [None]:
def eval_metric(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, Recall: {recall:.4f}")

### Train Logistic Regression Model
Trains a Logistic Regression model with balanced class weights and evaluates its performance.

In [None]:
lr = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

y_train_pred = lr.predict(X_train)

eval_metric(y_test, y_pred)
eval_metric(y_train, y_train_pred)
print(classification_report(y_test,y_pred))
result['Model'].append('Logistic Regression')
result['Accuracy'].append(accuracy_score(y_test, y_pred))
result['F1 Score'].append(f1_score(y_test, y_pred, average='weighted'))
result['Recall'].append(recall_score(y_test, y_pred, average='weighted'))
result['Precision'].append(precision_score(y_test, y_pred, average='weighted'))
result['ROC AUC'].append(roc_auc_score(y_test, y_pred))

Accuracy: 0.7574, F1 Score: 0.7573, Recall: 0.7574
Accuracy: 0.7436, F1 Score: 0.7435, Recall: 0.7436
              precision    recall  f1-score   support

           0       0.78      0.73      0.76       139
           1       0.74      0.78      0.76       133

    accuracy                           0.76       272
   macro avg       0.76      0.76      0.76       272
weighted avg       0.76      0.76      0.76       272



### Train Random Forest Model
Trains a Random Forest Classifier with 100 estimators and balanced class weights, then evaluates it.

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

y_train_pred = rf.predict(X_train)

eval_metric(y_test, y_pred_rf)
eval_metric(y_train, y_train_pred)
print(classification_report(y_test, y_pred_rf))
result['Model'].append('Random Forest')
result['Accuracy'].append(accuracy_score(y_test, y_pred_rf))
result['F1 Score'].append(f1_score(y_test, y_pred_rf, average='weighted'))
result['Recall'].append(recall_score(y_test, y_pred_rf, average='weighted'))
result['Precision'].append(precision_score(y_test, y_pred_rf, average='weighted'))
result['ROC AUC'].append(roc_auc_score(y_test, y_pred_rf))

Accuracy: 0.7574, F1 Score: 0.7570, Recall: 0.7574
Accuracy: 1.0000, F1 Score: 1.0000, Recall: 1.0000
              precision    recall  f1-score   support

           0       0.75      0.79      0.77       139
           1       0.77      0.72      0.74       133

    accuracy                           0.76       272
   macro avg       0.76      0.76      0.76       272
weighted avg       0.76      0.76      0.76       272



In [None]:
rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

y_train_pred = rf.predict(X_train)

eval_metric(y_test, y_pred_rf)
eval_metric(y_train, y_train_pred)
print(classification_report(y_test, y_pred_rf))
result['Model'].append('Random Forest Optimized')
result['Accuracy'].append(accuracy_score(y_test, y_pred_rf))
result['F1 Score'].append(f1_score(y_test, y_pred_rf, average='weighted'))
result['Recall'].append(recall_score(y_test, y_pred_rf, average='weighted'))
result['Precision'].append(precision_score(y_test, y_pred_rf, average='weighted'))
result['ROC AUC'].append(roc_auc_score(y_test, y_pred_rf))

Accuracy: 0.7537, F1 Score: 0.7536, Recall: 0.7537
Accuracy: 0.8024, F1 Score: 0.8024, Recall: 0.8024
              precision    recall  f1-score   support

           0       0.78      0.73      0.75       139
           1       0.73      0.78      0.76       133

    accuracy                           0.75       272
   macro avg       0.75      0.75      0.75       272
weighted avg       0.76      0.75      0.75       272



### Hyperparameter Tuning for SVC
Performs GridSearchCV to find the best hyperparameters for the Support Vector Classifier.

In [None]:
param_grid = {
    'svm__C': [1e-2, 1e-1, 1, 10, 100],
    'svm__gamma': ['scale', 1e-3, 1e-2, 1e-1, 1]
}

grid = GridSearchCV(
    estimator=Pipeline([
        ('scaler', StandardScaler()),
        ('svm', SVC(class_weight='balanced', probability=True, random_state=42))
    ]),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid.fit(X_train, y_train)


### Train Support Vector Classifier (SVC)
Trains a Support Vector Classifier (SVC) with balanced class weights and evaluates it.

In [None]:
svc = SVC(class_weight='balanced', C=1.0, random_state=42)
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)
y_train_pred = svc.predict(X_train)

eval_metric(y_test, y_pred_svc)
eval_metric(y_train, y_train_pred)
print(classification_report(y_test, y_pred_svc))
result['Model'].append('Support Vector Classifier')
result['Accuracy'].append(accuracy_score(y_test, y_pred_svc))
result['F1 Score'].append(f1_score(y_test, y_pred_svc, average='weighted'))
result['Recall'].append(recall_score(y_test, y_pred_svc, average='weighted'))
result['Precision'].append(precision_score(y_test, y_pred_svc, average='weighted'))
result['ROC AUC'].append(roc_auc_score(y_test, y_pred_svc))

Accuracy: 0.7757, F1 Score: 0.7758, Recall: 0.7757
Accuracy: 0.8373, F1 Score: 0.8372, Recall: 0.8373
              precision    recall  f1-score   support

           0       0.78      0.78      0.78       139
           1       0.77      0.77      0.77       133

    accuracy                           0.78       272
   macro avg       0.78      0.78      0.78       272
weighted avg       0.78      0.78      0.78       272



### Train XGBoost Model
Trains an XGBoost Classifier and evaluates its performance on the test set.

In [None]:
xgboost = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgboost.fit(X_train, y_train)
y_pred_xgb = xgboost.predict(X_test)
y_train_pred = xgboost.predict(X_train)

eval_metric(y_test, y_pred_xgb)
eval_metric(y_train, y_train_pred)
print(classification_report(y_test, y_pred_xgb))
result['Model'].append('XGBoost')
result['Accuracy'].append(accuracy_score(y_test, y_pred_xgb))
result['F1 Score'].append(f1_score(y_test, y_pred_xgb, average='weighted'))
result['Recall'].append(recall_score(y_test, y_pred_xgb, average='weighted'))
result['Precision'].append(precision_score(y_test, y_pred_xgb, average='weighted'))
result['ROC AUC'].append(roc_auc_score(y_test, y_pred_xgb))

Accuracy: 0.7353, F1 Score: 0.7345, Recall: 0.7353
Accuracy: 1.0000, F1 Score: 1.0000, Recall: 1.0000
              precision    recall  f1-score   support

           0       0.72      0.78      0.75       139
           1       0.75      0.68      0.72       133

    accuracy                           0.74       272
   macro avg       0.74      0.73      0.73       272
weighted avg       0.74      0.74      0.73       272



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


### Train CatBoost Model
Trains a CatBoost Classifier and evaluates its performance.

In [None]:
cat = CatBoostClassifier(random_state=42, verbose=0)
cat.fit(X_train, y_train)
y_pred_cat = cat.predict(X_test)
y_train_pred = cat.predict(X_train)

eval_metric(y_test, y_pred_cat)
eval_metric(y_train, y_train_pred)
print(classification_report(y_test, y_pred_cat))
result['Model'].append('CatBoost')
result['Accuracy'].append(accuracy_score(y_test, y_pred_cat))
result['F1 Score'].append(f1_score(y_test, y_pred_cat, average='weighted'))
result['Recall'].append(recall_score(y_test, y_pred_cat, average='weighted'))
result['Precision'].append(precision_score(y_test, y_pred_cat, average='weighted'))
result['ROC AUC'].append(roc_auc_score(y_test, y_pred_cat))

Accuracy: 0.7647, F1 Score: 0.7647, Recall: 0.7647
Accuracy: 0.9926, F1 Score: 0.9926, Recall: 0.9926
              precision    recall  f1-score   support

           0       0.77      0.77      0.77       139
           1       0.76      0.76      0.76       133

    accuracy                           0.76       272
   macro avg       0.76      0.76      0.76       272
weighted avg       0.76      0.76      0.76       272



### Hyperparameter Tuning with GridSearchCV
Performs hyperparameter tuning for Logistic Regression using GridSearchCV to find the best parameters based on recall.

In [None]:
# Parameter grid
param_grid = {
    'logreg__penalty': ['l1', 'l2', 'elasticnet'],
    'logreg__C': [0.01, 0.1, 1, 10, 100],
    'logreg__solver': ['saga'],   # saga supports l1, l2, elasticnet
    'logreg__l1_ratio': [0, 0.5, 1]  # only used for elasticnet
}

# GridSearchCV
grid_search = GridSearchCV(
    estimator=Pipeline([
        ('scaler', StandardScaler()),
        ('logreg', LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'))
    ]),
    param_grid=param_grid,
    scoring='recall',   # or 'roc_auc', 'f1', etc.
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Fit on training data
grid_search.fit(X_train, y_train)

# Best results
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# Best model
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best Parameters: {'logreg__C': 10, 'logreg__l1_ratio': 0, 'logreg__penalty': 'l1', 'logreg__solver': 'saga'}
Best CV Score: 0.7359372244754011




Best Parameters: {'logreg__C': 1, 'logreg__l1_ratio': 0, 'logreg__penalty': 'l1', 'logreg__solver': 'saga'}
Best CV Score: 0.7389971673783452

Best Parameters: {'logreg__C': 0.1, 'logreg__l1_ratio': 0, 'logreg__penalty': 'l2', 'logreg__solver': 'saga'}

Best CV Score: 0.7190971609945336

### Evaluate Best Model
Evaluates the best model found by GridSearchCV on the test set, printing classification report, ROC-AUC, Recall, Accuracy, and Precision.

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print("Recall:", recall_score(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
result['Model'].append('Tuned Logistic Regression')
result['Accuracy'].append(accuracy_score(y_test, y_pred))
result['F1 Score'].append(f1_score(y_test, y_pred, average='weighted'))
result['Recall'].append(recall_score(y_test, y_pred, average='weighted'))
result['Precision'].append(precision_score(y_test, y_pred, average='weighted'))
result['ROC AUC'].append(roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       0.78      0.73      0.76       139
           1       0.74      0.79      0.76       133

    accuracy                           0.76       272
   macro avg       0.76      0.76      0.76       272
weighted avg       0.76      0.76      0.76       272

ROC-AUC: 0.8079731703359119
Recall: 0.7894736842105263
Accuracy: 0.7610294117647058
Precision: 0.7394366197183099


### Compare Model Performance
Creates a DataFrame from the results and sorts models by Recall.

In [None]:
result_df = pd.DataFrame(result)
result_df.sort_values(by=['Recall','Accuracy'], ascending=False)

Unnamed: 0,Model,Accuracy,F1 Score,Recall,Precision,ROC AUC
3,Support Vector Classifier,0.775735,0.77575,0.775735,0.775787,0.775707
5,CatBoost,0.764706,0.764706,0.764706,0.764706,0.764591
6,Tuned Logistic Regression,0.761029,0.760942,0.761029,0.762524,0.807973
0,Logistic Regression,0.757353,0.7573,0.757353,0.75856,0.757884
1,Random Forest,0.757353,0.756985,0.757353,0.757932,0.756586
2,Random Forest Optimized,0.753676,0.753587,0.753676,0.75515,0.754287
4,XGBoost,0.735294,0.734518,0.735294,0.736627,0.734192


### Save Best Model
Saves the best performing model (Logistic Regression) to a file for later use.
Saves the best performing model (SVM) to a file for later use.

In [None]:
import joblib
joblib.dump(best_model, 'binary_logistic_regression_model.pkl')
joblib.dump(svc, 'binary_svc_model.pkl')

['binary_svc_model.pkl']