### Install Dependencies
Installs the `catboost` library, which is required for the CatBoost classifier.

In [63]:
!pip3 install catboost



### Import Libraries
Imports necessary libraries for data manipulation, model training, evaluation, and visualization.

In [64]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report, confusion_matrix, precision_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

### Load Dataset
Loads the preprocessed cardiovascular disease dataset from a remote CSV file.

In [65]:
df = pd.read_csv('https://raw.githubusercontent.com/Tejeswar001/Heart-ML/main/Data/cvd_preprocessed_data.csv')
df.head()

Unnamed: 0,Sex,Age,Weight (kg),Height (m),BMI,Abdominal Circumference (cm),Total Cholesterol (mg/dL),HDL (mg/dL),Fasting Blood Sugar (mg/dL),Smoking Status,...,Family History of CVD,CVD Risk Level,Height (cm),Waist-to-Height Ratio,Systolic BP,Diastolic BP,Blood Pressure Category,Estimated LDL (mg/dL),CVD Risk Score,BMI_calculated
0,0,-1.241797,-0.818103,-0.387342,-0.609472,-0.430438,0.819469,1.469608,-0.207264,0,...,0,1,-0.389697,-0.264199,-0.072148,-0.208656,1,0.420413,0.346382,-0.605314
1,0,0.690519,1.597125,-0.569215,1.731304,-0.720585,-0.786864,-0.344476,0.604483,1,...,1,2,-0.573158,-0.46269,0.574299,-0.854737,2,-0.673867,1.431275,1.726151
2,1,-0.233632,1.090706,0.431085,0.664951,0.385112,-1.309857,-0.603631,-0.88372,0,...,1,1,0.040886,0.14519,0.620474,0.07849,2,-1.126673,-0.313805,0.668382
3,0,-1.241797,0.662198,0.976704,0.066752,0.863464,-1.085717,0.562566,0.807419,1,...,0,1,0.98626,0.331275,0.805173,0.07849,2,-1.239875,0.325357,0.061701
4,0,0.94256,1.55817,1.06764,0.703964,-0.806845,-0.898934,0.756932,-1.188125,1,...,1,2,1.07799,-1.120193,0.712824,0.580997,3,-1.107806,-0.061505,0.706048


### Check Target Distribution
Displays the count of each class in the 'CVD Risk Level' column to understand the class balance.

In [66]:
df['CVD Risk Level'].value_counts()

Unnamed: 0_level_0,count
CVD Risk Level,Unnamed: 1_level_1
2,667
1,504
0,189


### Binarize Target Variable
Converts the 'CVD Risk Level' to a binary format: 1 for high risk (level 2) and 0 for others.

In [67]:
df['CVD Risk Level'] = df['CVD Risk Level'].apply(lambda x: 1 if x == 2 else 0)

### Verify Target Distribution
Checks the class counts again to confirm the binarization process.

In [68]:
df['CVD Risk Level'].value_counts()

Unnamed: 0_level_0,count
CVD Risk Level,Unnamed: 1_level_1
0,693
1,667


### Define Features and Target
Separates the features (X) from the target variable (y). Drops 'CVD Risk Level' and 'CVD Risk Score' from features.

In [69]:
X = df.drop(['CVD Risk Level','CVD Risk Score'], axis=1)
y = df['CVD Risk Level']

### Split Data into Train and Test Sets
Splits the dataset into training and testing sets with an 80-20 split, stratifying by the target variable.

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Define Evaluation Metric Function
Creates a helper function `eval_metric` to calculate and print Accuracy, F1 Score, and Recall.

In [71]:
def eval_metric(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, Recall: {recall:.4f}")

### Train Logistic Regression Model
Trains a Logistic Regression model with balanced class weights and evaluates its performance.

In [72]:
lr = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

y_train_pred = lr.predict(X_train)

eval_metric(y_test, y_pred)
eval_metric(y_train, y_train_pred)

Accuracy: 0.7574, F1 Score: 0.7574, Recall: 0.7574
Accuracy: 0.7454, F1 Score: 0.7454, Recall: 0.7454


### Train Random Forest Model
Trains a Random Forest Classifier with 100 estimators and balanced class weights, then evaluates it.

In [73]:
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

y_train_pred = rf.predict(X_train)

eval_metric(y_test, y_pred_rf)
eval_metric(y_train, y_train_pred)

Accuracy: 0.7610, F1 Score: 0.7609, Recall: 0.7610
Accuracy: 1.0000, F1 Score: 1.0000, Recall: 1.0000


### Train Support Vector Classifier (SVC)
Trains a Support Vector Classifier (SVC) with balanced class weights and evaluates it.

In [74]:
svc = SVC(class_weight='balanced', random_state=42)
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)
y_train_pred = svc.predict(X_train)

eval_metric(y_test, y_pred_svc)
eval_metric(y_train, y_train_pred)

Accuracy: 0.7610, F1 Score: 0.7610, Recall: 0.7610
Accuracy: 0.8410, F1 Score: 0.8409, Recall: 0.8410


### Train XGBoost Model
Trains an XGBoost Classifier and evaluates its performance on the test set.

In [75]:
xgboost = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgboost.fit(X_train, y_train)
y_pred_xgb = xgboost.predict(X_test)
y_train_pred = xgboost.predict(X_train)

eval_metric(y_test, y_pred_xgb)
eval_metric(y_train, y_train_pred)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.7390, F1 Score: 0.7390, Recall: 0.7390
Accuracy: 1.0000, F1 Score: 1.0000, Recall: 1.0000


### Train CatBoost Model
Trains a CatBoost Classifier and evaluates its performance.

In [76]:
cat = CatBoostClassifier(random_state=42, verbose=0)
cat.fit(X_train, y_train)
y_pred_cat = cat.predict(X_test)
y_train_pred = cat.predict(X_train)

eval_metric(y_test, y_pred_cat)
eval_metric(y_train, y_train_pred)

Accuracy: 0.7610, F1 Score: 0.7610, Recall: 0.7610
Accuracy: 0.9917, F1 Score: 0.9917, Recall: 0.9917


### Hyperparameter Tuning with GridSearchCV
Performs hyperparameter tuning for Logistic Regression using GridSearchCV to find the best parameters based on recall.

In [77]:
# Parameter grid
param_grid = {
    'logreg__penalty': ['l1', 'l2', 'elasticnet'],
    'logreg__C': [0.01, 0.1, 1, 10, 100],
    'logreg__solver': ['saga'],   # saga supports l1, l2, elasticnet
    'logreg__l1_ratio': [0, 0.5, 1]  # only used for elasticnet
}

# GridSearchCV
grid_search = GridSearchCV(
    estimator=Pipeline([
        ('scaler', StandardScaler()),
        ('logreg', LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'))
    ]),
    param_grid=param_grid,
    scoring='recall',   # or 'roc_auc', 'f1', etc.
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Fit on training data
grid_search.fit(X_train, y_train)

# Best results
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# Best model
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best Parameters: {'logreg__C': 1, 'logreg__l1_ratio': 0, 'logreg__penalty': 'l2', 'logreg__solver': 'saga'}
Best CV Score: 0.7377711162052547




Best Parameters: {'logreg__C': 1, 'logreg__l1_ratio': 0, 'logreg__penalty': 'l1', 'logreg__solver': 'saga'}
Best CV Score: 0.7389971673783452

### Evaluate Best Model
Evaluates the best model found by GridSearchCV on the test set, printing classification report, ROC-AUC, Recall, Accuracy, and Precision.

In [78]:
from sklearn.metrics import classification_report, roc_auc_score

y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print("Recall:", recall_score(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.74      0.76       139
           1       0.74      0.78      0.76       133

    accuracy                           0.76       272
   macro avg       0.76      0.76      0.76       272
weighted avg       0.76      0.76      0.76       272

ROC-AUC: 0.8123546275761346
Recall: 0.7819548872180451
Accuracy: 0.7610294117647058
Precision: 0.7428571428571429
