## **Gradient Boosting for Heart Disease Classification, with SHAP Visualisation to Explain the Models' Predictions**

The external packages this notebook requires are downloaded and installed by the pip install code cell below, so that the notebook's requirements are resolved.

In [None]:
%pip install pandas pd numpy scikit-learn matplotlib seaborn kagglehub shap xgboost lightgbm dask dask-ml distributed

The notebook now imports the installed packages so that they are available to every subsequent cell.

In [None]:
import numpy as np # for performing more advanced operations on arrays, such as converting the heart disease dataset columns to float64 and int64 respectively as well as creating partions from the x training set
import pandas as pd # for converting the heart disease dataset to a more robustly interfacable pandas dataframe array for working with the various ML Models and helper methods
import matplotlib.pyplot as plt # plotting roc scores et al of the gradient boosting models
import seaborn as sns # for heatmap plotting the correlation matrix of the models

import kagglehub # for retrieving and downloading the heart disease dataset from kaggle

# Gradient Boosting Models
import xgboost as xgb # the xgboost model
import lightgbm as lgb # the lightgbm model
import shap # explaining how the two gradient boosting models made their predictions, i.e. feature importance

from sklearn.preprocessing import StandardScaler # for scaling the heart disease dataset features
from sklearn.model_selection import train_test_split # for splitting the heart disease dataset into training, testing and validation sets
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score # for generating model classification reports, confusion matrices and roc auc scores
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV # for selecting best hyperparameters for the gradient boosting models, both using 10 fold cross validation

# Dask for distributed learning of the LightGBM model
from dask.distributed import Client # contructing the dask client for distributed learning
from dask_ml.model_selection import GridSearchCV as DaskGridSearchCV # selecting best hyperparameters for the LightGBM model using dask distributed learning
import dask.array as da # data collection for the dask distributed learning client

import warnings # hiding warnings to clean up evaluation output

warnings.filterwarnings('ignore') # filter out any warnings in the notebook cell outputs

In [None]:
path = kagglehub.dataset_download("redwankarimsony/heart-disease-data") # download the Kaggle heart disease dataset and keep the returned path to its files

print("Path to dataset files:", path) # show the user where the dataset files are located

In [None]:
heart_data = path + "/heart_disease_uci.csv" # full path of the dataset CSV: the download path plus the CSV filename

In [None]:
heart_disease_df = pd.read_csv(heart_data) # load the heart disease CSV into a pandas DataFrame

In [None]:
heart_disease_df.head() # preview the first rows to confirm the CSV was loaded into a DataFrame

In [None]:
heart_disease_df.shape # (number of rows, number of columns) of the heart disease dataset

In [None]:
# Binarise the target: any value above 0 becomes 1, everything else becomes 0.
heart_disease_df["num"] = heart_disease_df["num"].gt(0).astype(np.int64)

In [None]:
heart_disease_df.head() # preview the rows again to confirm the target column `num` is now binary

In [None]:
heart_disease_df.info() # list every column with its dtype and its count of non-null values

In [None]:
# The `id` column is dropped from the feature table; the DataFrame keeps its own index.
heart_disease_df = heart_disease_df.drop("id", axis=1)

heart_disease_df.head()  # preview the frame without the `id` column

In [None]:
# Columns stored with the `object` dtype are treated as the categorical features.
object_typed_frame = heart_disease_df.select_dtypes(include=['object'])
categorical_cols = list(object_typed_frame.columns)

categorical_cols  # display the detected categorical column names

In [None]:
# Missing categorical values have to be handled BEFORE one-hot encoding.
# pd.get_dummies encodes a NaN as an all-zero row, which (with drop_first=True) is indistinguishable
# from the dropped reference category, and the resulting dummy columns no longer contain NaN, so the
# missing-value cell further down could neither drop nor mode-impute categorical columns.
categorical_missing_fraction = heart_disease_df[categorical_cols].isnull().mean()

# Same rule the missing-value cell applies: a column with more than half of its values missing is dropped.
sparse_categorical_cols = categorical_missing_fraction[categorical_missing_fraction > 0.5].index.tolist()
heart_disease_df = heart_disease_df.drop(columns=sparse_categorical_cols)

# The remaining categorical columns get their gaps filled with the most frequent value (mode).
encoded_cols = [col for col in categorical_cols if col not in sparse_categorical_cols]
for col in encoded_cols:
    if heart_disease_df[col].isnull().any():
        heart_disease_df[col] = heart_disease_df[col].fillna(heart_disease_df[col].mode()[0])

# One-hot encode the cleaned categorical columns; drop_first removes one level per column.
heart_disease_df = pd.get_dummies(heart_disease_df, columns=encoded_cols, drop_first=True)

In [None]:
heart_disease_df.isnull().sum() # count the missing values in each column of the heart disease dataset

In [None]:
missing_threshold = 0.5 # if over half the values in a column are missing, drop the column

# Drop every column whose fraction of missing values exceeds the threshold (new frame, no in-place mutation).
missing_fraction = heart_disease_df.isnull().mean()
mostly_missing_cols = missing_fraction[missing_fraction > missing_threshold].index.tolist()
heart_disease_df = heart_disease_df.drop(columns=mostly_missing_cols)

# Impute what is left: median for float64/int64 columns, mode for every other dtype.
# NOTE(review): the fill values are computed on the full dataset, i.e. before the train/test split.
for col in heart_disease_df.columns:
    if not heart_disease_df[col].isnull().any():
        continue # nothing to fill in this column

    if heart_disease_df[col].dtype in [np.float64, np.int64]:
        fill_value = heart_disease_df[col].median()
    else:
        fill_value = heart_disease_df[col].mode()[0]

    heart_disease_df[col] = heart_disease_df[col].fillna(fill_value)

# A bare expression in the middle of a cell is never displayed, so the check is made explicit:
# the cell fails loudly if any missing value survived the cleaning.
assert not heart_disease_df.isnull().values.any(), "missing values remain after imputation"

heart_disease_df.head() # output the cleaned heart disease dataset with no remaining missing values

In [None]:
# Feature columns to scale: every float64/int64 column except the target `num`.
numeric_frame = heart_disease_df.select_dtypes(include=[np.float64, np.int64])
numerical_cols = list(numeric_frame.columns)
numerical_cols.remove("num")

# Standardise the selected columns and write the scaled values back into the frame.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the train/test split.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(heart_disease_df[numerical_cols])
heart_disease_df[numerical_cols] = scaled_values

heart_disease_df.head()  # preview the frame with the scaled numerical features

In [None]:
# Pairwise correlations between all columns of the prepared dataset.
correlation_matrix = heart_disease_df.corr()

# Draw the matrix as an annotated heatmap on an explicitly created 12x10 figure.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Heatmap of Heart Disease Features', fontsize=16)

plt.show()  # render the heatmap

In [None]:
# Target vector: the `num` column.
y = heart_disease_df["num"]

# Feature matrix: every column except the target.
X = heart_disease_df.drop(columns=["num"])

In [None]:
# 80/20 train/test split with a fixed seed. `stratify=y` keeps the proportion of the two target
# classes the same in the training and testing sets, so the evaluation is not skewed by an
# unlucky class distribution in either split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=65, stratify=y)

### **XGBoost**

In [None]:
# Baseline XGBoost classifier: log-loss evaluation metric and a fixed seed.
xgb_baseline_params = {"eval_metric": 'logloss', "random_state": 65}
xgb_clf = xgb.XGBClassifier(**xgb_baseline_params)

xgb_clf.fit(X_train, y_train)  # train on the training split of the heart disease dataset

In [None]:
y_pred_xgb = xgb_clf.predict(X_test) # hard class predictions, used for the confusion matrix and classification report
y_proba_xgb = xgb_clf.predict_proba(X_test)[:, 1] # predicted probability of the positive class, used for ROC AUC

print(confusion_matrix(y_test, y_pred_xgb)) # output confusion matrix of the xgboost model

print(classification_report(y_test, y_pred_xgb)) # output classification report of the xgboost model

# ROC AUC measures how well the model ranks samples, so it has to be computed from scores/probabilities;
# passing the 0/1 predictions would collapse the ROC curve to a single threshold.
print('ROC AUC:', roc_auc_score(y_test, y_proba_xgb)) # output ROC AUC score of the xgboost model

In [None]:
# Candidate XGBoost hyperparameter values explored by the grid search
xgb_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.8, 0.9, 1.0],
    min_child_weight=[1, 3, 5],
)

In [None]:
# Exhaustive grid search over xgb_param_grid: 10-fold cross validation, scored by ROC AUC,
# run with n_jobs=-1 and progress messages enabled.
xgb_search_options = {"cv": 10, "scoring": 'roc_auc', "n_jobs": -1, "verbose": 1}
xgb_grid = GridSearchCV(
    estimator=xgb.XGBClassifier(random_state=65),
    param_grid=xgb_param_grid,
    **xgb_search_options,
)

xgb_grid.fit(X_train, y_train)  # run the search on the training data

print('Best XGBoost Params:', xgb_grid.best_params_)  # best hyperparameter combination found
print('Best XGBoost CV ROC AUC:', xgb_grid.best_score_)  # its mean cross-validated ROC AUC

## **LightGBM Non-Distributed**

In [None]:
# Baseline LightGBM classifier with a fixed seed.
lgb_baseline_params = {"random_state": 65}
lgb_clf = lgb.LGBMClassifier(**lgb_baseline_params)

lgb_clf.fit(X_train, y_train)  # train on the training split of the heart disease dataset

In [None]:
y_pred_lgb = lgb_clf.predict(X_test) # hard class predictions, used for the confusion matrix and classification report
y_proba_lgb = lgb_clf.predict_proba(X_test)[:, 1] # predicted probability of the positive class, used for ROC AUC

print(confusion_matrix(y_test, y_pred_lgb)) # output confusion matrix of the lightgbm model all using the testing data split

print(classification_report(y_test, y_pred_lgb)) # output the classification report of the lightgbm model

# ROC AUC measures how well the model ranks samples, so it has to be computed from scores/probabilities;
# passing the 0/1 predictions would collapse the ROC curve to a single threshold.
print('ROC AUC:', roc_auc_score(y_test, y_proba_lgb)) # output ROC AUC score of the lightgbm model

In [None]:
# LightGBM hyperparameter grid search
lgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [-1, 5, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 0.9, 1.0],
    # LightGBM only applies `subsample` (bagging) when the bagging frequency is non-zero; with the
    # default subsample_freq=0 every `subsample` value above would train the exact same model.
    'subsample_freq': [1],
    'min_child_samples': [5, 10, 20]
}

In [None]:
# Randomized search over lgb_param_grid: 20 sampled combinations, 10-fold cross validation,
# scored by ROC AUC, seeded so the sampled combinations are repeatable.
lgb_search_options = {
    "n_iter": 20,
    "cv": 10,
    "scoring": 'roc_auc',
    "n_jobs": -1,
    "verbose": 1,
    "random_state": 65,
}
lgb_grid = RandomizedSearchCV(
    estimator=lgb.LGBMClassifier(random_state=65),
    param_distributions=lgb_param_grid,
    **lgb_search_options,
)

lgb_grid.fit(X_train, y_train)  # run the search on the training data

print('Best LightGBM Params:', lgb_grid.best_params_)  # best hyperparameter combination found
print('Best LightGBM CV ROC AUC:', lgb_grid.best_score_)  # its mean cross-validated ROC AUC

## **LightGBM Distributed**

In [None]:
# LightGBM with distributed learning using Dask

# Dask client used for the distributed LightGBM work:
# 2 worker processes, 2 threads per worker and a 2GB memory limit per worker.
dask_client_options = {"n_workers": 2, "threads_per_worker": 2, "memory_limit": '2GB'}
client = Client(**dask_client_options)

print(client)  # summary of the client that was created

dashboard_url = client.dashboard_link
print(f"Dashboard link: {dashboard_url}")  # link to the Dask dashboard of this client

In [None]:
# Convert the training and testing data to dask arrays, each split row-wise into 2 partitions.

# Ceiling division: floor division would leave a third, one-row partition whenever the number
# of rows is odd.
train_chunk_rows = -(-len(X_train) // 2)
test_chunk_rows = -(-len(X_test) // 2)

# The features are converted to float64 explicitly: a frame that mixes boolean dummy columns with
# float columns is returned as an object-dtype array by `.values`, which is not a numeric matrix.
X_train_dask = da.from_array(X_train.to_numpy(dtype=np.float64), chunks=(train_chunk_rows, X_train.shape[1])) # X training set as a dask array with 2 partitions
y_train_dask = da.from_array(y_train.to_numpy(), chunks=train_chunk_rows) # y training set as a dask array, chunked like X_train_dask
X_test_dask = da.from_array(X_test.to_numpy(dtype=np.float64), chunks=(test_chunk_rows, X_test.shape[1])) # X testing set as a dask array with 2 partitions

print(f"Training data partitions: {X_train_dask.npartitions}") # output number of partitions in the dask array
print(f"Training data shape: {X_train_dask.shape}") # output shape of the dask array

In [None]:
# Train LightGBM with distributed learning
# A plain lgb.LGBMClassifier fitted on the pandas frames trains locally and never touches the Dask
# cluster. The Dask estimator is used instead, so training runs on the workers of `client` using
# the partitioned dask arrays.
# NOTE(review): n_jobs is not passed; LightGBM's Dask interface is expected to take the thread count
# from the Dask workers - confirm against the installed LightGBM version.
lgb_dist_model = lgb.DaskLGBMClassifier(
    client=client,
    random_state=65
)

# Cast to float64 so the estimator receives a numeric matrix whatever dtype the dask arrays were built with.
X_train_dist = X_train_dask.astype(np.float64)
X_test_dist = X_test_dask.astype(np.float64)

lgb_dist_model.fit(X_train_dist, y_train_dask) # fit on the partitioned heart disease training data

# Predictions come back as lazy dask arrays; compute() materialises them as numpy arrays.
y_pred_lgb_dist = lgb_dist_model.predict(X_test_dist).compute() # hard class predictions of the distributed LightGBM model
y_proba_lgb_dist = lgb_dist_model.predict_proba(X_test_dist).compute()[:, 1] # predicted probability of the positive class

# Regular (non-Dask) LGBMClassifier copy of the trained model, so later cells can use it locally
# even after the Dask client has been closed.
lgb_dist_clf = lgb_dist_model.to_local()

print("LightGBM Distributed Learning Results:") # header for results

print(confusion_matrix(y_test, y_pred_lgb_dist))  # output confusion matrix of the distributed lightgbm model

print(classification_report(y_test, y_pred_lgb_dist)) # output classification report of the distributed lightgbm model

# ROC AUC is computed from the positive-class probabilities rather than from the 0/1 predictions.
print('ROC AUC:', roc_auc_score(y_test, y_proba_lgb_dist)) # output ROC AUC score of the distributed lightgbm model

In [None]:
# dask-ml grid search over lgb_param_grid: 10-fold cross validation scored by ROC AUC.
# Each candidate LightGBM model is limited to a single thread (n_jobs=1).
lgb_dist_base_estimator = lgb.LGBMClassifier(random_state=65, n_jobs=1)
lgb_dist_grid = DaskGridSearchCV(
    estimator=lgb_dist_base_estimator,
    param_grid=lgb_param_grid,
    cv=10,
    scoring='roc_auc',
)

lgb_dist_grid.fit(X_train, y_train)  # run the grid search on the training data

print('Best LightGBM Distributed Params:', lgb_dist_grid.best_params_)  # best hyperparameter combination found

print('Best LightGBM Distributed CV ROC AUC:', lgb_dist_grid.best_score_)  # its cross-validated ROC AUC

In [None]:
client.close() # close the dask client now that the distributed training and hyperparameter search are finished

### **SHAP**

In [None]:
# SHAP explanation for XGBoost model

# Tree explainer for the trained XGBoost classifier, evaluated on the raw values of the test features.
xgb_test_values = X_test.values
explainer_xgb = shap.TreeExplainer(xgb_clf)
shap_values_xgb = explainer_xgb.shap_values(xgb_test_values)

# Draw the summary plot without showing it yet, so a title can be added to the current axes.
shap.summary_plot(shap_values_xgb, X_test, show=False)
plt.gca().set_title('SHAP Summary Plot for XGBoost')

plt.show()  # render the SHAP summary plot for the xgboost model

In [None]:
# SHAP explanation for non-distributed LightGBM

# Tree explainer for the trained non-distributed LightGBM classifier, evaluated on the raw values of the test features.
lgb_test_values = X_test.values
explainer_lgb = shap.TreeExplainer(lgb_clf)
shap_values_lgb = explainer_lgb.shap_values(lgb_test_values)

# Draw the summary plot without showing it yet, so a title can be added to the current axes.
shap.summary_plot(shap_values_lgb, X_test, show=False)
plt.gca().set_title('SHAP Summary Plot for LightGBM')

plt.show()  # render the SHAP summary plot for the non-distributed lightgbm model

In [None]:
# SHAP explanation for LightGBM with Dask Distributed Learning

# Tree explainer for the `lgb_dist_clf` model, evaluated on the raw values of the test features.
lgb_dist_test_values = X_test.values
explainer_lgb_dist = shap.TreeExplainer(lgb_dist_clf)
shap_values_lgb_dist = explainer_lgb_dist.shap_values(lgb_dist_test_values)

# Draw the summary plot without showing it yet, so a title can be added to the current axes.
shap.summary_plot(shap_values_lgb_dist, X_test, show=False)
plt.gca().set_title('SHAP Summary Plot for LightGBM with Dask')

plt.show()  # render the SHAP summary plot for the `lgb_dist_clf` model