In [None]:
%pip install pandas pd numpy scikit-learn matplotlib seaborn kagglehub shap xgboost lightgbm

In [None]:
import numpy as np
import pandas as pd

# Data Viz
import matplotlib.pyplot as plt
import seaborn as sns

import kagglehub

# The Gradient Boosting Models + SHAP
import shap
import xgboost as xgb
import lightgbm as lgb

# Scikit-learn imports
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [None]:
# Fetch the "redwankarimsony/heart-disease-data" dataset through kagglehub and keep
# the returned location, which the next cell uses as the directory holding the CSV
path = kagglehub.dataset_download("redwankarimsony/heart-disease-data")
print("Path to dataset files:", path)

In [None]:
# Build the full path to the CSV inside the download location.
# Plain string concatenation: assumes `path` is a str — TODO confirm
heart_data = path + "/heart_disease_uci.csv"

In [None]:
# Load the CSV into a pandas DataFrame; every later cell works on this frame
heart_disease_df = pd.read_csv(heart_data)

In [None]:
# Preview the first rows to confirm the dataset was loaded successfully
heart_disease_df.head()

In [None]:
# (n_rows, n_columns) of the heart disease dataframe
heart_disease_df.shape

In [None]:
# Collapse the target to a binary label: values greater than 0 become 1, everything else 0
heart_disease_df["num"] = heart_disease_df["num"].gt(0).astype("int64")

In [None]:
# Preview the first rows again to inspect the current state of the frame
heart_disease_df.head()

In [None]:
# Print the frame's summary (columns, dtypes, non-null counts)
heart_disease_df.info()

In [None]:
# Remove the "id" column, then preview the resulting frame
heart_disease_df = heart_disease_df.drop("id", axis=1)
heart_disease_df.head()

In [None]:
# One bar chart per object-dtype column, showing how often each category occurs
categorical_cols = list(heart_disease_df.select_dtypes(include=['object']).columns)

for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(8, 4))

    category_counts = heart_disease_df[col].value_counts()
    category_counts.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)

    ax.set_title(f'Distribution of {col}', fontsize=14)
    ax.set_xlabel(col, fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.grid(axis='y', alpha=0.5)

    plt.show()

In [None]:
# Collect the names of the object-dtype columns and display the list
categorical_cols = list(heart_disease_df.select_dtypes(include=['object']).columns)
categorical_cols

In [None]:
# One-hot encode every column listed in `categorical_cols`; drop_first=True omits
# the first level of each column so that level acts as the implicit baseline.
# NOTE(review): with the default dummy_na=False, a missing value in a categorical
# column produces all-zero dummies, i.e. it becomes indistinguishable from the
# baseline level, and the null handling in later cells never sees it — confirm
# this is the intended treatment of missing categories.
heart_disease_df = pd.get_dummies(heart_disease_df, columns=categorical_cols, drop_first=True)

In [None]:
# Count the missing values remaining in each column
heart_disease_df.isnull().sum()

In [None]:
# Columns whose share of missing values exceeds this fraction are dropped;
# every remaining column has its gaps imputed.
missing_threshold = 0.5

# NOTE(review): the fill values are computed on the full dataset, before the
# train/test split further down — confirm this is acceptable for the evaluation.

# Iterate over a snapshot of the labels because the frame is rebound whenever a
# column is dropped.
for col in list(heart_disease_df.columns):
    missing_fraction = heart_disease_df[col].isnull().mean()

    if missing_fraction > missing_threshold:
        # Too sparse to impute: remove the column entirely.
        heart_disease_df = heart_disease_df.drop(columns=[col])
        continue

    # float64/int64 columns are filled with the median, all others with the mode.
    if heart_disease_df[col].dtype in [np.float64, np.int64]:
        fill_value = heart_disease_df[col].median()
    else:
        fill_value = heart_disease_df[col].mode()[0]

    heart_disease_df[col] = heart_disease_df[col].fillna(fill_value)

# Only the last expression of a cell is rendered automatically, so the null
# summary has to be printed explicitly or it is silently discarded.
print(heart_disease_df.isnull().sum())
heart_disease_df.head()

In [None]:
# Standardize every float64/int64 column except the target "num"
numerical_cols = list(
    heart_disease_df.select_dtypes(include=[np.float64, np.int64]).columns
)
numerical_cols.remove("num")

# NOTE(review): the scaler is fit on the whole frame, i.e. before the train/test
# split performed in a later cell, so test rows contribute to the mean and
# variance — confirm this is acceptable.
scaler = StandardScaler()
scaler.fit(heart_disease_df[numerical_cols])
heart_disease_df[numerical_cols] = scaler.transform(heart_disease_df[numerical_cols])

heart_disease_df.head()

In [None]:
# Pairwise correlations between all columns of the frame, drawn as an annotated heatmap
correlation_matrix = heart_disease_df.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    ax=ax,
)
ax.set_title('Correlation Heatmap of Heart Disease Features', fontsize=16)
plt.show()

In [None]:
# Separate the target column "num" from the feature columns
y = heart_disease_df["num"]
X = heart_disease_df.drop(columns="num")

In [None]:
# 80/20 train/test split with a fixed seed. Stratifying on the target keeps the
# class proportions the same in both partitions, so the held-out metrics are not
# skewed by an unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

### **XGBoost**

In [None]:
# Baseline XGBoost classifier with default hyperparameters and a fixed seed.
# `use_label_encoder` is deliberately not passed: it is an obsolete option that
# current XGBoost releases no longer use (they warn about the unused parameter),
# and the target here is already encoded as 0/1.
xgb_clf = xgb.XGBClassifier(eval_metric='logloss', random_state=42)

xgb_clf.fit(X_train, y_train)

In [None]:
# Hard class labels feed the confusion matrix and the classification report
y_pred_xgb = xgb_clf.predict(X_test)

# ROC AUC measures ranking quality, so it must be scored with the predicted
# probability of the positive class; hard 0/1 labels collapse the curve to a
# single operating point and misstate the AUC.
y_proba_xgb = xgb_clf.predict_proba(X_test)[:, 1]

print(confusion_matrix(y_test, y_pred_xgb))

print(classification_report(y_test, y_pred_xgb))

print('ROC AUC:', roc_auc_score(y_test, y_proba_xgb))

#### **XGBoost Hyperparameter Selection**


In [None]:
# Candidate values for the XGBoost grid search: every combination of the lists
# below is evaluated (3 * 3 * 3 * 2 = 54 candidates)
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

In [None]:
# Exhaustive 5-fold cross-validated search over `xgb_param_grid`, ranked by ROC AUC
# and run on all available cores. The obsolete `use_label_encoder` argument is
# omitted: current XGBoost releases ignore it and emit a warning for every fit.
xgb_grid = GridSearchCV(xgb.XGBClassifier(eval_metric='logloss', random_state=42),
                      xgb_param_grid,
                      cv=5,
                      scoring='roc_auc',
                      n_jobs=-1,
                      verbose=1)

xgb_grid.fit(X_train, y_train)

print('Best XGBoost Params:', xgb_grid.best_params_)
print('Best XGBoost CV ROC AUC:', xgb_grid.best_score_)

### **LightGBM**

In [None]:
# Baseline LightGBM classifier: only the random seed is set explicitly
lgb_clf = lgb.LGBMClassifier(random_state=42)

# Train on the training partition
lgb_clf.fit(X_train, y_train)

In [None]:
# Hard class labels feed the confusion matrix and the classification report
y_pred_lgb = lgb_clf.predict(X_test)

# ROC AUC measures ranking quality, so it must be scored with the predicted
# probability of the positive class; hard 0/1 labels collapse the curve to a
# single operating point and misstate the AUC.
y_proba_lgb = lgb_clf.predict_proba(X_test)[:, 1]

print(confusion_matrix(y_test, y_pred_lgb))

print(classification_report(y_test, y_pred_lgb))

print('ROC AUC:', roc_auc_score(y_test, y_proba_lgb))

#### **LightGBM Hyperparameter Selection**


In [None]:
# LightGBM hyperparameter grid search
# In LightGBM, `subsample` (bagging_fraction) only takes effect when bagging is
# enabled through a non-zero `subsample_freq` (bagging_freq), whose default is 0.
# Without the single-valued `subsample_freq` entry below, the two `subsample`
# settings would train identical models and half of the search would be wasted.
# The number of candidates is unchanged (3 * 3 * 3 * 2 * 1 = 54).
lgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'subsample_freq': [1]
}

In [None]:
# Exhaustive 5-fold cross-validated search over `lgb_param_grid`, ranked by ROC AUC
# and run on all available cores
lgb_grid = GridSearchCV(
    estimator=lgb.LGBMClassifier(random_state=42),
    param_grid=lgb_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
lgb_grid.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
print('Best LightGBM Params:', lgb_grid.best_params_)
print('Best LightGBM CV ROC AUC:', lgb_grid.best_score_)

### **SHAP**

In [None]:
# Explain the baseline XGBoost model on the test set with a tree explainer
explainer_xgb = shap.TreeExplainer(xgb_clf)
shap_values_xgb = explainer_xgb.shap_values(X_test.values)

# show=False keeps the figure open so a title can be attached before it is rendered
shap.summary_plot(shap_values_xgb, X_test, show=False)
plt.gca().set_title('SHAP Summary Plot for XGBoost')
plt.show()

In [None]:
# SHAP explanation for LightGBM
explainer_lgb = shap.TreeExplainer(lgb_clf)
shap_values_lgb = explainer_lgb.shap_values(X_test.values)

# Depending on the SHAP version, a binary LightGBM model may yield one array, a
# per-class list of arrays, or an array with a trailing class axis. Plot the
# positive-class attributions in every case so the figure is always the same
# per-sample summary. `shap_values_lgb` itself is left untouched because later
# cells perform their own format handling on it.
if isinstance(shap_values_lgb, list):
    shap_values_lgb_positive = np.asarray(shap_values_lgb[1])
else:
    shap_values_lgb_positive = np.asarray(shap_values_lgb)
    if shap_values_lgb_positive.ndim == 3:
        shap_values_lgb_positive = shap_values_lgb_positive[:, :, 1]

# show=False keeps the figure open so a title can be attached before it is rendered
shap.summary_plot(shap_values_lgb_positive, X_test, show=False)

plt.title('SHAP Summary Plot for LightGBM')
plt.show()

In [None]:
# Comparative Analysis: LightGBM (Tree-Based) vs Logistic Regression (Linear Model)
# (the printed banner previously misspelled "explanation")
print("Comparing SHAP explanation of the LightGBM vs a non-tree-based LR Model")

# Train Logistic Regression for comparison; max_iter is raised above the library
# default to give the solver more room to converge
log_reg_clf = LogisticRegression(max_iter=1000, random_state=42)
log_reg_clf.fit(X_train, y_train)

# Hard labels for accuracy/F1, positive-class probabilities for ROC AUC
y_pred_log_reg = log_reg_clf.predict(X_test)
y_proba_log_reg = log_reg_clf.predict_proba(X_test)[:, 1]

print("\nLogistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log_reg):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_log_reg):.4f}")
# Reported as well so the linear model can be compared with the boosted models,
# which are evaluated on ROC AUC
print(f"ROC AUC: {roc_auc_score(y_test, y_proba_log_reg):.4f}")

In [None]:
# SHAP explanations for Logistic Regression, plus normalization of both models'
# SHAP outputs to a 2-D (n_samples, n_features) array that can be plotted and
# aggregated per feature


def _positive_class_shap(raw_values):
    """Return the positive-class SHAP values as a NumPy array.

    Handles the three layouts an explainer may produce, depending on the model
    and the SHAP version:
      * a list with one array per class  -> the second entry (or the only one),
      * an array with a trailing class axis (3-D) -> the slice for class 1
        (or the only slice),
      * a plain array -> returned as is.
    """
    if isinstance(raw_values, list):
        selected = raw_values[1] if len(raw_values) > 1 else raw_values[0]
        return np.asarray(selected)

    values = np.asarray(raw_values)
    if values.ndim == 3:
        class_index = 1 if values.shape[2] > 1 else 0
        return values[:, :, class_index]
    return values


explainer_log_reg = shap.LinearExplainer(log_reg_clf, X_train)
shap_values_log_reg = explainer_log_reg.shap_values(X_test)

shap_values_lgb_plot = _positive_class_shap(shap_values_lgb)
shap_values_log_reg_plot = _positive_class_shap(shap_values_log_reg)

# Fallback shape handling for the Logistic Regression values, kept from the
# original notebook.
# NOTE(review): both tiling branches replicate a single vector/column instead of
# producing genuine per-sample, per-feature attributions — confirm they are
# still wanted.
if shap_values_log_reg_plot.ndim == 1:

    if shap_values_log_reg_plot.shape[0] == X_test.shape[1]:
        # One value per feature: repeat it for every test row
        shap_values_log_reg_plot = np.tile(shap_values_log_reg_plot, (X_test.shape[0], 1))

    else:
        shap_values_log_reg_plot = shap_values_log_reg_plot.reshape(-1, 1)

elif shap_values_log_reg_plot.ndim == 2 and shap_values_log_reg_plot.shape[1] == 1 and X_test.shape[1] > 1:
    # Single column: repeat it across all features
    shap_values_log_reg_plot = np.tile(shap_values_log_reg_plot, (1, X_test.shape[1]))

shap_values_log_reg_plot = shap_values_log_reg_plot.astype(float, copy=False)

# Fail here with a clear message rather than inside the plotting code
for model_label, values in (
    ("LightGBM", shap_values_lgb_plot),
    ("Logistic Regression", shap_values_log_reg_plot),
):
    if values.shape != X_test.shape:
        raise ValueError(
            f"{model_label} SHAP values have shape {values.shape}, "
            f"expected {X_test.shape} to match X_test"
        )

In [None]:
# Side-by-side SHAP comparison
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# summary_plot draws on the current axes. plot_size=None stops it from resizing
# the shared figure on every call, which would otherwise override the figsize
# chosen above and distort the two-panel layout.
plt.sca(axes[0])
shap.summary_plot(shap_values_lgb_plot, X_test, show=False, plot_size=None)

axes[0].set_title('LightGBM (Tree-Based)\nCaptures Non-Linear & Interaction Effects',fontsize=14, fontweight='bold', pad=20)

plt.sca(axes[1])
shap.summary_plot(shap_values_log_reg_plot, X_test, show=False, plot_size=None)

axes[1].set_title('Logistic Regression (Linear)\nLinear Additive Effects Only',fontsize=14, fontweight='bold', pad=20)

plt.tight_layout()
plt.show()

In [None]:
# Feature importance comparison and key differences
# Importance of a feature = mean absolute SHAP value over the test rows
lgb_importance = np.abs(shap_values_lgb_plot).mean(axis=0)
log_reg_importance = np.abs(shap_values_log_reg_plot).mean(axis=0)

# Maximum number of features to show, ranked by LightGBM importance
top_n = 15

comparison_df = (
    pd.DataFrame({
        'Feature': X_test.columns,
        'LightGBM': lgb_importance,
        'LogReg': log_reg_importance
    })
    .sort_values('LightGBM', ascending=False)
    .head(top_n)
)

fig, ax = plt.subplots(figsize=(12, 8))
# Derive the bar positions from the rows actually selected: head(top_n) returns
# fewer rows when the model has fewer than top_n features, and a hard-coded
# range would then no longer match the bar data.
x_pos = np.arange(len(comparison_df))
width = 0.35

# Paired horizontal bars: LightGBM just above, Logistic Regression just below each tick
ax.barh(x_pos - width/2, comparison_df['LightGBM'], width, label='LightGBM', color='#4ECDC4', alpha=0.8)
ax.barh(x_pos + width/2, comparison_df['LogReg'], width, label='Logistic Regression', color='#FF6B6B', alpha=0.8)

ax.set_xlabel('Mean |SHAP Value|', fontweight='bold', fontsize=12)
ax.set_ylabel('Features', fontweight='bold', fontsize=12)
ax.set_title('Feature Importance: Tree-Based vs Linear Model', fontsize=14, fontweight='bold')
ax.set_yticks(x_pos)
ax.set_yticklabels(comparison_df['Feature'])
ax.legend()
# Put the most important feature at the top of the chart
ax.invert_yaxis()
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()