Install necessary packages

In [None]:
!pip install xgboost shap scikit-learn pandas matplotlib seaborn

Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve)
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

Load and inspect data

In [None]:
# Read the diabetes CSV straight from its hosted location
DATA_URL = 'https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv'
df = pd.read_csv(DATA_URL)

# Plain-text preview of the first rows, followed by a blank separator line
preview = df.head().to_string(index=False)
print(preview,'\n')

# Column dtypes / non-null counts, then summary statistics as the cell's output
df.info()
df.describe()

Clean and impute data

In [None]:
# Zeros in these measurement columns are treated as missing values, so turn them into NaN
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zero] = df[cols_with_zero].replace(0, np.nan)

# Show NaN count (printed explicitly: a bare expression in the middle of a cell is not displayed)
print(df.isnull().sum().to_string(), '\n')

# Impute missing values using KNN Imputer.
# Only the feature columns are handed to the imputer: if 'Outcome' were included it would
# take part in the neighbour distances and leak the label into the imputed feature values
# (and it would come back as float instead of its original dtype).
# NOTE(review): the imputer is still fit on every row, i.e. before the train/test split
# performed later in the notebook; fit it on the training rows only for a strict evaluation.
feature_cols = df.columns.drop('Outcome')
imputer = KNNImputer(n_neighbors=5)
imputed_features = pd.DataFrame(imputer.fit_transform(df[feature_cols]),
                                columns=feature_cols,
                                index=df.index)

# Re-attach the untouched label and restore the original column order
df_imputed = imputed_features.assign(Outcome=df['Outcome'])[df.columns]

Feature scaling

In [None]:
# Separate features and target by name, taken from the frame that is actually scaled
# (not by assuming 'Outcome' is the last column of another frame).
features = df_imputed.drop(columns='Outcome')
y = df_imputed['Outcome'].astype(int)  # class labels as integers, not floats

# Scale features except target: zero mean / unit variance per column.
# NOTE(review): the scaler is fit on all rows, before the train/test split that follows;
# fit it on the training rows only if a scale-sensitive model is used.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# Keep the original column names and row index so X stays aligned with y
X = pd.DataFrame(X_scaled, columns=features.columns, index=features.index)

Train-test split

In [None]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


XGBoost Classifier

In [None]:
# Gradient-boosted tree classifier evaluated with log-loss; the seed is fixed for reproducibility.
# `use_label_encoder` is intentionally not passed: the option is deprecated/removed in current
# XGBoost releases, where supplying it only produces an unused-parameter warning.
model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

Risk Stratification Function

In [None]:
def stratify_risk(prob, low=0.3, high=0.7):
    """Map a predicted probability to a coarse risk band.

    Parameters
    ----------
    prob : float
        Predicted probability of the positive class; must lie in [0, 1].
    low : float, default 0.3
        Probabilities strictly below this value are labelled "Low".
    high : float, default 0.7
        Probabilities from ``low`` up to (but excluding) this value are
        labelled "Medium"; anything at or above it is labelled "High".

    Returns
    -------
    str
        One of "Low", "Medium" or "High".

    Raises
    ------
    ValueError
        If the thresholds are not ordered within [0, 1], or if ``prob`` is
        NaN or outside [0, 1].
    """
    if not 0.0 <= low <= high <= 1.0:
        raise ValueError(
            f"thresholds must satisfy 0 <= low <= high <= 1, got low={low!r}, high={high!r}"
        )
    # A NaN fails every comparison, so this single check rejects NaN as well as
    # out-of-range values instead of letting them fall through to "High".
    if not 0.0 <= prob <= 1.0:
        raise ValueError(f"prob must be a probability in [0, 1], got {prob!r}")

    if prob < low:
        return "Low"
    if prob < high:
        return "Medium"
    return "High"


SHAP explainability

In [None]:
# Build an explainer for the fitted model and compute SHAP values for the held-out rows
explainer = shap.Explainer(model)
shap_values = explainer(X_test)

# Global view: feature contributions across the whole test set
shap.plots.beeswarm(shap_values)

# Local view: contribution breakdown for the first test-set row
first_explanation = shap_values[0]
shap.plots.waterfall(first_explanation)

Evaluation Metrics and Visualization

In [None]:
# Hard class predictions and positive-class probabilities for the held-out rows
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Label-based scores share one call shape, so report them from a small table;
# AUC-ROC is the odd one out because it is computed from the probabilities.
label_scorers = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for caption, scorer in label_scorers:
    print(caption, scorer(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_proba))

# Confusion matrix
cm_ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
cm_ax.set_title("Confusion Matrix")
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
plt.show()

# ROC Curve, with the diagonal of a random classifier as a reference line
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, label='XGBoost')
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
roc_ax.grid()
plt.show()


Risk output table

In [None]:
# Per-row output table: the held-out features plus the model's probability,
# its hard prediction, and the risk band derived from the probability.
results = X_test.assign(Probability=y_proba, Predicted=y_pred)
results['Risk Level'] = results['Probability'].map(stratify_risk)
results.head()