1. Load the Required Libraries

In [2]:
import pandas as pd
import numpy as np
import dtale
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier as LightGradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,roc_auc_score,recall_score

2. Read the data from the dataset

In [None]:
df = pd.read_csv("../../Datasets/diabetes_prediction_dataset.csv", encoding='latin1')
print(f"Dataset shape: {df.shape}")
print("\nData types:\n", df.dtypes)
print("\nMissing values:\n", df.isnull().sum())

# Visualize class distribution
plt.figure(figsize=(8,5))
sns.countplot(x='diabetes', data=df)
plt.title('Class Distribution')
plt.show()



3. Handling missing values and replacing missing values with nan from numpy and replace with mean of all the other values

In [None]:
print(df.isnull().sum())
print(df.isna().sum())
dtale.show(df)

4. Encoding categorical values

In [None]:
gender_mapping = {'Female': 0, 'Male': 1, 'Other': 2}
df['gender'] = df['gender'].map(gender_mapping)
dtale.show(df)

In [None]:
smoking_mapping = {'never': 0, 'not current': 1, 'current': 2, 
                  'No Info': 3, 'ever': 4, 'former': 5}
df['smoking_history'] = df['smoking_history'].map(smoking_mapping)
dtale.show(df)

5. Feature Engineering

In [None]:
# Add feature correlation analysis
plt.figure(figsize=(12,8))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()

6. Split the attribites into dependent and independent attributes

In [None]:
X = df.iloc[:, 1:]
Y = df.iloc[:, 0]
dtale.show(Y, ignore_duplicate=True)



7. Splitting the dataset intro training set and test set

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

8. Train the Light Gradient Model

In [None]:
lgb = LightGradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
lgb.fit(X_train, Y_train)
Kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(lgb, X_train, Y_train, cv=Kfold, scoring='accuracy')
Y_pred = lgb.predict(X_test)
# Display evaluation matrix (classification report)
report_train = classification_report(Y_test, Y_train, output_dict=True)
df_report_train = pd.DataFrame(report_train).transpose()
print(df_report_train)

[LightGBM] [Info] Number of positive: 28135, number of negative: 174809
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006618 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 155
[LightGBM] [Info] Number of data points in the train set: 202944, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138634 -> initscore=-1.826680
[LightGBM] [Info] Start training from score -1.826680
[LightGBM] [Info] Number of positive: 25285, number of negative: 157364
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006702 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 155
[LightGBM] [Info] Number of data points in the train set: 182649, number of used features: 11
[LightGBM] [Info


X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 25412, number of negative: 157237
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006380 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154
[LightGBM] [Info] Number of data points in the train set: 182649, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.139130 -> initscore=-1.822533
[LightGBM] [Info] Start training from score -1.822533
[LightGBM] [Info] Number of positive: 25355, number of negative: 157294



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005108 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154
[LightGBM] [Info] Number of data points in the train set: 182649, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138818 -> initscore=-1.825141
[LightGBM] [Info] Start training from score -1.825141



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 25345, number of negative: 157304
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005957 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 155
[LightGBM] [Info] Number of data points in the train set: 182649, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138763 -> initscore=-1.825599
[LightGBM] [Info] Start training from score -1.825599



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 25324, number of negative: 157326
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 155
[LightGBM] [Info] Number of data points in the train set: 182650, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138648 -> initscore=-1.826568
[LightGBM] [Info] Start training from score -1.826568
[LightGBM] [Info] Number of positive: 25363, number of negative: 157287
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 155
[LightGBM] [Info] Number of data points in the train set: 182650, number of used features: 11
[LightGBM] [Info


X does not have valid feature names, but LGBMClassifier was fitted with feature names






X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 25322, number of negative: 157328
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006446 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 155
[LightGBM] [Info] Number of data points in the train set: 182650, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138637 -> initscore=-1.826659
[LightGBM] [Info] Start training from score -1.826659
[LightGBM] [Info] Number of positive: 25255, number of negative: 157395



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006515 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154
[LightGBM] [Info] Number of data points in the train set: 182650, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138270 -> initscore=-1.829734
[LightGBM] [Info] Start training from score -1.829734



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 25311, number of negative: 157339
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006447 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 155
[LightGBM] [Info] Number of data points in the train set: 182650, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138577 -> initscore=-1.827164
[LightGBM] [Info] Start training from score -1.827164



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 25243, number of negative: 157407
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006665 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 155
[LightGBM] [Info] Number of data points in the train set: 182650, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138204 -> initscore=-1.830286
[LightGBM] [Info] Start training from score -1.830286



X does not have valid feature names, but LGBMClassifier was fitted with feature names


X does not have valid feature names, but LGBMClassifier was fitted with feature names



In [14]:
rf2 = LightGradientBoostingClassifier(
    
    n_estimators=1000,
    criterion= 'entropy',
    min_samples_split= 10,
    random_state=42)

8. Retraining the model using Resampled data

In [15]:
smote_enn = SMOTE(sampling_strategy='minority', random_state=42)
X_train_res, Y_train_res = smote_enn.fit_resample(X_train, Y_train)
print(pd.Series(Y_train_res).value_counts())

0.0    174809
1.0    174809
Name: count, dtype: int64


In [None]:
model = LightGradientBoostingClassifier( **rf2)
model.fit(X_train_res, Y_train_res) 
Kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_res, Y_train_res, cv=Kfold, scoring='accuracy')

[LightGBM] [Info] Number of positive: 174809, number of negative: 174809
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011407 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1439
[LightGBM] [Info] Number of data points in the train set: 349618, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 157126, number of negative: 157530
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009381 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1444
[LightGBM] [Info] Number of data points in the train set: 314656, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499358 -> initscore=-0.002568
[LightGBM] [Info] Start training from score -0.002568



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 157501, number of negative: 157155
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017245 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1440
[LightGBM] [Info] Number of data points in the train set: 314656, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500550 -> initscore=0.002199
[LightGBM] [Info] Start training from score 0.002199



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 157316, number of negative: 157340
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1442
[LightGBM] [Info] Number of data points in the train set: 314656, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499962 -> initscore=-0.000153
[LightGBM] [Info] Start training from score -0.000153



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 157287, number of negative: 157369
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007477 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1443
[LightGBM] [Info] Number of data points in the train set: 314656, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499870 -> initscore=-0.000521
[LightGBM] [Info] Start training from score -0.000521



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 157151, number of negative: 157505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016378 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1436
[LightGBM] [Info] Number of data points in the train set: 314656, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499437 -> initscore=-0.002250
[LightGBM] [Info] Start training from score -0.002250



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 157313, number of negative: 157343
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015350 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1440
[LightGBM] [Info] Number of data points in the train set: 314656, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499952 -> initscore=-0.000191
[LightGBM] [Info] Start training from score -0.000191



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 157465, number of negative: 157191
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007813 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1522
[LightGBM] [Info] Number of data points in the train set: 314656, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500435 -> initscore=0.001742
[LightGBM] [Info] Start training from score 0.001742



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 157398, number of negative: 157258
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007401 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1436
[LightGBM] [Info] Number of data points in the train set: 314656, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500222 -> initscore=0.000890
[LightGBM] [Info] Start training from score 0.000890



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 157330, number of negative: 157327
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010003 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1440
[LightGBM] [Info] Number of data points in the train set: 314657, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500005 -> initscore=0.000019
[LightGBM] [Info] Start training from score 0.000019



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[LightGBM] [Info] Number of positive: 157394, number of negative: 157263
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011665 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1445
[LightGBM] [Info] Number of data points in the train set: 314657, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500208 -> initscore=0.000833
[LightGBM] [Info] Start training from score 0.000833



X does not have valid feature names, but LGBMClassifier was fitted with feature names



In [None]:
importances = model.feature_importances_
features = df.columns[:-1]
plt.barh(features, importances)
plt.show()

10. Evaluate the retrained model

In [None]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]
sensitivity = recall_score(Y_test, y_pred, pos_label=1)
roc_auc = roc_auc_score(Y_test, y_proba)
cm = confusion_matrix(Y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)


report = classification_report(Y_test, y_pred, output_dict=True)
df_report = pd.DataFrame(report).transpose()

pd.set_option("display.precision", 4)
print(df_report)
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"Sensitivity: {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(Y_test, y_pred)}")
print(f"Mean Accuracy:{scores.mean():.4f} (+/- {scores.std():.4f})")# Plot confusion matrix
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
    
    # Plot ROC curve
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(model, X_test, Y_test)
plt.title('ROC Curve')
plt.show()


X does not have valid feature names, but LGBMClassifier was fitted with feature names


X does not have valid feature names, but LGBMClassifier was fitted with feature names



              precision  recall  f1-score     support
0.0              0.8925  0.9430    0.9170  43525.0000
1.0              0.4773  0.3141    0.3789   7211.0000
accuracy         0.8536  0.8536    0.8536      0.8536
macro avg        0.6849  0.6286    0.6480  50736.0000
weighted avg     0.8335  0.8536    0.8406  50736.0000
ROC-AUC: 0.8179
Sensitivity: 0.3141
Specificity: 0.9430
Confusion Matrix:
[[41045  2480]
 [ 4946  2265]]
Mean Accuracy:0.8926 (+/- 0.0017)


ERROR	Thread(Thread-281 (process_request_thread)) dtale.utils:utils.py:handle_error()- Exception occurred while processing request: object of type 'NoneType' has no len()
 Traceback (most recent call last):
   File "C:\Users\Frank\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\LocalCache\local-packages\Python313\site-packages\dtale\views.py", line 120, in _handle_exceptions
    return func(*args, **kwargs)
   File "C:\Users\Frank\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\LocalCache\local-packages\Python313\site-packages\dtale\views.py", line 1595, in get_processes
    [_load_process(data_id) for data_id in global_state.keys()],
     ~~~~~~~~~~~~~^^^^^^^^^
   File "C:\Users\Frank\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\LocalCache\local-packages\Python313\site-packages\dtale\views.py", line 1580, in _load_process
    rows=len(data),
         ~~~^^^^^^
 TypeError: object of type 'NoneType' has no 

11.Feature Importance Visualization

In [None]:
# Feature importance analysis
importance = model.feature_importances_
features = df.columns[:-1]

# Create DataFrame and sort
feature_importance = pd.DataFrame({'Feature': features, 'Importance': importance})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

# Plot
plt.figure(figsize=(10,6))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

12. Apply Shapley Additive Technique to the data

In [None]:
#get the SHAP values
explainer = shap.Explainer(model)
shap_values = explainer(X_train_res)
print(shap_values.shape)

In [None]:
#Waterfall plot for the first observation
shap.plots.waterfall(shap_values[0])

In [None]:
shap.summary_plot(shap_values, X_train_res)