1. Load the Required Libraries

In [42]:
import pandas as pd
import numpy as np
import dtale
import os
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
import category_encoders as ce
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer,SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,roc_auc_score,recall_score

2. Read the data from the dataset

In [43]:
df = pd.read_csv("D:\\ResearchPapers\\Datasets\\diabetes_dataset.csv", encoding='latin1')
d=dtale.show(df)
d.open_browser()

In [44]:
df.head()
df.drop(columns=['year','location','race:AfricanAmerican','race:Asian','race:Caucasian','race:Hispanic','race:Other'], inplace=True)

3. Handling missing values and replacing missing values with nan from numpy and replace with mean of all the other values

In [45]:
print(df.isnull().sum())
print(df.isna().sum())
dtale.show(df)

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
hbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64
gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
hbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64




In [46]:
# Check unique values in categorical columns
print(df['smoking_history'].unique())
print(df['gender'].unique())    

['never' 'not current' 'current' 'No Info' 'ever' 'former']
['Female' 'Male' 'Other']


In [47]:
print(df['age'].describe())       # Check min/max age
print(df['bmi'].min())            # Check if BMI is 0 or negative

count    100000.000000
mean         41.885856
std          22.516840
min           0.080000
25%          24.000000
50%          43.000000
75%          60.000000
max          80.000000
Name: age, dtype: float64
10.01


4. Encoding the categorical data

In [48]:
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])
dtale.show(df)



In [49]:
le = LabelEncoder()
df['smoking_history'] = le.fit_transform(df['smoking_history'].astype(str))
dtale.show(df)



4. Split the attribites into dependent and independent attributes

In [50]:
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values
dtale.show(Y, ignore_duplicate=True)



5. Splitting the dataset intro training set and test set

In [58]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

6. Train the Random Forest Model

In [59]:
rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)
rf.fit(X_train, Y_train)

Kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(rf, X_train, Y_train, cv=Kfold, scoring='accuracy')

Y_pred = rf.predict(X_test)

In [60]:
rf.score(X_test, Y_test)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18294
           1       1.00      0.66      0.79      1706

    accuracy                           0.97     20000
   macro avg       0.98      0.83      0.89     20000
weighted avg       0.97      0.97      0.97     20000



In [61]:
rf2 = RandomForestClassifier(
    
     n_estimators=1000,
     criterion= 'entropy',
     min_samples_split= 10,
    random_state=42)

8. Retraining the model using Resampled data

In [62]:
smote_enn = SMOTE(sampling_strategy='minority', random_state=42)
X_train_res, Y_train_res = smote_enn.fit_resample(X_train, Y_train)
print(pd.Series(Y_train_res).value_counts())

0    73206
1    73206
Name: count, dtype: int64


In [63]:
model = RandomForestClassifier( random_state= 42)
model.fit(X_train_res, Y_train_res) 
Kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_res, Y_train_res, cv=Kfold, scoring='accuracy')

In [64]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]
sensitivity = recall_score(Y_test, y_pred, pos_label=1)
roc_auc = roc_auc_score(Y_test, y_proba)

tn, fp, fn, tp = confusion_matrix(Y_test, y_pred).ravel()
specificity = tn / (tn + fp)

report = classification_report(Y_test, y_pred, output_dict=True)
df_report = pd.DataFrame(report).transpose()

pd.set_option("display.precision", 4)
print(df_report)
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"Sensitivity: {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(Y_test, y_pred)}")
print(f"Mean Accuracy:{scores.mean():.4f} (+/- {scores.std():.4f})")

              precision  recall  f1-score     support
0                0.9730  0.9921    0.9825  18294.0000
1                0.8930  0.7046    0.7877   1706.0000
accuracy         0.9676  0.9676    0.9676      0.9676
macro avg        0.9330  0.8484    0.8851  20000.0000
weighted avg     0.9662  0.9676    0.9658  20000.0000
ROC-AUC: 0.9648
Sensitivity: 0.7046
Specificity: 0.9921
Confusion Matrix:
[[18150   144]
 [  504  1202]]
Mean Accuracy:0.9820 (+/- 0.0013)
