In [1]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load dataset
# "CKD.csv" is a relative path, so it is resolved against the kernel's
# current working directory.
dataset = pd.read_csv(filepath_or_buffer="CKD.csv")

# Bare expression: show the loaded frame as the cell output.
dataset

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [3]:
# One-hot encode categorical variables
# drop_first=True drops the first level of every categorical column, and
# dtype=int emits the indicator columns as 0/1 integers. The result
# replaces `dataset`, so later cells see only the encoded frame.
dataset = pd.get_dummies(
    data=dataset,
    drop_first=True,
    dtype=int,
)

# Bare expression: show the encoded frame as the cell output.
dataset

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,0,0,0,0,0,0,1,1,0,1
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,1,0,0,0,0,0,1,0,0,1
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,1,0,0,0,0,0,1,0,0,1
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,1,0,0,0,0,0,1,0,1,1
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,1,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,1,0,0,0,0,0,1,0,0,1
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,1,0,0,1,1,0,1,0,1,1
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,1,0,0,1,1,0,0,0,0,1
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,1,0,0,1,1,0,1,0,1,1


In [4]:
# Count how many rows fall into each value of the encoded target column.
dataset["classification_yes"].value_counts()

classification_yes
1    249
0    150
Name: count, dtype: int64

In [5]:
# Independent and dependent variables
# Predictor columns, in the order they are fed to the model.
feature_columns = [
    "age", "bp", "al", "su", "sc", "sod", "pot", "hrmo",
    "pc_normal", "pcc_present", "ba_present",
    "htn_yes", "dm_yes", "cad_yes", "appet_yes", "pe_yes", "ane_yes",
]
independent = dataset[feature_columns]

# Double brackets keep the target as a one-column DataFrame, not a Series.
dependent = dataset[["classification_yes"]]

In [6]:
# Sanity check: (number of rows, number of selected feature columns).
independent.shape

(399, 17)

In [7]:
# Display the one-column target frame.
dependent

Unnamed: 0,classification_yes
0,1
1,1
2,1
3,1
4,1
...,...
394,1
395,1
396,1
397,1


In [8]:
# Train-test split
from sklearn.model_selection import train_test_split

# Hold out one third of the rows for testing; random_state=0 makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    independent,
    dependent,
    test_size=1 / 3,
    random_state=0,
)

In [9]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same
# fitted transform to both splits so the test set never influences the
# scaling statistics.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [10]:
# Random Forest and Grid Search
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid: 3 * 3 * 2 * 2 * 2 = 72 candidate combinations.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Grid Search setup: candidates are ranked by weighted F1. refit=True
# retrains the best candidate on all of X_train, so `grid` itself can be
# used for predict / predict_proba afterwards. n_jobs=-1 uses every core.
grid = GridSearchCV(RandomForestClassifier(random_state=0), param_grid, refit=True,
                    verbose=3, n_jobs=-1, scoring='f1_weighted')

# Fit the model.
# Y_train is a one-column DataFrame (the target was selected with double
# brackets). scikit-learn expects a 1-D target of shape (n_samples,) and
# emits a DataConversionWarning for a column vector, so flatten it here.
grid.fit(X_train, Y_train.to_numpy().ravel())

Fitting 5 folds for each of 72 candidates, totalling 360 fits


  return fit_method(estimator, *args, **kwargs)


In [11]:
# Extract results and predictions
# Predictions for the held-out test features from the fitted grid search.
grid_predictions = grid.predict(X_test)

# Cross-validation results recorded by the grid search for each candidate.
result = grid.cv_results_

In [12]:
# Confusion matrix
from sklearn.metrics import confusion_matrix

# Compare the true test labels against the grid-search predictions.
cm = confusion_matrix(
    y_true=Y_test,
    y_pred=grid_predictions,
)

In [13]:
# Classification report
from sklearn.metrics import classification_report

# Text summary of the test-set predictions; kept in a variable so it can
# be printed by a later cell.
clf_report = classification_report(
    y_true=Y_test,
    y_pred=grid_predictions,
)

In [14]:
# F1 Score (weighted)
from sklearn.metrics import f1_score

# average='weighted' is the same averaging as the 'f1_weighted' scorer
# passed to the grid search, so the value is named and reported as
# "weighted". It is not a macro average.
f1_weighted = f1_score(Y_test, grid_predictions, average='weighted')

# Backward-compatible alias for the old (misleading) variable name, in case
# any other cell still refers to it.
f1_macro = f1_weighted

print("The weighted F1 value for best parameter {}:".format(grid.best_params_), f1_weighted)

The f1_macro value for best parameter {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}: 0.9924946382275899


In [17]:
# Print the confusion matrix computed earlier from Y_test and grid_predictions.
print("Confusion Matrix:\n",cm)

Confusion Matrix:
 [[51  0]
 [ 1 81]]


In [18]:
# Print the classification report text built earlier for the test-set predictions.
print("The classification report:\n",clf_report)

The classification report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        51
           1       1.00      0.99      0.99        82

    accuracy                           0.99       133
   macro avg       0.99      0.99      0.99       133
weighted avg       0.99      0.99      0.99       133



In [15]:
# ROC AUC Score
from sklearn.metrics import roc_auc_score

# ROC AUC needs a continuous score rather than hard labels, so take the
# second predict_proba column (index 1) as the score.
positive_class_proba = grid.predict_proba(X_test)[:, 1]
roc_score = roc_auc_score(Y_test, positive_class_proba)
print("ROC AUC Score:", roc_score)

ROC AUC Score: 1.0


In [16]:
# Convert results to DataFrame
# `result` is the cv_results_ mapping captured after the grid search; each
# key becomes a column and each row is one parameter combination.
table = pd.DataFrame(data=result)

# Bare expression: show the table as the cell output.
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.196189,0.010839,0.022249,0.001395,True,,1,2,50,"{'bootstrap': True, 'max_depth': None, 'min_sa...",0.981569,0.961755,0.944023,0.981031,1.000000,0.973676,0.019135,18
1,0.353075,0.009937,0.024293,0.002512,True,,1,2,100,"{'bootstrap': True, 'max_depth': None, 'min_sa...",0.981569,0.942166,0.944023,1.000000,1.000000,0.973552,0.025769,21
2,0.478721,0.016875,0.023633,0.002880,True,,1,2,150,"{'bootstrap': True, 'max_depth': None, 'min_sa...",0.981569,0.961755,0.962573,0.981031,1.000000,0.977386,0.014184,7
3,0.155588,0.003220,0.017846,0.000595,True,,1,5,50,"{'bootstrap': True, 'max_depth': None, 'min_sa...",0.981569,0.942166,0.962573,0.981031,0.942332,0.961934,0.017468,52
4,0.296704,0.004997,0.019673,0.001343,True,,1,5,100,"{'bootstrap': True, 'max_depth': None, 'min_sa...",0.981569,0.942166,0.962573,0.981031,0.981031,0.969674,0.015534,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0.221983,0.004673,0.019272,0.000719,False,20,2,2,100,"{'bootstrap': False, 'max_depth': 20, 'min_sam...",0.981569,0.942166,0.962573,0.981031,0.981031,0.969674,0.015534,34
68,0.343826,0.014334,0.024371,0.001062,False,20,2,2,150,"{'bootstrap': False, 'max_depth': 20, 'min_sam...",0.981569,0.942166,0.962573,0.981031,0.981031,0.969674,0.015534,34
69,0.121230,0.004786,0.016190,0.001024,False,20,2,5,50,"{'bootstrap': False, 'max_depth': 20, 'min_sam...",0.981569,0.942166,0.962573,0.981031,0.961826,0.965833,0.014597,44
70,0.220903,0.009637,0.021040,0.002969,False,20,2,5,100,"{'bootstrap': False, 'max_depth': 20, 'min_sam...",1.000000,0.942166,0.962573,0.981031,0.981031,0.973360,0.019580,28
