In [75]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder , StandardScaler
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV



In [76]:
df=pd.read_csv('kidney_disease.csv')
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [77]:

df.isnull().sum().sort_values(ascending=True)

id                  0
classification      0
pe                  1
appet               1
ane                 1
cad                 2
dm                  2
htn                 2
pcc                 4
ba                  4
age                 9
bp                 12
sc                 17
bu                 19
bgr                44
al                 46
sg                 47
su                 49
hemo               52
pc                 65
pcv                70
sod                87
pot                88
wc                105
rc                130
rbc               152
dtype: int64

In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [79]:
df.describe()


Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
count,400.0,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,199.5,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437
std,115.614301,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587
min,0.0,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,99.75,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,199.5,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,299.25,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,399.0,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


In [87]:
col={'age': 'age',
     'bp': 'blood_pressure',
     'sg': 'specific_gravity',
     'al': 'albumin',
     'su': 'sugar',
     'rbc': 'red_blood_cells',
     'pc': 'pus_cell',
     'pcc': 'pus_cell_clumps',
     'ba': 'bacteria',
     'bgr': 'blood_glucose_random',
     'bu': 'blood_urea',
     'sc': 'serum_creatinine',
     'sod': 'sodium',
     'pot': 'potassium',
     'hemo': 'hemoglobin',
     'pcv': 'packed_cell_volume',
     'wc': 'white_blood_cell_count',
     'rc': 'red_blood_cell_count',
     'htn': 'hypertension',
     'dm': 'diabetes_mellitus',
     'cad': 'coronary_artery_disease',
     'appet': 'appetite',
     'pe': 'pedal_edema',
     'ane': 'anemia',
     'classification': 'class'}
df.rename(columns=col, inplace=True)
df.head()

Unnamed: 0,id,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,pedal_edema,anemia,class
3,3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,...,11,42,14,1,0,0,1,1,1,0
9,9,53.0,90.0,1.02,2.0,0.0,0,0,1,0,...,8,11,12,1,1,0,1,0,1,0
11,11,63.0,70.0,1.01,3.0,0.0,0,0,1,0,...,11,25,13,1,1,0,1,1,0,0
14,14,68.0,80.0,1.01,3.0,2.0,1,0,1,1,...,0,8,2,1,1,1,1,1,0,0
20,20,61.0,80.0,1.015,2.0,0.0,0,0,0,0,...,4,63,7,1,1,1,1,1,1,0


In [81]:
df=df.replace('?',np.nan)
df=df.dropna()

In [82]:
le=LabelEncoder()
for column in df.columns :
    if df[column].dtype =='object':
        df[column]=le.fit_transform(df[column])
X = df.drop('class',axis=1)
y=df['class']
scaler =StandardScaler()
X_scaled=scaler.fit_transform(X)

In [83]:
smote = SMOTE()
X_smote, y_smote = smote.fit_resample(X_scaled, y)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

In [84]:
cv_score_xgb=cross_val_score(xgb_model,X_scaled,y,cv=5)
print(f"XGBoost CVA acurracy:{cv_score_xgb.mean()*100:.2f}%")

XGBoost CVA acurracy:85.62%


In [85]:
xgb_param_grid = {
    'n_estimators':[50, 100, 150],
    'learning_rate':[0.01, 0.1, 0.2],
    'max_depth':[3, 6, 9]
}
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
xgb_grid_search.fit(X_smote, y_smote)
print("Bestpara XGBoost:",xgb_grid_search.best_params_)
print(f"Best CVA XGBoost: {xgb_grid_search.best_score_ * 100:.2f}%")


Bestpara XGBoost: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
Best CVA XGBoost: 90.00%


In [86]:
import joblib
joblib.dump(xgb_model,'Xgb_ckd_model.pkl')
joblib.dump(scaler, 'scaler.pkl')




['scaler.pkl']