# kidney prediction - RandomForestClassfier

In [28]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#import seaborn as sns
#import warnings
#warnings.filterwarnings("ignore", category=DeprecationWarning) 
from sklearn.preprocessing import StandardScaler
import warnings
import random
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
#from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pickle

df=pd.read_csv("kidney_disease.csv")

In [29]:
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [30]:
df.shape

(400, 26)

In [31]:
df['classification'].unique()

array(['ckd', 'ckd\t', 'notckd'], dtype=object)

In [32]:
df[['htn','dm','cad','pe','ane']] = df[['htn','dm','cad','pe','ane']].replace(to_replace={'yes':1,'no':0})


In [33]:
df[['rbc','pc']] = df[['rbc','pc']].replace(to_replace={'abnormal':1,'normal':0})


In [34]:
df.isnull().sum()

id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [35]:
df[['pcc','ba']] = df[['pcc','ba']].replace(to_replace={'present':1,'notpresent':0})


In [36]:
df['appet'].fillna=0

In [37]:
df['appet'].unique()

array(['good', 'poor', nan], dtype=object)

In [38]:
df[['appet']] = df[['appet']].replace(to_replace={'good':1,'poor':0,'no':np.nan})


In [39]:
df['classification'] = df['classification'].replace(to_replace={'ckd':1.0,'ckd\t':1.0,'notckd':0.0,'no':0.0})


In [40]:
df.rename(columns={'classification':'class'},inplace=True)


In [41]:
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,0,48.0,80.0,1.02,1.0,0.0,,0.0,0.0,0.0,...,44,7800,5.2,1.0,1,0,1.0,0.0,0.0,1.0
1,1,7.0,50.0,1.02,4.0,0.0,,0.0,0.0,0.0,...,38,6000,,0.0,0,0,1.0,0.0,0.0,1.0
2,2,62.0,80.0,1.01,2.0,3.0,0.0,0.0,0.0,0.0,...,31,7500,,0.0,1,0,0.0,0.0,1.0,1.0
3,3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,...,32,6700,3.9,1.0,0,0,0.0,1.0,1.0,1.0
4,4,51.0,80.0,1.01,2.0,0.0,0.0,0.0,0.0,0.0,...,35,7300,4.6,0.0,0,0,1.0,0.0,0.0,1.0


In [42]:
df['dm'].unique()

array([1, 0, ' yes', '\tno', '\tyes', nan], dtype=object)

In [43]:
df['pe'] = df['pe'].replace(to_replace='good',value=0) # Not having pedal edema is good
df['appet'] = df['appet'].replace(to_replace='no',value=0)
df['cad'] = df['cad'].replace(to_replace='\tno',value=0)
df['dm'] = df['dm'].replace(to_replace={'\tno':0,'\tyes':1,' yes':1, '':np.nan})
df.drop('id',axis=1,inplace=True)

In [44]:
df=df.drop(["su","rbc","rc","wc","pot","sod"],axis=1)
df["pcv"]=df["pcv"].fillna(method="ffill")
df.drop(["pc"],axis=1,inplace=True)
df["hemo"]=df["hemo"].fillna(method="ffill")
df.drop(["sg"],axis=1,inplace=True)
df=df.fillna(method="ffill")
#df.drop(["pcc"],axis=1,inplace=True)
df.drop(["ba"],axis=1,inplace=True)
df.drop(["pe"],axis=1,inplace=True)
df.drop(["cad"],axis=1,inplace=True)
df.drop(["ane"],axis=1,inplace=True)

In [45]:
df=df.replace("\t?",31)
print(df.columns)
print(df.shape[1])

Index(['age', 'bp', 'al', 'pcc', 'bgr', 'bu', 'sc', 'hemo', 'pcv', 'htn', 'dm',
       'appet', 'class'],
      dtype='object')
13


In [46]:
target=df["class"]
source=df.drop(["class"],axis=1)
X_train,X_test,y_train,y_test=train_test_split(source,target,test_size=0.05)


In [47]:
from sklearn.ensemble import RandomForestClassifier

In [48]:
rf = RandomForestClassifier() 

In [49]:
from sklearn.model_selection import RandomizedSearchCV

In [50]:
parameters= { "n_estimators": [100,200,300,400,500,600,700,800,900,1000],
               "max_features" : ['auto','sqrt','log2'],
               "min_samples_split" :[2,4,6,8,10],
                'min_samples_leaf' : [1,2,3,4,5]}


In [51]:
df.head()

Unnamed: 0,age,bp,al,pcc,bgr,bu,sc,hemo,pcv,htn,dm,appet,class
0,48.0,80.0,1.0,0.0,121.0,36.0,1.2,15.4,44,1.0,1.0,1.0,1.0
1,7.0,50.0,4.0,0.0,121.0,18.0,0.8,11.3,38,0.0,0.0,1.0,1.0
2,62.0,80.0,2.0,0.0,423.0,53.0,1.8,9.6,31,0.0,1.0,0.0,1.0
3,48.0,70.0,4.0,1.0,117.0,56.0,3.8,11.2,32,1.0,0.0,0.0,1.0
4,51.0,80.0,2.0,0.0,106.0,26.0,1.4,11.6,35,0.0,0.0,1.0,1.0


In [52]:
df['htn'].unique()

array([1., 0.])

In [53]:
rs = RandomizedSearchCV(estimator = rf , param_distributions = parameters,n_iter =10 ,cv=5,verbose=2,n_jobs=1)

In [54]:
rs.fit(source , target)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=900, min_samples_split=8, min_samples_leaf=1, max_features=auto 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=900, min_samples_split=8, min_samples_leaf=1, max_features=auto, total=   2.0s
[CV] n_estimators=900, min_samples_split=8, min_samples_leaf=1, max_features=auto 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s


[CV]  n_estimators=900, min_samples_split=8, min_samples_leaf=1, max_features=auto, total=   1.9s
[CV] n_estimators=900, min_samples_split=8, min_samples_leaf=1, max_features=auto 
[CV]  n_estimators=900, min_samples_split=8, min_samples_leaf=1, max_features=auto, total=   2.0s
[CV] n_estimators=900, min_samples_split=8, min_samples_leaf=1, max_features=auto 
[CV]  n_estimators=900, min_samples_split=8, min_samples_leaf=1, max_features=auto, total=   1.9s
[CV] n_estimators=900, min_samples_split=8, min_samples_leaf=1, max_features=auto 
[CV]  n_estimators=900, min_samples_split=8, min_samples_leaf=1, max_features=auto, total=   2.1s
[CV] n_estimators=900, min_samples_split=10, min_samples_leaf=1, max_features=log2 
[CV]  n_estimators=900, min_samples_split=10, min_samples_leaf=1, max_features=log2, total=   1.9s
[CV] n_estimators=900, min_samples_split=10, min_samples_leaf=1, max_features=log2 
[CV]  n_estimators=900, min_samples_split=10, min_samples_leaf=1, max_features=log2, total= 

[CV]  n_estimators=300, min_samples_split=10, min_samples_leaf=5, max_features=log2, total=   0.6s
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=5, max_features=log2 
[CV]  n_estimators=300, min_samples_split=10, min_samples_leaf=5, max_features=log2, total=   0.7s
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=5, max_features=log2 
[CV]  n_estimators=300, min_samples_split=10, min_samples_leaf=5, max_features=log2, total=   0.6s


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=1,
                   param_distributions={'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'min_samples_split': [2, 4, 6, 8, 10],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000]},
                   verbose=2)

In [39]:
RandomForestClassifier()

AttributeError: 'function' object has no attribute 'keys'

In [44]:
target

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
395    0.0
396    0.0
397    0.0
398    0.0
399    0.0
Name: class, Length: 400, dtype: float64

In [55]:



rs.best_params_

{'n_estimators': 900,
 'min_samples_split': 8,
 'min_samples_leaf': 1,
 'max_features': 'auto'}

In [51]:
from sklearn.metrics import r2_score

In [56]:
predict=rs.predict(X_test)

In [58]:
r2_score(predict,y_test)

1.0

In [56]:
file =open('kidney.pkl','wb')
pickle.dump(rs,file)