# Kidney Disease Prediction

In [41]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import warnings
import random
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

In [42]:
df=pd.read_csv("kidney_disease.csv")


In [43]:
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [44]:
df.isnull().sum()

id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [45]:
df.describe()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
count,400.0,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,199.5,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437
std,115.614301,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587
min,0.0,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,99.75,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,199.5,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,299.25,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,399.0,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


In [46]:
df[['htn','dm','cad','pe','ane']] 

Unnamed: 0,htn,dm,cad,pe,ane
0,yes,yes,no,no,no
1,no,no,no,no,no
2,no,yes,no,no,yes
3,yes,no,no,yes,yes
4,no,no,no,no,no
...,...,...,...,...,...
395,no,no,no,no,no
396,no,no,no,no,no
397,no,no,no,no,no
398,no,no,no,no,no


In [47]:
df[['htn','dm','cad','pe','ane']]  = df[['htn','dm','cad','pe','ane']].replace(to_replace={'yes':1,'no':0})

In [48]:
df[['htn','dm','cad','pe','ane']] 

Unnamed: 0,htn,dm,cad,pe,ane
0,1.0,1,0,0.0,0.0
1,0.0,0,0,0.0,0.0
2,0.0,1,0,0.0,1.0
3,1.0,0,0,1.0,1.0
4,0.0,0,0,0.0,0.0
...,...,...,...,...,...
395,0.0,0,0,0.0,0.0
396,0.0,0,0,0.0,0.0
397,0.0,0,0,0.0,0.0
398,0.0,0,0,0.0,0.0


In [49]:
df[['rbc','pc']] = df[['rbc','pc']].replace(to_replace={'abnormal':1,'normal':0})
df[['pcc','ba']] = df[['pcc','ba']].replace(to_replace={'present':1,'notpresent':0})
df[['appet']] = df[['appet']].replace(to_replace={'good':1,'poor':0,'no':np.nan})

In [50]:
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,0.0,0.0,0.0,...,44,7800,5.2,1.0,1,0,1.0,0.0,0.0,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,0.0,0.0,0.0,...,38,6000,,0.0,0,0,1.0,0.0,0.0,ckd
2,2,62.0,80.0,1.01,2.0,3.0,0.0,0.0,0.0,0.0,...,31,7500,,0.0,1,0,0.0,0.0,1.0,ckd
3,3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,...,32,6700,3.9,1.0,0,0,0.0,1.0,1.0,ckd
4,4,51.0,80.0,1.01,2.0,0.0,0.0,0.0,0.0,0.0,...,35,7300,4.6,0.0,0,0,1.0,0.0,0.0,ckd


In [51]:
df['classification']

0         ckd
1         ckd
2         ckd
3         ckd
4         ckd
        ...  
395    notckd
396    notckd
397    notckd
398    notckd
399    notckd
Name: classification, Length: 400, dtype: object

In [52]:
df['classification'] = df['classification'].replace(to_replace={'ckd':1.0,'ckd\t':1.0,'notckd':0.0,'no':0.0})

In [53]:
df['classification']

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
395    0.0
396    0.0
397    0.0
398    0.0
399    0.0
Name: classification, Length: 400, dtype: float64

In [54]:
df.rename(columns={'classification':'class'},inplace=True)

In [55]:
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,0,48.0,80.0,1.02,1.0,0.0,,0.0,0.0,0.0,...,44,7800,5.2,1.0,1,0,1.0,0.0,0.0,1.0
1,1,7.0,50.0,1.02,4.0,0.0,,0.0,0.0,0.0,...,38,6000,,0.0,0,0,1.0,0.0,0.0,1.0
2,2,62.0,80.0,1.01,2.0,3.0,0.0,0.0,0.0,0.0,...,31,7500,,0.0,1,0,0.0,0.0,1.0,1.0
3,3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,...,32,6700,3.9,1.0,0,0,0.0,1.0,1.0,1.0
4,4,51.0,80.0,1.01,2.0,0.0,0.0,0.0,0.0,0.0,...,35,7300,4.6,0.0,0,0,1.0,0.0,0.0,1.0


In [56]:
df

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,0,48.0,80.0,1.020,1.0,0.0,,0.0,0.0,0.0,...,44,7800,5.2,1.0,1,0,1.0,0.0,0.0,1.0
1,1,7.0,50.0,1.020,4.0,0.0,,0.0,0.0,0.0,...,38,6000,,0.0,0,0,1.0,0.0,0.0,1.0
2,2,62.0,80.0,1.010,2.0,3.0,0.0,0.0,0.0,0.0,...,31,7500,,0.0,1,0,0.0,0.0,1.0,1.0
3,3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,...,32,6700,3.9,1.0,0,0,0.0,1.0,1.0,1.0
4,4,51.0,80.0,1.010,2.0,0.0,0.0,0.0,0.0,0.0,...,35,7300,4.6,0.0,0,0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,0.0,0.0,0.0,0.0,...,47,6700,4.9,0.0,0,0,1.0,0.0,0.0,0.0
396,396,42.0,70.0,1.025,0.0,0.0,0.0,0.0,0.0,0.0,...,54,7800,6.2,0.0,0,0,1.0,0.0,0.0,0.0
397,397,12.0,80.0,1.020,0.0,0.0,0.0,0.0,0.0,0.0,...,49,6600,5.4,0.0,0,0,1.0,0.0,0.0,0.0
398,398,17.0,60.0,1.025,0.0,0.0,0.0,0.0,0.0,0.0,...,51,7200,5.9,0.0,0,0,1.0,0.0,0.0,0.0


In [57]:
# Further cleaning
df['pe'] = df['pe'].replace(to_replace='good',value=0) # Not having pedal edema is good
df['appet'] = df['appet'].replace(to_replace='no',value=0)
df['cad'] = df['cad'].replace(to_replace='\tno',value=0)
df['dm'] = df['dm'].replace(to_replace={'\tno':0,'\tyes':1,' yes':1, '':np.nan})
df.drop('id',axis=1,inplace=True)

In [58]:
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,0.0,0.0,0.0,121.0,...,44,7800,5.2,1.0,1.0,0.0,1.0,0.0,0.0,1.0
1,7.0,50.0,1.02,4.0,0.0,,0.0,0.0,0.0,,...,38,6000,,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,62.0,80.0,1.01,2.0,3.0,0.0,0.0,0.0,0.0,423.0,...,31,7500,,0.0,1.0,0.0,0.0,0.0,1.0,1.0
3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,117.0,...,32,6700,3.9,1.0,0.0,0.0,0.0,1.0,1.0,1.0
4,51.0,80.0,1.01,2.0,0.0,0.0,0.0,0.0,0.0,106.0,...,35,7300,4.6,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [59]:
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,0.0,0.0,0.0,121.0,...,44,7800,5.2,1.0,1.0,0.0,1.0,0.0,0.0,1.0
1,7.0,50.0,1.02,4.0,0.0,,0.0,0.0,0.0,,...,38,6000,,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,62.0,80.0,1.01,2.0,3.0,0.0,0.0,0.0,0.0,423.0,...,31,7500,,0.0,1.0,0.0,0.0,0.0,1.0,1.0
3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,117.0,...,32,6700,3.9,1.0,0.0,0.0,0.0,1.0,1.0,1.0
4,51.0,80.0,1.01,2.0,0.0,0.0,0.0,0.0,0.0,106.0,...,35,7300,4.6,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [60]:
df2 = df.dropna(axis=0)
df2['class'].value_counts()

0.0    115
1.0     43
Name: class, dtype: int64

In [61]:
df=df.drop(["su","rbc","rc","wc","pot","sod"],axis=1)
df["pcv"]=df["pcv"].fillna(method="ffill")
df.drop(["pc"],axis=1,inplace=True)
df["hemo"]=df["hemo"].fillna(method="ffill")
df.drop(["sg"],axis=1,inplace=True)
df=df.fillna(method="ffill")
df.drop(["ba"],axis=1,inplace=True)
df.drop(["pe"],axis=1,inplace=True)
df.drop(["cad"],axis=1,inplace=True)
df.drop(["ane"],axis=1,inplace=True)

In [62]:
df=df.replace("\t?",31)
print(df.columns)
print(df.shape[1])

Index(['age', 'bp', 'al', 'pcc', 'bgr', 'bu', 'sc', 'hemo', 'pcv', 'htn', 'dm',
       'appet', 'class'],
      dtype='object')
13


In [63]:
X = df.drop(["class"],axis=1)
y = df["class"]

In [64]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1)

In [65]:
X_train.shape

(360, 12)

In [66]:
y_train.shape

(360,)

In [67]:
# sm=SMOTE()
# X_train, y_train =sm.fit_sample(X_train,y_train)

In [68]:
## Put the models in dictonary 

models = {"Logistic Regression": LogisticRegression(),
         "Random Forest": RandomForestClassifier()}

In [69]:
## Create a function to fit and score models 

def fit_and_score(models, X_train, X_test, y_train, y_test):
    
    # Random Seed
    np.random.seed(42)
    # Make a list to keep model scores
    model_scores = {}
    # Loop through Models 
    for name, model in models.items():
        model.fit(X_train, y_train)
        model_scores[name] = model.score(X_test, y_test)
    return model_scores

In [70]:
model_scores = fit_and_score(models = models,
                             X_train = X_train,
                             X_test = X_test,
                             y_train = y_train,
                             y_test = y_test)
model_scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


{'Logistic Regression': 0.925, 'Random Forest': 1.0}

In [71]:
from sklearn.model_selection import GridSearchCV
# Different hyperparameters for our LogisticRegression model
log_reg_grid = {"C": np.logspace(-4, 4, 30),
                "solver": ["liblinear"]}

# Setup grid hyperparameter search for LogisticRegression
gs_log_reg = GridSearchCV(LogisticRegression(),
                          param_grid=log_reg_grid,
                          cv=5,
                          verbose=True)

# Fit grid hyperparameter search model
gs_log_reg.fit(X_train, y_train);

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    0.5s finished


In [72]:
# Check the best hyperparmaters
gs_log_reg.best_params_

{'C': 4.893900918477489, 'solver': 'liblinear'}

In [73]:
# Evaluate the grid search LogisticRegression model
gs_log_reg.score(X_test, y_test)

0.925

In [74]:
clf = LogisticRegression(C=2.592943797404667, solver='liblinear')
clf.fit(X_train, y_train)

LogisticRegression(C=2.592943797404667, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [75]:
# Make predictions with tuned model
y_preds = clf.predict(X_test)

In [76]:
y_preds

array([1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 0., 0., 1., 0., 1.])

In [77]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test,y_preds))

0.925


In [78]:
from sklearn.externals import joblib

joblib.dump(clf,"kidney_model")

['kidney_model']