In [38]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import RandomizedSearchCV

In [2]:

df = pd.read_csv("kidney disease dataset.csv",
                 index_col=False,
                 na_values='?',
                 names=['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane','class'])


In [3]:
df.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [4]:
df.isnull().sum()

age        9
bp        12
sg        47
al        46
su        49
bgr      152
bu        65
sc         4
sod        4
pot       44
hemo      19
pcv       17
wc        87
rc        88
rbc       52
pc        71
pcc      106
ba       131
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64

In [5]:
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer
from sklearn.impute import SimpleImputer
from sklearn_pandas import CategoricalImputer
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [6]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [7]:
## number of missing data in each feature column
X.isnull().sum()

age        9
bp        12
sg        47
al        46
su        49
bgr      152
bu        65
sc         4
sod        4
pot       44
hemo      19
pcv       17
wc        87
rc        88
rbc       52
pc        71
pcc      106
ba       131
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
dtype: int64

In [8]:
X.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo',
       'pcv', 'wc', 'rc', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane'],
      dtype='object')

In [9]:
## create a series of booleans
categorical_feature_mask = X.dtypes =='object'

In [12]:
non_categorical_feature_mask = ~categorical_feature_mask

In [13]:
## getting a list of categorical columns and numerical columns
cat_col_list = X.loc[:,categorical_feature_mask].columns.to_list()
num_col_list = X.loc[:,non_categorical_feature_mask].columns.to_list()

In [14]:
## Check the number of categories in each category
for i in cat_col_list:
    print(i,X[i].nunique())

bgr 2
bu 2
sc 2
sod 2
htn 2
dm 3
cad 2
appet 2
pe 2
ane 2


Note that dm contains 3 categories, lets check it out

In [15]:
X['dm'].value_counts()

no      261
yes     136
 yes      1
Name: dm, dtype: int64

There is one sample where the respond "yes" contains a spacebar in front of it. So we have to remove that spacebar

In [16]:
## replace the dm column with a new column that stripped all the whitespace
X['dm']=X['dm'].str.strip()

In [17]:
X['dm'].value_counts()

no     261
yes    137
Name: dm, dtype: int64

DataFrameMapper enables all the steps of data preprocessing to be grouped together and stored in a single object, and applied to any dataset with a single operation.

DataFrameMapper maps preprocessing tasks to each column of a given dataset via a list of tuples. Each tuple in the input list refers to a specific column of the dataframe. The first element in the tuple takes the name of the column, and the second element takes the preprocessing task or tasks that want to be applied to that particular column. If there is more than one task, the second element of the tuple needs to be a list, the order of which needs to match the desired order of operations.

In [18]:
# Apply numeric imputer

numeric_imputation_mapper = DataFrameMapper(
    [([numeric_feature],SimpleImputer(strategy='median')) for numeric_feature in num_col_list], ## must add [] to the name of the col. otherwise error occur
    input_df=True,
    df_out=True
)

In [19]:
## create categorical imputation mapper
categorical_imputation_mapper = DataFrameMapper(
    [(categorical_feature,[CategoricalImputer(),LabelBinarizer()]) for categorical_feature in cat_col_list], 
    input_df=True,
    df_out=True
)

In [20]:
categorical_imputation_mapper2 = DataFrameMapper(
    [('bgr',[CategoricalImputer(),LabelBinarizer()]), 
      ('bu',[CategoricalImputer(),LabelBinarizer()]),
    ('sc',[CategoricalImputer(),LabelBinarizer()]),
('sod',[CategoricalImputer(),LabelBinarizer()]),
('htn',[CategoricalImputer(),LabelBinarizer()]),
    # ('dm',[CategoricalImputer(),LabelBinarizer()]),
('cad',[CategoricalImputer(),LabelBinarizer()]),
('appet',[CategoricalImputer(),LabelBinarizer()]),
('pe',[CategoricalImputer(),LabelBinarizer()]),
('ane',[CategoricalImputer(),LabelBinarizer()])
    ],
    input_df=True,
    df_out=True
)

In [None]:
# col_transformer = ColumnTransformer(
#     [("ohe",OneHotEncoder(),categorical_feature_mask)],remainder='passthrough'
# )

: 

: 

In [21]:
X[num_col_list]

Unnamed: 0,age,bp,sg,al,su,pot,hemo,pcv,wc,rc,rbc,pc,pcc,ba
0,48.0,80.0,1.020,1.0,0.0,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2
1,7.0,50.0,1.020,4.0,0.0,,18.0,0.8,,,11.3,38.0,6000.0,
2,62.0,80.0,1.010,2.0,3.0,423.0,53.0,1.8,,,9.6,31.0,7500.0,
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9
4,51.0,80.0,1.010,2.0,0.0,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,140.0,49.0,0.5,150.0,4.9,15.7,47.0,6700.0,4.9
396,42.0,70.0,1.025,0.0,0.0,75.0,31.0,1.2,141.0,3.5,16.5,54.0,7800.0,6.2
397,12.0,80.0,1.020,0.0,0.0,100.0,26.0,0.6,137.0,4.4,15.8,49.0,6600.0,5.4
398,17.0,60.0,1.025,0.0,0.0,114.0,50.0,1.0,135.0,4.9,14.2,51.0,7200.0,5.9


In [22]:
numeric_imputation_mapper

DataFrameMapper(df_out=True,
                features=[(['age'], SimpleImputer(strategy='median')),
                          (['bp'], SimpleImputer(strategy='median')),
                          (['sg'], SimpleImputer(strategy='median')),
                          (['al'], SimpleImputer(strategy='median')),
                          (['su'], SimpleImputer(strategy='median')),
                          (['pot'], SimpleImputer(strategy='median')),
                          (['hemo'], SimpleImputer(strategy='median')),
                          (['pcv'], SimpleImputer(strategy='median')),
                          (['wc'], SimpleImputer(strategy='median')),
                          (['rc'], SimpleImputer(strategy='median')),
                          (['rbc'], SimpleImputer(strategy='median')),
                          (['pc'], SimpleImputer(strategy='median')),
                          (['pcc'], SimpleImputer(strategy='median')),
                          (['ba'], SimpleImputer(strat

In [23]:
categorical_imputation_mapper

DataFrameMapper(df_out=True,
                features=[('bgr', [CategoricalImputer(), LabelBinarizer()]),
                          ('bu', [CategoricalImputer(), LabelBinarizer()]),
                          ('sc', [CategoricalImputer(), LabelBinarizer()]),
                          ('sod', [CategoricalImputer(), LabelBinarizer()]),
                          ('htn', [CategoricalImputer(), LabelBinarizer()]),
                          ('dm', [CategoricalImputer(), LabelBinarizer()]),
                          ('cad', [CategoricalImputer(), LabelBinarizer()]),
                          ('appet', [CategoricalImputer(), LabelBinarizer()]),
                          ('pe', [CategoricalImputer(), LabelBinarizer()]),
                          ('ane', [CategoricalImputer(), LabelBinarizer()])],
                input_df=True)

In [24]:
categorical_imputation_mapper2

DataFrameMapper(df_out=True,
                features=[('bgr', [CategoricalImputer(), LabelBinarizer()]),
                          ('bu', [CategoricalImputer(), LabelBinarizer()]),
                          ('sc', [CategoricalImputer(), LabelBinarizer()]),
                          ('sod', [CategoricalImputer(), LabelBinarizer()]),
                          ('htn', [CategoricalImputer(), LabelBinarizer()]),
                          ('cad', [CategoricalImputer(), LabelBinarizer()]),
                          ('appet', [CategoricalImputer(), LabelBinarizer()]),
                          ('pe', [CategoricalImputer(), LabelBinarizer()]),
                          ('ane', [CategoricalImputer(), LabelBinarizer()])],
                input_df=True)

In [26]:
categorical_imputation_mapper.fit_transform(X)

Unnamed: 0,bgr,bu,sc,sod,htn,dm,cad,appet,pe,ane
0,1,1,0,0,1,1,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0
2,1,1,0,0,0,1,0,1,0,1
3,1,0,1,0,1,0,0,1,1,1
4,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
395,1,1,0,0,0,0,0,0,0,0
396,1,1,0,0,0,0,0,0,0,0
397,1,1,0,0,0,0,0,0,0,0
398,1,1,0,0,0,0,0,0,0,0


In [27]:
numeric_imputation_mapper.fit_transform(X).isnull().sum()

age     0
bp      0
sg      0
al      0
su      0
pot     0
hemo    0
pcv     0
wc      0
rc      0
rbc     0
pc      0
pcc     0
ba      0
dtype: int64

In [28]:
## FeatureUnion combines the feature spaces
numeric_categorical_union = FeatureUnion(
    [("num_mapper",numeric_imputation_mapper),
     ("cat_mapper",categorical_imputation_mapper)
        
    ]
)

In [29]:
XX = numeric_categorical_union.fit_transform(X) ##<-result in an array

In [30]:
XX.shape


(400, 24)

In [31]:
pipeline = Pipeline(
    [
        ("featureunion",numeric_categorical_union),
        ("clf",xgb.XGBClassifier(max_depth=5))
        
    ]
)

In [32]:
cross_val_scores = cross_val_score(pipeline,
                                    X,y,
                                    scoring="roc_auc",
                                    cv=3)









In [33]:
print("3-fold AUC: ", np.mean(cross_val_scores))

3-fold AUC:  0.9980761139797284


Tuning the hyperparameter

In [46]:
param_grid = {'clf__learning_rate':np.arange(0.05,1,0.05),
              'clf__n_estimators':range(50,200,50),
              'clf__max_depth':range(3,10,1)
    
}

randomized_roc_auc = RandomizedSearchCV(pipeline,
                                      param_distributions=param_grid,
                                      n_iter=3,
                                      scoring='roc_auc',
                                      verbose=1
                                      )

randomized_roc_auc.fit(X,y)

print(randomized_roc_auc.best_estimator_)
print(randomized_roc_auc.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits




















































Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('num_mapper',
                                                 DataFrameMapper(df_out=True,
                                                                 features=[(['age'],
                                                                            SimpleImputer(strategy='median')),
                                                                           (['bp'],
                                                                            SimpleImputer(strategy='median')),
                                                                           (['sg'],
                                                                            SimpleImputer(strategy='median')),
                                                                           (['al'],
                                                                            SimpleImputer(strategy='median')),
                                             