# Feature Importance

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [20]:
from sklearn.ensemble import RandomForestClassifier
import xgboost as xg 
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.tree import DecisionTreeClassifier # random_state=0

In [2]:
dataSet = pd.read_csv('CKD.csv')

In [3]:
dataSet.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,137.528754,4.627244,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,9.0,55.0,1.02,4.0,0.0,148.036517,18.0,0.8,137.528754,4.627244,...,38,6000,5.2,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,1.125356,223.5,53.0,1.8,137.528754,4.627244,...,31,7500,5.2,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.0075,4.0,0.0,117.0,56.0,3.8,126.0,2.8,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,137.528754,4.627244,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [6]:
dataSet['classification'].value_counts()

ckd       248
notckd    150
ckd\t       2
Name: classification, dtype: int64

In [7]:
dataSet.loc[dataSet['classification'] == 'ckd', 'classification'] = 'yes'
dataSet.loc[dataSet['classification'] == 'notckd', 'classification'] = 'no'
dataSet.loc[dataSet['classification'] == 'ckd\t', 'classification'] = 'yes'

In [9]:
dataSet['classification'].value_counts()

yes    250
no     150
Name: classification, dtype: int64

In [10]:
dataSetOne = dataSet

In [11]:
dataSetOne = pd.get_dummies(dataSetOne, drop_first = True)

In [12]:
dataSetOne.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,dm_\tyes,dm_ yes,dm_no,dm_yes,cad_no,cad_yes,appet_poor,pe_yes,ane_yes,classification_yes
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,137.528754,4.627244,...,0,0,0,1,1,0,0,0,0,1
1,9.0,55.0,1.02,4.0,0.0,148.036517,18.0,0.8,137.528754,4.627244,...,0,0,1,0,1,0,0,0,0,1
2,62.0,80.0,1.01,2.0,1.125356,223.5,53.0,1.8,137.528754,4.627244,...,0,0,0,1,1,0,1,0,1,1
3,48.0,70.0,1.0075,4.0,0.0,117.0,56.0,3.8,126.0,2.8,...,0,0,1,0,1,0,1,1,1,1
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,137.528754,4.627244,...,0,0,1,0,1,0,0,0,0,1


In [13]:
independent = dataSetOne.drop('classification_yes', 1) 
dependent = dataSetOne['classification_yes']

In [14]:
independent.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,htn_yes,dm_\tyes,dm_ yes,dm_no,dm_yes,cad_no,cad_yes,appet_poor,pe_yes,ane_yes
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,137.528754,4.627244,...,1,0,0,0,1,1,0,0,0,0
1,9.0,55.0,1.02,4.0,0.0,148.036517,18.0,0.8,137.528754,4.627244,...,0,0,0,1,0,1,0,0,0,0
2,62.0,80.0,1.01,2.0,1.125356,223.5,53.0,1.8,137.528754,4.627244,...,0,0,0,0,1,1,0,1,0,1
3,48.0,70.0,1.0075,4.0,0.0,117.0,56.0,3.8,126.0,2.8,...,1,0,0,1,0,1,0,1,1,1
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,137.528754,4.627244,...,0,0,0,1,0,1,0,0,0,0


In [15]:
dependent.head()

0    1
1    1
2    1
3    1
4    1
Name: classification_yes, dtype: uint8

In [109]:
algorithms = [
    (xg.XGBClassifier(), 'XG Boost'),
    (GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0), 'Gradient Boosting'),
    (DecisionTreeClassifier(random_state=0), 'DecisionTree'),
    (RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0), 'Random Forest')
]

# Split the data train and test

In [110]:
def SplitTrainTest(X, Y):
    xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size = 0.25, random_state = 0)
    return xTrain, xTest, yTrain, yTest

In [123]:
xTrain, xTest, yTrain, yTest = SplitTrainTest(independent, dependent)

# Feature importance score

In [126]:
def FeatureImportance(model, xTrain, yTrain):
    for classifier, modelName in model:
        classifier.fit(xTrain, yTrain)
        print(modelName)
        importantDf = pd.DataFrame({"Feature Importance score": classifier.feature_importances_ },
                  index= xTrain.columns)
        importantDf = importantDf[importantDf['Feature Importance score'] != 0.0]
        importantDf = importantDf.sort_values(by = "Feature Importance score", ascending = False)
        print(importantDf.head(10))
        print('*' * 50)

In [127]:
FeatureImportance(algorithms, xTrain, yTrain)

XG Boost
         Feature Importance score
hemo                     0.395806
sg                       0.252220
htn_yes                  0.097161
sc                       0.089254
al                       0.078873
sod                      0.037808
dm_no                    0.021208
bgr                      0.009079
pcv_41                   0.007285
rc_5.2                   0.004314
**************************************************
Gradient Boosting
         Feature Importance score
hemo                     0.816199
sg                       0.116852
bgr                      0.026530
al                       0.012973
rc_6.5                   0.006758
sc                       0.005320
dm_yes                   0.003345
wc_6800                  0.002343
rc_5.2                   0.001895
wc_2200                  0.001543
**************************************************
DecisionTree
         Feature Importance score
hemo                     0.708677
sg                       0.167168
htn_yes 