# permutation Importance

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [48]:
from sklearn.ensemble import RandomForestClassifier
import xgboost as xg 
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
dataSet = pd.read_csv('CKD.csv')

In [4]:
dataSet.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,137.528754,4.627244,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,9.0,55.0,1.02,4.0,0.0,148.036517,18.0,0.8,137.528754,4.627244,...,38,6000,5.2,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,1.125356,223.5,53.0,1.8,137.528754,4.627244,...,31,7500,5.2,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.0075,4.0,0.0,117.0,56.0,3.8,126.0,2.8,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,137.528754,4.627244,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [5]:
dataSet['classification'].value_counts()

ckd       248
notckd    150
ckd\t       2
Name: classification, dtype: int64

In [6]:
dataSet.loc[dataSet['classification'] == 'ckd', 'classification'] = 'yes'
dataSet.loc[dataSet['classification'] == 'notckd', 'classification'] = 'no'
dataSet.loc[dataSet['classification'] == 'ckd\t', 'classification'] = 'yes'

In [7]:
dataSet['classification'].value_counts()

yes    250
no     150
Name: classification, dtype: int64

In [8]:
dataSetOne = dataSet

In [9]:
dataSetOne = pd.get_dummies(dataSetOne, drop_first = True)

In [10]:
dataSetOne.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,dm_\tyes,dm_ yes,dm_no,dm_yes,cad_no,cad_yes,appet_poor,pe_yes,ane_yes,classification_yes
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,137.528754,4.627244,...,0,0,0,1,1,0,0,0,0,1
1,9.0,55.0,1.02,4.0,0.0,148.036517,18.0,0.8,137.528754,4.627244,...,0,0,1,0,1,0,0,0,0,1
2,62.0,80.0,1.01,2.0,1.125356,223.5,53.0,1.8,137.528754,4.627244,...,0,0,0,1,1,0,1,0,1,1
3,48.0,70.0,1.0075,4.0,0.0,117.0,56.0,3.8,126.0,2.8,...,0,0,1,0,1,0,1,1,1,1
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,137.528754,4.627244,...,0,0,1,0,1,0,0,0,0,1


In [11]:
independent = dataSetOne.drop('classification_yes', 1) 
dependent = dataSetOne['classification_yes']

In [12]:
independent.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,htn_yes,dm_\tyes,dm_ yes,dm_no,dm_yes,cad_no,cad_yes,appet_poor,pe_yes,ane_yes
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,137.528754,4.627244,...,1,0,0,0,1,1,0,0,0,0
1,9.0,55.0,1.02,4.0,0.0,148.036517,18.0,0.8,137.528754,4.627244,...,0,0,0,1,0,1,0,0,0,0
2,62.0,80.0,1.01,2.0,1.125356,223.5,53.0,1.8,137.528754,4.627244,...,0,0,0,0,1,1,0,1,0,1
3,48.0,70.0,1.0075,4.0,0.0,117.0,56.0,3.8,126.0,2.8,...,1,0,0,1,0,1,0,1,1,1
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,137.528754,4.627244,...,0,0,0,1,0,1,0,0,0,0


In [13]:
dependent.head()

0    1
1    1
2    1
3    1
4    1
Name: classification_yes, dtype: uint8

In [51]:
algorithm = [
    (xg.XGBClassifier(), 'XG Boost'),
    (GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0), 'Gradient Boosting'),
    (DecisionTreeClassifier(random_state=0), 'DecisionTree'),
    (KNeighborsClassifier(), 'K Neighbors Classifier'),
    (RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0), 'Random Forest'),
    (LogisticRegression(), 'Logistic Regression'),
    (SVC(kernel = 'linear', random_state = 0), 'support vector linear'),
    (SVC(kernel = 'rbf', random_state = 0), 'rbf')
]

# Split the data train and test

In [14]:
def SplitTrainTest(X, Y):
    xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size = 0.25, random_state = 0)
    return xTrain, xTest, yTrain, yTest

In [15]:
xTrain, xTest, yTrain, yTest = SplitTrainTest(independent, dependent)

# permuntation impotance

In [16]:
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)

In [17]:
classifier.fit(xTrain, yTrain)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [19]:
xTrain.shape

(300, 207)

In [55]:
for classifier, modelName in algorithm:
    print(modelName)
    classifier.fit(xTrain, yTrain)
    results = permutation_importance(classifier, xTrain, yTrain, scoring='accuracy')
    importantDf = pd.DataFrame({"Permuntaion Importance score":results.importances_mean},
                  index= xTrain.columns)
    importantDf = importantDf[importantDf['Permuntaion Importance score'] != 0.0]
    importantDf = importantDf.sort_values(by = "Permuntaion Importance score", ascending = False)
    print(importantDf.head())
    print('*' * 50)

XG Boost
      Permuntaion Importance score
hemo                      0.135333
sg                        0.134667
al                        0.016667
sc                        0.014000
bgr                       0.003333
**************************************************
Gradient Boosting
        Permuntaion Importance score
hemo                        0.046667
sg                          0.032000
pcv_35                      0.006000
**************************************************
DecisionTree
         Permuntaion Importance score
sg                           0.154667
htn_yes                      0.148667
hemo                         0.140000
sc                           0.120667
sod                          0.058667
**************************************************
K Neighbors Classifier
     Permuntaion Importance score
bgr                      0.193333
bu                       0.140667
age                      0.066667
bp                       0.034667
sod                      0.0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


      Permuntaion Importance score
hemo                      0.168667
bgr                       0.054667
sc                        0.053333
al                        0.046667
bp                        0.018667
**************************************************
support vector linear
      Permuntaion Importance score
hemo                      0.134667
sc                        0.047333
al                        0.030000
bgr                       0.015333
bu                        0.014667
**************************************************
rbf
      Permuntaion Importance score
bgr                       0.174000
bu                        0.111333
sod                       0.013333
bp                        0.011333
hemo                      0.007333
**************************************************
