# [머신러닝의 해석] 3편. Permutation Feature Importance

사용한 데이터: [Adult Census Income](https://www.kaggle.com/uciml/adult-census-income)


데이터 전처리는 [이곳](https://github.com/Soohee410/Interpretable-Machine-Learning/blob/master/%5B%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D%EC%9D%98%20%ED%95%B4%EC%84%9D%5D%20Adult%20Census%20income%20%EB%8D%B0%EC%9D%B4%ED%84%B0%20%EC%A0%84%EC%B2%98%EB%A6%AC.ipynb) 에 있습니다.

In [1]:
import warnings
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display settings: seaborn "whitegrid" theme, 100-dpi figures,
# and every Python warning silenced for the rest of the session.
sns.set_style(style='whitegrid')
matplotlib.rcParams.update({'figure.dpi': 100})
warnings.filterwarnings('ignore')

In [2]:
# Load the preprocessed Adult Census Income table from the working directory
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="adult_new.csv")
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,82,Private,132870,HS-grad,2,Separate,Exec-managerial,Not-in-family,White,Female,0,4356,18,US,<=50K
1,54,Private,140359,HS-under,1,Separate,Machine-op-inspct,Unmarried,White,Female,0,3900,40,US,<=50K
2,41,Private,264663,College,3,Separate,Prof-specialty,Own-child,White,Female,0,3900,40,US,<=50K
3,34,Private,216864,HS-grad,2,Separate,Other-service,Unmarried,White,Female,0,3770,45,US,<=50K
4,38,Private,150601,HS-under,1,Separate,Adm-clerical,Unmarried,White,Male,0,3770,40,US,<=50K


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Target: the income bracket, encoded to integer class labels.
# `le` is kept fitted on the target so its classes_ / inverse_transform
# still describe the income labels after this cell has run.
le = LabelEncoder()
y = le.fit_transform(df.income)

# Features: everything except the target, 'fnlwgt' and 'education'
# (the table also carries 'education.num').
X = df.drop(['income', 'fnlwgt', 'education'], axis=1)

# If the CSV was written together with its index, pandas loads that index as
# an 'Unnamed: 0' column. Remove it when present so that a row number can
# never be used as a predictive feature; this is a no-op otherwise.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

# Integer-encode each categorical column with its own encoder so that every
# column's category-to-code mapping stays recoverable after the loop.
categorical_cols = ['workclass', 'marital.status', 'occupation', 'relationship',
                    'race', 'sex', 'native.country']
feature_encoders = {}
for col in categorical_cols:
    feature_encoders[col] = LabelEncoder()
    X[col] = feature_encoders[col].fit_transform(X[col])

# Hold out 30% of the rows for testing, stratified on the target, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, stratify=y
)

## (1) XGBoost

In [5]:
import xgboost
from sklearn import metrics
from xgboost import XGBClassifier

# XGBoost classifier: 300 estimators, fixed seed, trained on the training split.
xgb = XGBClassifier(n_estimators=300, random_state=0)
xgb.fit(X_train, y_train)

# Print accuracy on the training split, then on the held-out test split.
for caption, split_X, split_y in (
    ('train accuracy: ', X_train, y_train),
    ('test accuracy: ', X_test, y_test),
):
    print(caption, metrics.accuracy_score(split_y, xgb.predict(split_X)))

train accuracy:  0.9193750871809179
test accuracy:  0.860381861575179


In [6]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of the fitted XGBoost model, evaluated on the
# held-out test split with a fixed seed.
perm = PermutationImportance(xgb, random_state=1)
perm.fit(X_test, y_test)

# Render the weight table, labelled with the test frame's column names.
eli5.show_weights(perm, feature_names=list(X_test.columns))

Weight,Feature
0.0522  ± 0.0059,marital.status
0.0504  ± 0.0034,capital.gain
0.0266  ± 0.0041,education.num
0.0197  ± 0.0035,age
0.0171  ± 0.0058,occupation
0.0149  ± 0.0017,capital.loss
0.0115  ± 0.0007,hours.per.week
0.0059  ± 0.0031,relationship
0.0056  ± 0.0025,workclass
0.0013  ± 0.0013,sex


## (2) CatBoost

In [8]:
from catboost import CatBoostClassifier

# CatBoost classifier: 300 estimators, fixed seed, training log suppressed.
cb = CatBoostClassifier(n_estimators=300, random_state=0, silent=True)
cb.fit(X_train, y_train)

# Print accuracy on the training split, then on the held-out test split.
for caption, split_X, split_y in (
    ('train set accuracy:', X_train, y_train),
    ('test set accuracy:', X_test, y_test),
):
    print(caption, metrics.accuracy_score(split_y, cb.predict(split_X)))

train set accuracy: 0.8931975635839494
test set accuracy: 0.8677587329138642


In [9]:
# Permutation importance of the fitted CatBoost model on the test split
# (fixed seed); note this rebinds `perm`.
perm = PermutationImportance(cb, random_state=1)
perm.fit(X_test, y_test)

# Render the weight table, labelled with the test frame's column names.
eli5.show_weights(perm, feature_names=list(X_test.columns))

Weight,Feature
0.0498  ± 0.0049,marital.status
0.0487  ± 0.0040,capital.gain
0.0291  ± 0.0063,education.num
0.0176  ± 0.0015,age
0.0166  ± 0.0034,occupation
0.0136  ± 0.0013,capital.loss
0.0095  ± 0.0024,hours.per.week
0.0046  ± 0.0022,workclass
0.0027  ± 0.0011,relationship
0.0008  ± 0.0009,native.country


## (3) RandomForest

### 과적합시킨 랜덤 포레스트 (max_depth 제한 없음)

In [10]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# Random forest with 300 trees and a fixed seed; no max_depth is passed.
rf = RandomForestClassifier(random_state=0, n_estimators=300)
rf.fit(X_train, y_train)

# Print accuracy on the training split, then on the held-out test split.
for caption, split_X, split_y in (
    ('train set accuracy: ', X_train, y_train),
    ('test set accuracy: ', X_test, y_test),
):
    print(caption, metrics.accuracy_score(split_y, rf.predict(split_X)))

train set accuracy:  0.9793090621658065
test set accuracy:  0.8438923844651768


In [11]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of the random forest trained without a depth
# argument, evaluated on the test split (fixed seed); rebinds `perm`.
perm = PermutationImportance(rf, random_state=1)
perm.fit(X_test, y_test)

# Render the weight table, labelled with the test frame's column names.
eli5.show_weights(perm, feature_names=list(X_test.columns))

Weight,Feature
0.0464  ± 0.0033,capital.gain
0.0256  ± 0.0075,education.num
0.0213  ± 0.0050,marital.status
0.0123  ± 0.0027,age
0.0112  ± 0.0012,capital.loss
0.0097  ± 0.0062,occupation
0.0083  ± 0.0023,hours.per.week
0.0058  ± 0.0026,relationship
0.0032  ± 0.0020,workclass
0.0008  ± 0.0012,native.country


### max_depth=15 설정 후 랜덤 포레스트

In [13]:
# Random forest (300 trees, fixed seed) with tree depth capped at 15.
rf1 = RandomForestClassifier(max_depth=15, n_estimators=300, random_state=0)
rf1.fit(X_train, y_train)

# Print accuracy on the training split, then on the held-out test split.
for caption, split_X, split_y in (
    ('train set accuracy: ', X_train, y_train),
    ('test set accuracy: ', X_test, y_test),
):
    print(caption, metrics.accuracy_score(split_y, rf1.predict(split_X)))

train set accuracy:  0.9086808945924583
test set accuracy:  0.8600564113690605


In [14]:
# Permutation importance of the depth-15 forest on the test split
# (fixed seed); rebinds `perm`.
perm = PermutationImportance(rf1, random_state=1)
perm.fit(X_test, y_test)

# Render the weight table, labelled with the test frame's column names.
eli5.show_weights(perm, feature_names=list(X_test.columns))

Weight,Feature
0.0468  ± 0.0036,capital.gain
0.0378  ± 0.0058,education.num
0.0253  ± 0.0054,marital.status
0.0152  ± 0.0034,age
0.0115  ± 0.0011,capital.loss
0.0108  ± 0.0022,relationship
0.0084  ± 0.0031,occupation
0.0079  ± 0.0014,hours.per.week
0.0038  ± 0.0013,workclass
0.0001  ± 0.0005,race


### max_depth=10 설정 후 랜덤 포레스트

In [15]:
# Random forest (300 trees, fixed seed) with tree depth capped at 10.
rf2 = RandomForestClassifier(max_depth=10, n_estimators=300, random_state=0)
rf2.fit(X_train, y_train)

# Print accuracy on the training split, then on the held-out test split.
for caption, split_X, split_y in (
    ('train set accuracy: ', X_train, y_train),
    ('test set accuracy: ', X_test, y_test),
):
    print(caption, metrics.accuracy_score(split_y, rf2.predict(split_X)))

train set accuracy:  0.8712512205328498
test set accuracy:  0.8568019093078759


In [16]:
# Permutation importance of the depth-10 forest on the test split (fixed seed).
perm2 = PermutationImportance(rf2, random_state=1)
perm2.fit(X_test, y_test)

# Render the weight table, labelled with the test frame's column names.
eli5.show_weights(perm2, feature_names=list(X_test.columns))

Weight,Feature
0.0484  ± 0.0041,capital.gain
0.0465  ± 0.0036,education.num
0.0176  ± 0.0044,marital.status
0.0136  ± 0.0031,relationship
0.0124  ± 0.0016,age
0.0110  ± 0.0009,capital.loss
0.0062  ± 0.0008,hours.per.week
0.0027  ± 0.0015,occupation
0.0021  ± 0.0008,workclass
0.0018  ± 0.0008,sex


### max_depth=8 설정 후 랜덤 포레스트

In [18]:
# Random forest (300 trees, fixed seed) with tree depth capped at 8.
rf3 = RandomForestClassifier(max_depth=8, n_estimators=300, random_state=0)
rf3.fit(X_train, y_train)

# Print accuracy on the training split, then on the held-out test split.
for caption, split_X, split_y in (
    ('train set accuracy: ', X_train, y_train),
    ('test set accuracy: ', X_test, y_test),
):
    print(caption, metrics.accuracy_score(split_y, rf3.predict(split_X)))

train set accuracy:  0.8588831543218487
test set accuracy:  0.8525710566283359


In [20]:
# Permutation importance of the depth-8 forest on the test split (fixed seed).
perm3 = PermutationImportance(rf3, random_state=1)
perm3.fit(X_test, y_test)

# Render the weight table, labelled with the test frame's column names.
eli5.show_weights(perm3, feature_names=list(X_test.columns))

Weight,Feature
0.0481  ± 0.0035,capital.gain
0.0465  ± 0.0045,education.num
0.0194  ± 0.0027,marital.status
0.0113  ± 0.0008,capital.loss
0.0109  ± 0.0016,age
0.0101  ± 0.0025,relationship
0.0038  ± 0.0018,hours.per.week
0.0006  ± 0.0006,occupation
0.0004  ± 0.0002,native.country
0.0002  ± 0.0004,sex


### max_depth=6 설정 후 랜덤 포레스트

In [21]:
# Random forest (300 trees, fixed seed) with tree depth capped at 6.
rf4 = RandomForestClassifier(max_depth=6, n_estimators=300, random_state=0)
rf4.fit(X_train, y_train)

# Print accuracy on the training split, then on the held-out test split.
for caption, split_X, split_y in (
    ('train set accuracy: ', X_train, y_train),
    ('test set accuracy: ', X_test, y_test),
):
    print(caption, metrics.accuracy_score(split_y, rf4.predict(split_X)))

train set accuracy:  0.853629051006649
test set accuracy:  0.8496420047732697


In [22]:
# Permutation importance of the depth-6 forest on the test split (fixed seed).
perm4 = PermutationImportance(rf4, random_state=1)
perm4.fit(X_test, y_test)

# Render the weight table, labelled with the test frame's column names.
eli5.show_weights(perm4, feature_names=list(X_test.columns))

Weight,Feature
0.0491  ± 0.0036,capital.gain
0.0470  ± 0.0042,education.num
0.0240  ± 0.0027,marital.status
0.0148  ± 0.0032,relationship
0.0101  ± 0.0014,age
0.0100  ± 0.0013,capital.loss
0.0023  ± 0.0020,hours.per.week
0.0003  ± 0.0002,workclass
0.0003  ± 0.0003,native.country
-0.0000  ± 0.0005,sex
