In [1]:
# Core numerical / tabular stack
import numpy as np
import pandas as pd

# Tree visualisation
import graphviz

# Boruta all-relevant feature selection
from boruta import BorutaPy

# scikit-learn
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, make_scorer, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
# export_text is taken from the public sklearn.tree namespace: the private
# sklearn.tree.export module was deprecated in scikit-learn 0.22 and removed in 0.24.
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree



## ファイルから初期読み込み

In [2]:
# Load the training data from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
train_pkl = pd.read_pickle('./train.pk1')

In [3]:
# Inspect the (rows, columns) shape of the loaded training frame.
train_pkl.shape

(850, 11)

## 訓練データとテストデータに分割

In [4]:
# Split into training and test sets: 20% of the rows are held out,
# and random_state pins the shuffle so the split is repeatable.
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [5]:
# Separate the target from the features.
# The target is removed by name instead of slicing off "the last column", so the
# features and the target stay consistent even if the column order of the frame changes.
train_X = train.drop(columns='disease')
train_y = train['disease']

In [6]:
# Confirm the feature matrix and the target have the same number of rows.
train_X.shape, train_y.shape

((680, 10), (680,))

In [7]:
# Check which container types hold the features and the target.
type(train_X), type(train_y)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

## max_depth=7　で確認

In [14]:
# Base estimator for the Boruta run below: a random forest capped at depth 7 with a fixed seed.
RF = RandomForestClassifier(max_depth=7, random_state=42)

In [15]:
# Wrap the forest in Boruta.
# n_estimators='auto' leaves the tree count to Boruta (the fitted repr below reports the value it picked).
# verbose=2 prints the confirmed / tentative / rejected counts on every iteration.
# NOTE(review): two_step=False presumably selects the original Boruta correction scheme — confirm against the BorutaPy docs.
feat_sel = BorutaPy(RF, n_estimators='auto', two_step=False, verbose=2, random_state=42)

In [16]:
# Run the feature selection on the training split only; .values passes plain NumPy arrays.
feat_sel.fit(train_X.values, train_y.values)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	8
Tentative: 	1
Rejected: 	1
Iteration: 	9 / 100
Confirmed: 	8
Tentative: 	1
Rejected: 	1
Iteration: 	10 / 100
Confirmed: 	8
Tentative: 	1
Rejected: 	1
Iteration: 	11 / 100
Confirmed: 	8
Tentative: 	1
Rejected: 	1
Iteration: 	12 / 100
Confirmed: 	8
Tentative: 	1
Rejected: 	1
Iteration: 	13 / 100
Confirmed: 	8
Tentative: 	1
Rejected: 	1
Iteration: 	14 / 100
Confirmed: 	8
Tentative: 	1
Rejected: 	1
Iteration: 	15 / 100
Confirmed: 	8
Tentative: 	0
Rejected: 	2


BorutaPy finished running.

Iteration: 	16 / 100
Confirmed: 	8
Tentati

BorutaPy(alpha=0.05,
         estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                          class_weight=None, criterion='gini',
                                          max_depth=7, max_features='auto',
                                          max_leaf_nodes=None, max_samples=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=60, n_jobs=None,
                                          oob_score=False,
                                          random_state=RandomState(MT19937) at 0x238FAE64150,
                                          verbose=0, warm_start=False),
         max_iter=100, n_estimators='aut

## 予測に使う特徴量

In [17]:
# Boolean mask over the input columns; True marks a feature Boruta kept.
feat_sel.support_

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
       False])

In [18]:
# Names of the features kept by Boruta.
train_X.columns[feat_sel.support_]

Index(['T_Bil', 'D_Bil', 'ALP', 'ALT_GPT', 'AST_GOT', 'TP', 'Alb', 'AG_ratio'], dtype='object')

## 不要な特徴量

In [19]:
# Names of the features not kept by Boruta (inverse of the support mask).
train_X.columns[~feat_sel.support_]

Index(['Age', 'Gender_Male'], dtype='object')

## 必要な特徴量で再実行

In [20]:
# Keep only the columns flagged True in the Boruta support mask.
train_X_sel = train_X.loc[:, feat_sel.support_]

In [21]:
# Final classifier: 500 trees, no depth cap (max_depth left at its default), fixed seed.
# NOTE(review): this rebinds RF, which previously named the depth-7 estimator given to Boruta.
RF = RandomForestClassifier(n_estimators=500, random_state=42)

In [22]:
# Fit the final forest on the selected features only.
RF.fit(train_X_sel.values, train_y.values)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [25]:
# Pair every importance (rounded to 3 decimals) with its feature name and
# list the pairs from the most to the least important.
sorted(
    (
        (round(score, 3), name)
        for score, name in zip(RF.feature_importances_, train_X_sel.columns)
    ),
    reverse=True,
)

[(0.202, 'T_Bil'),
 (0.17, 'AST_GOT'),
 (0.156, 'ALT_GPT'),
 (0.137, 'D_Bil'),
 (0.112, 'ALP'),
 (0.101, 'AG_ratio'),
 (0.064, 'TP'),
 (0.058, 'Alb')]

In [13]:
# Separate the target from the features for the held-out test split.
# The target is dropped by name and the selected features are picked by the
# training column names, so the test matrix is guaranteed to line up with the
# columns the model was fitted on regardless of column order.
test_x = test.drop(columns='disease')
test_x_sel = test_x.loc[:, train_X.columns[feat_sel.support_]]
test_y = test['disease']

In [14]:
# Predict labels for the held-out test rows using the selected features.
pred_y = RF.predict(test_x_sel.values)

In [15]:
# Confusion matrix on the held-out split: rows are the true labels, columns the predictions.
confusion_matrix(y_true=test_y.values, y_pred=pred_y)

array([[89,  4],
       [15, 62]], dtype=int64)

In [16]:
# Fraction of held-out rows whose label was predicted correctly.
accuracy_score(y_true=test_y.values, y_pred=pred_y)

0.888235294117647

## 検証データで実行

In [17]:
# Load the validation data.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
valid = pd.read_pickle('./test.pk1')

In [18]:
# Inspect the (rows, columns) shape of the validation frame.
valid.shape

(350, 10)

In [19]:
# Save the IDs (disabled).
# NOTE(review): this cell contains only commented-out code — delete it or restore it.
# valid_pass = valid.id.values

In [20]:
# Work on a copy so the loaded validation frame itself is left untouched.
valid_X = valid.copy()

In [21]:
# Reduce the validation data to the features Boruta kept.
# The frame is derived from `valid` (not from `valid_X` itself) and selected by
# column name, so the cell can be re-run safely and does not depend on the
# validation columns being in the same order as the training columns.
valid_X = valid.loc[:, train_X.columns[feat_sel.support_]]

In [22]:
# Summary statistics of the selected validation features.
valid_X.describe()

Unnamed: 0,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio
count,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
mean,1.631836,0.565918,282.23407,32.363522,61.545425,7.089844,3.5625,1.154297
std,2.857422,1.74707,201.307434,83.888397,114.916496,0.87793,0.607422,0.248047
min,0.609863,0.053864,175.747528,7.862773,11.278741,4.96875,2.296875,0.668945
25%,0.787109,0.147705,214.211426,13.551174,20.84291,6.724609,3.12793,1.008789
50%,0.844971,0.193848,220.738617,16.449139,25.971273,6.931641,3.621094,1.216797
75%,0.973877,0.23645,231.839767,22.760056,52.746082,7.553711,3.739258,1.288086
max,27.046875,17.703125,2101.145752,860.919067,705.777161,8.75,5.007812,1.804688


In [23]:
# Compare shapes: valid_X holds only the selected columns, train_X still holds every feature.
valid_X.shape, train_X.shape

((350, 8), (680, 10))

In [24]:
# Predict labels for the validation rows with the fitted forest.
pred_valid_y = RF.predict(valid_X.values)

In [25]:
# Check the number of predictions produced.
pred_valid_y.shape

(350,)

In [26]:
# Wrap the predictions in a single-column frame.
# NOTE(review): the frame gets a fresh 0..n-1 index rather than valid_X's index —
# confirm that is what the output file should be keyed on.
result_df = pd.DataFrame(pred_valid_y)

In [27]:
# Peek at the first few predictions.
result_df.head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [28]:
# Write the predictions to CSV without a header row; the frame's index is
# written as the first column (to_csv's default).
result_df.to_csv("./RF_3.csv", header=False)

In [29]:
# Peek at the training features (all columns, before selection).
train_X.head()

Unnamed: 0,Age,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,Gender_Male
332,44,2.066406,0.631348,213.935623,14.563273,47.154297,7.503906,2.302734,0.759766,1
383,36,0.817871,0.197021,214.644638,15.622564,21.059477,7.511719,3.630859,1.291016,1
281,20,0.791992,0.082642,358.339508,12.924613,25.77248,8.648438,4.324219,1.319336,1
2,65,0.657227,0.081299,320.770538,12.625011,30.61318,5.949219,2.488281,0.774902,1
231,29,1.714844,0.51123,215.885971,27.66971,60.709866,5.953125,3.117188,1.21875,1


In [30]:
# Shape of the unreduced training feature matrix.
train_X.shape

(680, 10)

In [31]:
# Features not kept by the current Boruta fit; these are the columns dropped below.
train_X.columns[~feat_sel.support_]

Index(['Age', 'Gender_Male'], dtype='object')

In [32]:
# Remove the unselected feature columns from the full (unsplit) training frame;
# the target column is not among them and therefore stays.
train_save = train_pkl.drop(train_X.columns[~feat_sel.support_], axis="columns")

In [37]:
# Compare shapes of the training split, the reduced full frame and the original full frame.
train.shape, train_save.shape, train_pkl.shape

((680, 11), (850, 9), (850, 11))

In [34]:
# Peek at the reduced training frame.
train_save.head(5)

Unnamed: 0,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,disease
0,0.787109,0.150513,220.178696,13.467617,21.729246,6.816406,3.111328,1.006836,0
1,1.003906,0.195679,221.218414,51.033463,64.752319,6.890625,3.050781,0.751465,1
2,0.657227,0.081299,320.770538,12.625011,30.61318,5.949219,2.488281,0.774902,0
3,0.906738,0.214233,369.278168,34.347599,54.510086,6.96875,3.613281,0.988281,1
4,1.735352,0.197754,222.782028,20.572891,170.010178,5.835938,3.068359,1.026367,0


In [39]:
# Shape of the reduced validation frame.
valid_X.shape

(350, 8)

In [42]:
# Persist the reduced training frame.
train_save.to_pickle('train.pk2')

In [None]:
# Persist the reduced validation frame.
# NOTE(review): this cell has no execution count, so 'test.pk2' may never have
# been written — run it (and confirm the notebook survives Restart & Run All).
valid_X.to_pickle('test.pk2')

In [44]:
# Peek at the reduced training frame that was saved.
train_save.head(5)

Unnamed: 0,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,disease
0,0.787109,0.150513,220.178696,13.467617,21.729246,6.816406,3.111328,1.006836,0
1,1.003906,0.195679,221.218414,51.033463,64.752319,6.890625,3.050781,0.751465,1
2,0.657227,0.081299,320.770538,12.625011,30.61318,5.949219,2.488281,0.774902,0
3,0.906738,0.214233,369.278168,34.347599,54.510086,6.96875,3.613281,0.988281,1
4,1.735352,0.197754,222.782028,20.572891,170.010178,5.835938,3.068359,1.026367,0


In [43]:
# Peek at the reduced validation frame.
valid_X.head(5)

Unnamed: 0,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio
0,0.801758,0.186768,214.448685,15.260516,19.496649,6.816406,3.601562,1.288086
1,0.834961,0.116455,234.13681,10.509159,16.733311,6.796875,3.111328,1.030273
2,0.791992,0.197998,216.039902,14.578304,20.695866,7.523438,3.626953,1.27832
3,0.833984,0.175781,200.901123,12.820356,20.102516,6.828125,3.626953,1.275391
4,0.69043,0.135498,208.705841,20.279018,25.096685,8.664062,4.171875,1.280273


In [47]:
# Repeat the Boruta selection with a shallower base forest (max_depth=5).
# NOTE(review): this rebinds RF, which until now named the fitted 500-tree classifier.
RF = RandomForestClassifier(max_depth=5, random_state=42)

In [51]:
# New Boruta wrapper around the depth-5 forest; verbose is left at its default here.
# NOTE(review): this rebinds feat_sel — earlier cells that read feat_sel.support_
# will see the new mask if they are re-run afterwards.
feat_sel = BorutaPy(RF, n_estimators='auto', two_step=False, random_state=42)

In [52]:
# Run the depth-5 feature selection on the training split.
feat_sel.fit(train_X.values, train_y.values)

BorutaPy(alpha=0.05,
         estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                          class_weight=None, criterion='gini',
                                          max_depth=5, max_features='auto',
                                          max_leaf_nodes=None, max_samples=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=84, n_jobs=None,
                                          oob_score=False,
                                          random_state=RandomState(MT19937) at 0x2BC1184FBA0,
                                          verbose=0, warm_start=False),
         max_iter=100, n_estimators='aut

In [53]:
# Support mask produced by the depth-5 run.
feat_sel.support_

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
       False])

In [54]:
# Repeat the Boruta selection once more with an even shallower base forest (max_depth=3).
# NOTE(review): RF is rebound again; consider a distinct name per experiment.
RF = RandomForestClassifier(max_depth=3, random_state=42)

In [55]:
# New Boruta wrapper around the depth-3 forest.
# NOTE(review): feat_sel is rebound again; every later use of feat_sel.support_
# refers to this depth-3 run.
feat_sel = BorutaPy(RF, n_estimators='auto', two_step=False, random_state=42)

In [56]:
# Run the depth-3 feature selection on the training split.
feat_sel.fit(train_X.values, train_y.values)

BorutaPy(alpha=0.05,
         estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                          class_weight=None, criterion='gini',
                                          max_depth=3, max_features='auto',
                                          max_leaf_nodes=None, max_samples=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=141, n_jobs=None,
                                          oob_score=False,
                                          random_state=RandomState(MT19937) at 0x2BC1184FCA8,
                                          verbose=0, warm_start=False),
         max_iter=100, n_estimators='au

In [57]:
# Support mask produced by the depth-3 run.
feat_sel.support_

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False])

In [58]:
# Names of the features kept by the depth-3 run.
train_X.columns[feat_sel.support_]

Index(['Age', 'T_Bil', 'D_Bil', 'ALP', 'ALT_GPT', 'AST_GOT', 'TP', 'Alb',
       'AG_ratio'],
      dtype='object')

In [59]:
# Names of the features not kept by the depth-3 run.
train_X.columns[~feat_sel.support_]

Index(['Gender_Male'], dtype='object')

In [60]:
# Rebuild the reduced training frame from the full data using the depth-3 selection.
train_save = train_pkl.drop(train_X.columns[~feat_sel.support_], axis="columns")

In [61]:
# Persist the training frame reduced with the depth-3 selection.
train_save.to_pickle('train.pk3')

In [62]:
# Shape of the reduced training frame that was just saved.
train_save.shape

(850, 10)

In [66]:
# Apply the same column removal to the validation data, starting from the full `valid` frame.
test_save = valid.drop(train_X.columns[~feat_sel.support_], axis="columns")

In [68]:
# Verify which columns remain in the reduced validation frame and its shape.
test_save.columns, test_save.shape

(Index(['Age', 'T_Bil', 'D_Bil', 'ALP', 'ALT_GPT', 'AST_GOT', 'TP', 'Alb',
        'AG_ratio'],
       dtype='object'),
 (350, 9))

In [69]:
# Persist the validation frame reduced with the depth-3 selection.
test_save.to_pickle('test.pk3')