### 決定木　パラメータチューニング

In [1]:
import numpy as np
import pandas as pd

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import  mean_squared_error, make_scorer, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree.export import export_text
from sklearn import preprocessing

import graphviz

  return f(*args, **kwds)


In [2]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('./train.pk1')

In [3]:
pd.__version__

'1.0.3'

In [4]:
train_pkl.shape

(850, 11)

In [5]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [6]:
# ターゲットと特徴量の分割
train_X = train.iloc[:, :-1].values
train_y = train.disease.values

### 検証するパラメータの設定

In [16]:
grid_param = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [i for i in range(1, 11)],
    'min_samples_split': [i for i in range(2, 11)],
    'min_samples_leaf': [i for i in range(1, 11)],
    'random_state': [42]
}


In [144]:
# grid_param = {
#     'criterion':  ['gini', 'entropy'],
#     'splitter': ['best', 'random'],
#     'max_depth': [i for i in range(5, 6)],
#     'min_samples_split': [i for i in range(2, 11, 2)],
#     'min_samples_leaf': [i for i in range(1, 11, 2)],
#     'random_state': [42]
# }


In [19]:
# def rmse(y, y_pred):
#     ret = np.sqrt(mean_squared_error(y, y_pred))
#     print("RMSE=", ret)
#     return ret

### グリッドサーチ

In [17]:
gs = GridSearchCV(estimator=DecisionTreeClassifier(**grid_param), param_grid=grid_param, scoring='accuracy', cv=5, return_train_score=False)

In [18]:
gs.fit(train_X, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion=['gini', 'entropy'],
                                              max_depth=[1, 2, 3, 4, 5, 6, 7, 8,
                                                         9, 10],
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=[1, 2, 3, 4, 5,
                                                                6, 7, 8, 9,
                                                                10],
                                              min_samples_split=[2, 3, 4, 5, 6,
                                                                 7, 8, 9, 10],
  

### 一番良かったパラメータで学習

In [20]:
gs.best_score_

0.8264705882352942

In [21]:
gs.best_params_

{'criterion': 'entropy',
 'max_depth': 10,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'random_state': 42,
 'splitter': 'random'}

In [22]:
# gs.cv_results_['mean_test_score']

In [23]:
DT = DecisionTreeClassifier(**gs.best_params_)

In [24]:
DT = DT.fit(train_X, train_y)

### ツリーの可視化

In [154]:
# tree.plot_tree(DT.fit(train_X, train_y))

In [155]:
# export_text(DT, feature_names=train.iloc[:, 1:].columns)

In [156]:
# train.Survived.name

In [25]:
len(train.iloc[:, :-1].columns), train.iloc[:, :-1].columns

(10,
 Index(['Age', 'T_Bil', 'D_Bil', 'ALP', 'ALT_GPT', 'AST_GOT', 'TP', 'Alb',
        'AG_ratio', 'Gender_Male'],
       dtype='object'))

In [26]:
# 訓練済みの決定木を視覚化
dot_data = tree.export_graphviz(DT, out_file=None,
                               feature_names=train.iloc[:, :-1].columns,
                               class_names=train.disease.name,
                               rounded=True,
                               filled=True,
                               special_characters=True)

In [159]:
# graph = graphviz.Source(dot_data)
# graph

In [160]:
# graph.write('.\DT.png')

In [27]:
type(train_X)

numpy.ndarray

In [28]:
DT.feature_importances_

array([0.03601791, 0.31923671, 0.07087208, 0.03269852, 0.12286935,
       0.20612933, 0.04543266, 0.04340036, 0.10761065, 0.01573244])

In [29]:
# 特徴量の重要度が高い順に表示
print("特徴量の重要度が高い順：")
# sorted：reverse=True 降順
print(sorted(
    zip(map(lambda x: round(x, 3), DT.feature_importances_), train.iloc[:, :-1].columns),
    reverse=True))

特徴量の重要度が高い順：
[(0.319, 'T_Bil'), (0.206, 'AST_GOT'), (0.123, 'ALT_GPT'), (0.108, 'AG_ratio'), (0.071, 'D_Bil'), (0.045, 'TP'), (0.043, 'Alb'), (0.036, 'Age'), (0.033, 'ALP'), (0.016, 'Gender_Male')]


In [30]:
sorted(
    zip(map(lambda x: round(x, 3), DT.feature_importances_), train.iloc[:, :-1].columns),
    reverse=True)

[(0.319, 'T_Bil'),
 (0.206, 'AST_GOT'),
 (0.123, 'ALT_GPT'),
 (0.108, 'AG_ratio'),
 (0.071, 'D_Bil'),
 (0.045, 'TP'),
 (0.043, 'Alb'),
 (0.036, 'Age'),
 (0.033, 'ALP'),
 (0.016, 'Gender_Male')]

### テストデータで予測

In [32]:
# ターゲットと特徴量の分割
test_x = test.iloc[:, :-1].values
test_y = test.disease.values

In [33]:
test_x.shape, test_y.shape

((170, 10), (170,))

In [34]:
pred_y = DT.predict(test_x)

In [35]:
confusion_matrix(test_y, pred_y)

array([[87,  6],
       [23, 54]], dtype=int64)

In [36]:
accuracy_score(test_y, pred_y)

0.8294117647058824

### 検証データで予測

In [37]:
# 検証データ読み込み
valid = pd.read_pickle('./test.pk1')

In [38]:
valid.shape

(350, 10)

In [41]:
# ID の保存
# valid_pass = valid.id.values

In [39]:
# valid_X = valid.iloc[:, 1:]
valid_X = valid.copy()

In [40]:
valid_X.shape, train_X.shape

((350, 10), (680, 10))

In [41]:
pred_valid_y = DT.predict(valid_X)

In [42]:
pred_valid_y.shape

(350,)

In [46]:
# type(valid_pass), type(pred_valid_y)

(numpy.ndarray, numpy.ndarray)

In [43]:
result_df = pd.DataFrame(pred_valid_y)

In [44]:
result_df.to_csv("./tree_2.csv", header=False)