### 決定木　パラメータチューニング

In [143]:
import numpy as np
import pandas as pd

from sklearn import tree
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import  mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree.export import export_text
from sklearn import preprocessing

import graphviz

In [2]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('./train.pk1')

In [3]:
pd.__version__

'1.0.3'

In [4]:
train_pkl.shape

(55175, 12)

In [5]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [6]:
# ターゲットと特徴量の分割
train_X = train.iloc[:, 1:-1].values
train_y = train.y.values

### 検証するパラメータの設定

In [7]:
grid_param = {
    'criterion': ['mse'],
    'splitter': ['best', 'random'],
    'max_depth': [i for i in range(1, 11, 2)],
    'min_samples_split': [i for i in range(2, 11, 2)],
    'min_samples_leaf': [i for i in range(1, 11, 2)],
    'random_state': [42]
}


In [144]:
grid_param = {
    'criterion': ['mse'],
    'splitter': ['best', 'random'],
    'max_depth': [i for i in range(5, 6)],
    'min_samples_split': [i for i in range(2, 11, 2)],
    'min_samples_leaf': [i for i in range(1, 11, 2)],
    'random_state': [42]
}


In [120]:
def rmse(y, y_pred):
    ret = np.sqrt(mean_squared_error(y, y_pred))
    print("RMSE=", ret)
    return ret

### グリッドサーチ

In [145]:
gs = GridSearchCV(estimator=DecisionTreeRegressor(**grid_param), param_grid=grid_param, scoring=make_scorer(mean_squared_error, greater_is_better=False), cv=5, return_train_score=False)

In [146]:
gs.fit(train_X, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion=['mse'],
                                             max_depth=[5], max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=[1, 3, 5, 7, 9],
                                             min_samples_split=[2, 4, 6, 8, 10],
                                             min_weight_fraction_leaf=0.0,
                                             presort='deprecated',
                                             random_state=[42],
                                             splitter=['best', 'random']),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['mse'], 'max_depth': [5],
                         'min_samples_leaf': [1,

### 一番良かったパラメータで学習

In [147]:
gs.best_score_

-16616.317990448915

In [149]:
np.sqrt(abs(gs.best_score_))

128.90429779665578

In [150]:
gs.best_params_

{'criterion': 'mse',
 'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 8,
 'random_state': 42,
 'splitter': 'best'}

In [151]:
# gs.cv_results_['mean_test_score']

In [152]:
DT = DecisionTreeRegressor(**gs.best_params_)

In [153]:
DT = DT.fit(train_X, train_y)

### ツリーの可視化

In [154]:
# tree.plot_tree(DT.fit(train_X, train_y))

In [155]:
# export_text(DT, feature_names=train.iloc[:, 1:].columns)

In [156]:
# train.Survived.name

In [157]:
len(train.iloc[:, 1:].columns), train.iloc[:, 1:].columns

(11,
 Index(['accommodates', 'bathrooms', 'bedrooms', 'beds', 'cleaning_fee',
        'host_has_profile_pic', 'host_identity_verified', 'instant_bookable',
        'number_of_reviews', 'review_scores_rating', 'y'],
       dtype='object'))

In [158]:
# 訓練済みの決定木を視覚化
dot_data = tree.export_graphviz(DT, out_file=None,
                               feature_names=train.iloc[:, 1:-1].columns,
                               class_names=train.y.name,
                               rounded=True,
                               filled=True,
                               special_characters=True)

In [159]:
# graph = graphviz.Source(dot_data)
# graph

In [160]:
# graph.write('.\DT.png')

In [161]:
type(train_X)

numpy.ndarray

In [162]:
DT.feature_importances_

array([0.0815247 , 0.31561454, 0.51632759, 0.00643602, 0.00474907,
       0.        , 0.        , 0.        , 0.0712031 , 0.00414497])

In [163]:
# 特徴量の重要度が高い順に表示
print("特徴量の重要度が高い順：")
# sorted：reverse=True 降順
print(sorted(
    zip(map(lambda x: round(x, 3), DT.feature_importances_), train.iloc[:, 1:-1].columns),
    reverse=True))

特徴量の重要度が高い順：
[(0.516, 'bedrooms'), (0.316, 'bathrooms'), (0.082, 'accommodates'), (0.071, 'number_of_reviews'), (0.006, 'beds'), (0.005, 'cleaning_fee'), (0.004, 'review_scores_rating'), (0.0, 'instant_bookable'), (0.0, 'host_identity_verified'), (0.0, 'host_has_profile_pic')]


In [164]:
sorted(
    zip(map(lambda x: round(x, 3), DT.feature_importances_), train.iloc[:, 1:-1].columns),
    reverse=True)

[(0.516, 'bedrooms'),
 (0.316, 'bathrooms'),
 (0.082, 'accommodates'),
 (0.071, 'number_of_reviews'),
 (0.006, 'beds'),
 (0.005, 'cleaning_fee'),
 (0.004, 'review_scores_rating'),
 (0.0, 'instant_bookable'),
 (0.0, 'host_identity_verified'),
 (0.0, 'host_has_profile_pic')]

### テストデータで予測

In [165]:
# ターゲットと特徴量の分割
test_x = test.iloc[:, 1:-1].values
test_y = test.y.values

In [166]:
test_x.shape, test_y.shape

((11035, 10), (11035,))

In [167]:
pred_y = DT.predict(test_x)

In [168]:
np.sqrt(mean_squared_error(test_y, pred_y))

135.85625736135438

### 検証データで予測

In [39]:
# 検証データ読み込み
valid = pd.read_pickle('./test.pk1')

In [40]:
valid.shape

(18528, 11)

In [41]:
# ID の保存
valid_pass = valid.id.values

In [42]:
valid_X = valid.iloc[:, 1:]

In [43]:
valid_X.shape, train_X.shape

((18528, 10), (44140, 10))

In [44]:
pred_valid_y = DT.predict(valid_X)

In [45]:
pred_valid_y.shape

(18528,)

In [46]:
type(valid_pass), type(pred_valid_y)

(numpy.ndarray, numpy.ndarray)

In [47]:
result_df = pd.DataFrame(pred_valid_y, valid_pass, columns=['y'])

In [48]:
result_df.to_csv("./tree_3.csv", header=False)