## 決定木デフォルト（max_depth=5, random_state=42 を指定）

In [1]:
import numpy as np
import pandas as pd

from sklearn import tree
from sklearn.metrics import  mean_squared_error
from sklearn.model_selection import train_test_split

import graphviz

In [2]:
# Load the training DataFrame from a pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
train_pkl = pd.read_pickle('./train.pk3')

In [3]:
# Show (rows, columns) of the loaded training frame.
train_pkl.shape

(55323, 12)

In [4]:
# Summary statistics of the numeric columns.
train_pkl.describe()

Unnamed: 0,id,accommodates,bathrooms,bedrooms,beds,host_response_rate,number_of_reviews,y,bed_type_flag,property_type_num,cancel_policy_flag,city_flag
count,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0
mean,27787.181588,0.08736,0.248414,0.111274,0.238273,2.3118360000000003e-17,0.204165,160.307341,1.950943,0.999259,0.001826,0.295103
std,16043.404061,0.790027,0.782466,0.79416,0.712608,1.000009,0.752675,168.266655,0.295932,0.075455,0.047878,0.616145
min,0.0,-1.0,-2.0,-1.0,-1.0,-1.651052,0.0,1.0,0.0,0.0,0.0,0.0
25%,13892.5,0.0,0.0,0.0,0.0,-1.651052,0.0,74.0,2.0,1.0,0.0,0.0
50%,27778.0,0.0,0.0,0.0,0.0,0.6706177,0.0,111.0,2.0,1.0,0.0,0.0
75%,41681.5,0.0,0.0,0.0,0.0,0.6706177,0.0,185.0,2.0,1.0,0.0,0.0
max,55582.0,5.0,11.0,10.0,12.0,0.6706177,15.0,1999.0,2.0,2.0,2.0,2.0


In [5]:
# Peek at the first rows to check the column layout.
train_pkl.head()

Unnamed: 0,id,accommodates,bathrooms,bedrooms,beds,host_response_rate,number_of_reviews,y,bed_type_flag,property_type_num,cancel_policy_flag,city_flag
0,0,1,1,0,1,-1.651052,0,138.0,2,1,0,0
1,1,0,0,0,0,0.670618,0,42.0,2,1,0,1
2,2,0,1,0,0,0.670618,0,65.0,2,1,0,0
3,3,0,0,0,0,0.670618,0,166.0,2,1,0,2
4,4,0,0,0,0,0.670618,0,165.0,2,1,0,0


## 訓練データとテストデータに分割

In [6]:
# Split into training and test data: 20% of the rows are held out as `test`.
# The fixed random_state makes the split reproducible across runs.
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [7]:
# Separate the target from the features.
# 'id' and the target 'y' are excluded from the feature frame; drop() returns a new
# frame, so `train` itself is left untouched.
train_X_tmp = train.drop(columns=['id', 'y'])
train_X = train_X_tmp.values
train_y = train['y'].values

In [8]:
# Sanity check: feature matrix, target vector and feature frame should have the same row count.
train_X.shape, train_y.shape, train_X_tmp.shape

((44258, 10), (44258,), (44258, 10))

## 訓練データで訓練

In [9]:
# Regression tree capped at depth 5; random_state is fixed so the fitted tree is reproducible.
DT = tree.DecisionTreeRegressor(max_depth=5, random_state=42)

In [10]:
# Fit the tree on the training features and target.
DT = DT.fit(train_X, train_y)

In [11]:
# Check the name of the target column.
train.y.name

'y'

In [12]:
# Raw per-feature importances of the fitted tree, one value per column of train_X.
DT.feature_importances_

array([0.12166478, 0.22651355, 0.52905377, 0.00453862, 0.05125604,
       0.        , 0.        , 0.00316496, 0.        , 0.06380828])

In [13]:
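# train_X is a NumPy array (built with .values), so it has no .columns attribute;
# the feature names are available as train_X_tmp.columns.
# train_X.columns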

In [16]:
# Print the features ranked by importance, highest first.
print("特徴量の重要度が高い順：")
# Feature names are taken from train_X_tmp (the frame the model was fitted on) instead of
# hard-coded column positions, so names and importances cannot drift out of step if the
# column layout of `train` changes.
# sorted(..., reverse=True) orders the (importance, name) pairs in descending order.
print(sorted(
    [(round(importance, 3), feature)
     for importance, feature in zip(DT.feature_importances_, train_X_tmp.columns)],
    reverse=True))

特徴量の重要度が高い順：
[(0.529, 'bedrooms'), (0.227, 'bathrooms'), (0.122, 'accommodates'), (0.064, 'city_flag'), (0.051, 'host_response_rate'), (0.005, 'beds'), (0.003, 'property_type_num'), (0.0, 'number_of_reviews'), (0.0, 'cancel_policy_flag'), (0.0, 'bed_type_flag')]


In [None]:
# len(train.iloc[:, 1:].columns), train.iloc[:, 1:].columns

In [None]:
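# Visualize the trained decision tree (currently disabled).
# NOTE(review): before re-enabling, fix feature_names below. train.iloc[:, 1:-1] includes the
# target 'y' and omits 'city_flag', so it does not match the columns the model was fitted on
# (train_X_tmp.columns does). Also confirm whether class_names is meaningful for a regressor.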
# dot_data = tree.export_graphviz(DT, out_file=None,
#                                feature_names=train.iloc[:, 1:-1].columns,
#                                class_names=train.y.name,
#                                rounded=True,
#                                filled=True,
#                                special_characters=True)

In [None]:
# graph = graphviz.Source(dot_data)
# graph

In [None]:
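# NOTE(review): '.\DT.png' is a Windows-style path that relies on '\D' not being an escape
# sequence; prefer './DT.png'. Also confirm that graphviz.Source actually provides write()
# before re-enabling (render() is the usual way to save a graph).
# graph.write('.\DT.png')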

In [None]:
# Confirm the container type of the training feature matrix.
type(train_X)

In [17]:
# Features ranked by importance (highest first), shown as (importance, name) pairs.
# Names come from train_X_tmp.columns (the frame the model was fitted on) rather than
# hard-coded column positions, so they always line up with DT.feature_importances_.
sorted(
    [(round(importance, 3), feature)
     for importance, feature in zip(DT.feature_importances_, train_X_tmp.columns)],
    reverse=True)

[(0.529, 'bedrooms'),
 (0.227, 'bathrooms'),
 (0.122, 'accommodates'),
 (0.064, 'city_flag'),
 (0.051, 'host_response_rate'),
 (0.005, 'beds'),
 (0.003, 'property_type_num'),
 (0.0, 'number_of_reviews'),
 (0.0, 'cancel_policy_flag'),
 (0.0, 'bed_type_flag')]

## テストデータで実行

In [18]:
# Separate the target from the features for the held-out test split.
# 'id' and the target 'y' are excluded from the feature frame; drop() returns a new
# frame, so `test` itself is left untouched.
test_X_tmp = test.drop(columns=['id', 'y'])
test_x = test_X_tmp.values
test_y = test['y'].values

In [19]:
# Sanity check: test features and targets should have the same row count.
test_x.shape, test_y.shape

((11065, 10), (11065,))

In [20]:
# Predict the target for the held-out test features.
pred_y = DT.predict(test_x)

In [21]:
# One prediction per test row is expected.
pred_y.shape

(11065,)

In [22]:
# Root-mean-squared error of the predictions on the held-out test split.
np.sqrt(mean_squared_error(test_y, pred_y))

135.82025247872855

In [23]:
# Peek at the first rows of the held-out test split.
test.head()

Unnamed: 0,id,accommodates,bathrooms,bedrooms,beds,host_response_rate,number_of_reviews,y,bed_type_flag,property_type_num,cancel_policy_flag,city_flag
43933,43933,0,0,0,0,0.670618,0,75.0,2,1,0,1
277,277,0,0,0,0,0.670618,0,79.0,2,1,0,0
1799,1799,0,0,0,0,0.670618,2,41.0,0,1,0,0
49693,49693,0,0,-1,0,0.438451,0,145.0,2,1,0,0
14870,14870,1,1,0,2,0.670618,0,175.0,2,1,0,0


## 検証データで実行

In [None]:
# Load the validation data from a pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
valid = pd.read_pickle('./test.pk3')

In [None]:
# Show (rows, columns) of the validation frame.
valid.shape

In [None]:
# Keep the IDs aside as an array.
valid_pass = valid.id.values

In [None]:
# Build the validation feature matrix by column NAME, in exactly the order the model was
# trained on (train_X_tmp.columns). Slicing by position (iloc[:, 1:]) would silently feed
# the tree mis-ordered features if this file's column layout differed from the training
# file's; selecting by name raises KeyError instead when a training feature is missing.
valid_X = valid[train_X_tmp.columns].values

In [None]:
# valid_X is a NumPy array (built with .values), so DataFrame.describe() is not available on it.
# valid_X.describe()

In [None]:
# Compare the validation and training feature matrices; their column counts should agree.
valid_X.shape, train_X.shape

In [None]:
# Predict the target for the validation features with the fitted tree.
pred_valid_y = DT.predict(valid_X)

In [None]:
# One prediction per validation row is expected.
pred_valid_y.shape

In [None]:
# Check the container types of the IDs and the predictions.
type(valid_pass), type(pred_valid_y)

In [None]:
# Assemble the predictions into a single-column frame ('y') indexed by the saved IDs.
result_df = pd.DataFrame(data=pred_valid_y, index=valid_pass, columns=['y'])

In [None]:
# Peek at the first predictions before writing them out.
result_df.head()

In [None]:
# Write the result frame to CSV without a header row (the index is written as the first column).
result_df.to_csv("./tree_8.csv", header=False)