## ランダムフォレスト　デフォルト

In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import tree


In [2]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('./train.pk3')

In [3]:
train_pkl.shape

(55323, 13)

In [4]:
train_pkl.describe()

Unnamed: 0,id,accommodates,bathrooms,bedrooms,beds,number_of_reviews,y,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,room_type_Private room,room_type_Shared room
count,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0,55323.0
mean,27787.181588,0.08736,0.228404,0.111274,0.238273,0.217685,160.307341,0.257253,0.437467,0.001356,0.000235,0.413806,0.028975
std,16043.404061,0.790027,0.781828,0.79416,0.712608,0.858604,168.266655,0.437124,0.496079,0.036795,0.015328,0.492519,0.167739
min,0.0,-1.0,-2.0,-1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13892.5,0.0,0.0,0.0,0.0,0.0,74.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27778.0,0.0,0.0,0.0,0.0,0.0,111.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,41681.5,0.0,0.0,0.0,0.0,0.0,185.0,1.0,1.0,0.0,0.0,1.0,0.0
max,55582.0,5.0,12.0,10.0,12.0,19.0,1999.0,1.0,1.0,1.0,1.0,1.0,1.0


## 訓練データとテストデータに分割

In [5]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [6]:
# ターゲットと特徴量の分割
train_X_tmp = train.copy()
train_X_tmp.drop(['id', 'y'], axis=1, inplace=True)
train_X = train_X_tmp.iloc[:].values
train_y = train.y.values

## 訓練データで訓練

In [7]:
RF = RandomForestRegressor(random_state=42)

In [8]:
RF = RF.fit(train_X, train_y)

In [9]:
RF.feature_importances_

array([7.83895787e-02, 2.08569196e-01, 4.32407445e-01, 5.37340303e-02,
       1.26017557e-02, 2.02331733e-02, 2.12559475e-02, 3.23424498e-04,
       1.29561495e-03, 1.35467020e-01, 3.57228147e-02])

In [11]:
sorted(
    zip(map(lambda x: round(x, 3), RF.feature_importances_), train.iloc[:, [2, 3, 4, 5, 7, 8, 9, 10, 11]].columns),
    reverse=True)

[(0.432, 'beds'),
 (0.209, 'bedrooms'),
 (0.078, 'bathrooms'),
 (0.054, 'number_of_reviews'),
 (0.021, 'cancellation_policy_super_strict_30'),
 (0.02, 'cancellation_policy_strict'),
 (0.013, 'cancellation_policy_moderate'),
 (0.001, 'room_type_Private room'),
 (0.0, 'cancellation_policy_super_strict_60')]

In [12]:
# 訓練済みの決定木を視覚化
# dot_data = tree.export_graphviz(RF, out_file=None,
#                                feature_names=train.iloc[:, 1:].columns,
#                                class_names=train.Survived.name,
#                                rounded=True,
#                                filled=True,
#                                special_characters=True)

## テストデータで実行

In [13]:
# ターゲットと特徴量の分割
test_X_tmp = test.copy()
test_X_tmp.drop(['id', 'y'], axis=1, inplace=True)
test_x = test_X_tmp.iloc[:].values
test_y = test.y.values

In [14]:
test_x.shape, test_y.shape

((11065, 11), (11065,))

In [15]:
pred_y = RF.predict(test_x)

In [16]:
np.sqrt(mean_squared_error(test_y, pred_y))

132.7941273879188

## 検証データで実行

In [17]:
# 検証データ読み込み
valid = pd.read_pickle('./test.pk3')

In [18]:
valid.shape

(18528, 12)

In [19]:
# ID の保存
valid_pass = valid.id.values

In [20]:
valid_X = valid.iloc[:, 1:].values

In [21]:
# valid_X.describe()

In [22]:
valid_X.shape, train_X.shape

((18528, 11), (44258, 11))

In [23]:
pred_valid_y = RF.predict(valid_X)

In [24]:
pred_valid_y.shape

(18528,)

In [25]:
type(valid_pass), type(pred_valid_y)

(numpy.ndarray, numpy.ndarray)

In [26]:
result_df = pd.DataFrame(pred_valid_y, valid_pass, columns=['y'])

In [27]:
result_df.head()

Unnamed: 0,y
0,315.4703
1,160.041064
2,125.605443
3,174.596701
4,174.596701


In [28]:
result_df.to_csv("./RF_7.csv", header=False)