### ランダムフォレスト　パラメータチューニング

In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import tree


  return f(*args, **kwds)


In [2]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('./train.pk1')

In [3]:
train_pkl.shape

(55175, 12)

In [4]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [5]:
# ターゲットと特徴量の分割
train_X = train.iloc[:, 1:-1].values
train_y = train.y.values

### 検証するパラメータの設定

In [41]:
grid_param = {
    'n_estimators': [60, 80, 100, 120],
    'criterion': ['mse'],
    'max_depth': [i for i in range(1, 11, 2)],
    'min_samples_split': [i for i in range(2, 11, 2)],
    'min_samples_leaf': [i for i in range(1, 11, 2)],
    'random_state': [42]
}


In [45]:
grid_param = {
    'n_estimators': [60, 60, 80, 90, 100],
    'criterion': ['mse'],
    'max_depth': [5, 6, 7, 9],
    'min_samples_split': [i for i in range(2, 11, 2)],
    'min_samples_leaf': [i for i in range(1, 11, 2)],
    'random_state': [42]
}


In [46]:
def rmse(y, y_pred):
    ret = np.sqrt(mean_squared_error(y, y_pred))
    print("RMSE=", ret)
    return ret

### グリッドサーチ

In [47]:
gs = GridSearchCV(estimator=RandomForestRegressor(**grid_param), param_grid=grid_param, scoring=make_scorer(mean_squared_error, greater_is_better=False), cv=5, return_train_score=False)

In [48]:
gs.fit(train_X, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion=['mse'],
                                             max_depth=[5, 6, 7, 9],
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=[1, 3, 5, 7, 9],
                                             min_samples_split=[2, 4, 6, 8, 10],
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=[60, 60, 8...
                                             warm_start=False),
             iid='deprecated', n_jobs=None,
   

### 一番良かったパラメータで学習

In [49]:
gs.best_score_

-15776.774448573027

In [50]:
gs.best_params_

{'criterion': 'mse',
 'max_depth': 9,
 'min_samples_leaf': 7,
 'min_samples_split': 2,
 'n_estimators': 80,
 'random_state': 42}

In [51]:
RF = RandomForestRegressor(**gs.best_params_)

In [52]:
RF = RF.fit(train_X, train_y)

In [53]:
RF.feature_importances_

array([1.21140971e-01, 2.57377091e-01, 4.62388362e-01, 2.11461443e-02,
       2.59356629e-02, 3.16661454e-04, 1.30821263e-02, 8.11518155e-03,
       7.39561427e-02, 1.65416563e-02])

In [54]:
sorted(
    zip(map(lambda x: round(x, 3), RF.feature_importances_), train.iloc[:, 1:-1].columns),
    reverse=True)

[(0.462, 'bedrooms'),
 (0.257, 'bathrooms'),
 (0.121, 'accommodates'),
 (0.074, 'number_of_reviews'),
 (0.026, 'cleaning_fee'),
 (0.021, 'beds'),
 (0.017, 'review_scores_rating'),
 (0.013, 'host_identity_verified'),
 (0.008, 'instant_bookable'),
 (0.0, 'host_has_profile_pic')]

In [55]:
# 訓練済みの決定木を視覚化
# dot_data = tree.export_graphviz(RF, out_file=None,
#                                feature_names=train.iloc[:, 1:].columns,
#                                class_names=train.Survived.name,
#                                rounded=True,
#                                filled=True,
#                                special_characters=True)

### テストデータで予測

In [56]:
# ターゲットと特徴量の分割
test_x = test.iloc[:, 1:-1].values
test_y = test.y.values

In [57]:
test_x.shape, test_y.shape

((11035, 10), (11035,))

In [58]:
pred_y = RF.predict(test_x)

In [59]:
np.sqrt(mean_squared_error(test_y, pred_y))

132.4883249387457

### 検証データで予測

In [28]:
# 検証データ読み込み
valid = pd.read_pickle('./test.pk1')

In [29]:
valid.shape

(18528, 11)

In [31]:
# ID の保存
valid_pass = valid.id.values

In [32]:
valid_X = valid.iloc[:, 1:]

In [33]:
valid_X.describe()

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,cleaning_fee,host_has_profile_pic,host_identity_verified,instant_bookable,number_of_reviews,review_scores_rating
count,18528.0,18528.0,18528.0,18528.0,18528.0,18528.0,18528.0,18528.0,18528.0,18528.0
mean,3.161863,1.189713,1.266947,1.706822,0.266947,0.003022,0.32405,0.735427,20.790425,94.477386
std,2.155397,0.539284,0.86016,1.242269,0.442377,0.054895,0.468031,0.441117,37.457273,7.037526
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0
25%,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,93.0
50%,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,6.0,96.0
75%,4.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,23.0,99.0
max,16.0,8.0,10.0,16.0,1.0,1.0,1.0,1.0,451.0,100.0


In [34]:
valid_X.shape, train_X.shape

((18528, 10), (44140, 10))

In [35]:
pred_valid_y = RF.predict(valid_X)

In [36]:
pred_valid_y.shape

(18528,)

In [37]:
type(valid_pass), type(pred_valid_y)

(numpy.ndarray, numpy.ndarray)

In [38]:
result_df = pd.DataFrame(pred_valid_y, valid_pass, columns=['y'])

In [39]:
result_df.to_csv("./RF_2.csv", header=False)