### ランダムフォレスト　パラメータチューニング

In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import tree


In [2]:
# Load the training data from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
train_pkl = pd.read_pickle('./train.pk3')

In [3]:
# Check (rows, columns) of the loaded training frame
train_pkl.shape

(55369, 13)

In [4]:
# Split into a training set and a hold-out (test) set: 20% is held out,
# and the seed is fixed so the split is the same on every run.
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [5]:
# Separate the training split into the feature matrix and the target vector.
# `id` and `y` are excluded from the features; `drop` returns a new frame,
# so `train` itself is left untouched.
train_X_tmp = train.drop(['id', 'y'], axis=1)
train_X = train_X_tmp.values
train_y = train['y'].values

### 検証するパラメータの設定

In [6]:
# First (coarse) candidate grid.
# NOTE(review): `grid_param` is reassigned in the following cell, so when the
# notebook is run top to bottom this grid is never passed to the search.
grid_param = {
    'n_estimators': [60, 80, 100, 120],
    'criterion': ['mse'],
    'max_depth': list(range(1, 11, 2)),
    'min_samples_split': list(range(2, 11, 2)),
    'min_samples_leaf': list(range(1, 11, 2)),
    'random_state': [42],
}


In [7]:
# Candidate grid used by the search: a finer n_estimators / max_depth range.
# Every key is a RandomForestRegressor constructor argument; every value is
# the list of candidates to try for it.
# NOTE(review): 'mse' is the legacy criterion name — newer scikit-learn
# releases call it 'squared_error'; confirm against the installed version.
grid_param = {
    'n_estimators': [60, 70, 80, 90, 100],
    'criterion': ['mse'],
    'max_depth': [5, 6, 7, 9],
    'min_samples_split': list(range(2, 11, 2)),
    'min_samples_leaf': list(range(1, 11, 2)),
    'random_state': [42],
}


In [8]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between ``y`` and ``y_pred``.

    The value is also printed, prefixed with ``RMSE=``, before it is returned.

    Args:
        y: True target values.
        y_pred: Predicted target values.

    Returns:
        The square root of ``mean_squared_error(y, y_pred)``.
    """
    squared_error = mean_squared_error(y, y_pred)
    root_error = np.sqrt(squared_error)
    print("RMSE=", root_error)
    return root_error

### グリッドサーチ

In [9]:
# Exhaustive 5-fold cross-validated search over every combination in grid_param.
#
# The base estimator is built with scalar arguments only. Unpacking grid_param
# into the constructor would store the candidate *lists* (for example
# n_estimators=[60, 70, ...]) as hyper-parameter values, which only worked
# because the search overwrites every one of those keys for each candidate.
#
# make_scorer(..., greater_is_better=False) negates the MSE so that a higher
# score is better; gs.best_score_ is therefore a negative MSE.
gs = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=grid_param,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    cv=5,
    return_train_score=False,
)

In [None]:
# Run the search: every combination in grid_param is evaluated with 5-fold CV,
# so this cell is long-running.
gs.fit(train_X, train_y)

### 一番良かったパラメータで学習

In [None]:
# Best cross-validated score. The scorer negates MSE (greater_is_better=False),
# so this value is <= 0 and closer to 0 is better.
gs.best_score_

In [None]:
# Hyper-parameter combination that achieved best_score_
gs.best_params_

In [None]:
# Build a fresh model from the winning combination. best_params_ carries every
# key of grid_param, including random_state, so this model is seeded.
# NOTE(review): with the default refit behaviour, gs.best_estimator_ should
# already hold an equivalent fitted model — confirm before removing this cell.
RF = RandomForestRegressor(**gs.best_params_)

In [None]:
# Fit the final model on the whole training split
RF = RF.fit(train_X, train_y)

In [None]:
# Per-feature importance scores, in the column order of train_X
RF.feature_importances_

In [None]:
# Rank the features by importance, highest first.
# The names are taken from the same `drop(['id', 'y'])` that produced train_X,
# so every importance is paired with the column it was computed for. A
# hard-coded list of column positions can fall out of step with the feature
# matrix, and zip() would then silently drop or mislabel entries.
feature_names = train.drop(['id', 'y'], axis=1).columns
assert len(feature_names) == len(RF.feature_importances_), \
    "feature names and importances are out of step"
sorted(
    zip(RF.feature_importances_.round(3), feature_names),
    reverse=True)

In [None]:
# Visualize a trained decision tree (disabled).
# NOTE(review): this snippet looks stale — it passes the whole forest `RF`
# where a single tree (e.g. RF.estimators_[0]) is presumably expected, and
# `train.Survived` does not look like a column of this dataset. Confirm and
# fix before re-enabling, or delete the cell.
# dot_data = tree.export_graphviz(RF, out_file=None,
#                                feature_names=train.iloc[:, 1:].columns,
#                                class_names=train.Survived.name,
#                                rounded=True,
#                                filled=True,
#                                special_characters=True)

### テストデータで予測

In [None]:
# Separate the hold-out split into the feature matrix and the target vector,
# excluding `id` and `y` from the features exactly as for the training split.
# `drop` returns a new frame, so `test` itself is left untouched.
test_X_tmp = test.drop(['id', 'y'], axis=1)
test_x = test_X_tmp.values
test_y = test['y'].values

In [None]:
# Shapes of the hold-out feature matrix and target vector
test_x.shape, test_y.shape

In [None]:
# Predict the target for the hold-out rows
pred_y = RF.predict(test_x)

In [None]:
# RMSE of the final model on the hold-out set
np.sqrt(mean_squared_error(test_y, pred_y))

### 検証データで予測

In [22]:
# Load the data to predict on (called "validation data" in this notebook).
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
valid = pd.read_pickle('./test.pk3')

In [23]:
# Check (rows, columns) of the loaded frame
valid.shape

(18528, 12)

In [24]:
# Keep the IDs; they become the index of the result frame
valid_pass = valid.id.values

In [25]:
# Feature matrix for the data to predict on. The identifier is dropped by name,
# as the training and hold-out cells do, instead of slicing by position, so the
# result does not depend on `id` being the first column.
# NOTE(review): the remaining columns are assumed to be in the same order as
# the training features — confirm.
valid_X = valid.drop(['id'], axis=1).values

In [26]:
# Disabled: valid_X is a NumPy array (built with `.values`), which has no
# `.describe()`; call it on the DataFrame instead if summary statistics are needed.
# valid_X.describe()

In [27]:
# The feature count (second dimension) must be the same for both matrices
valid_X.shape, train_X.shape

((18528, 11), (44295, 11))

In [28]:
# Predict the target for the rows loaded from test.pk3
pred_valid_y = RF.predict(valid_X)

In [29]:
# One prediction per input row is expected
pred_valid_y.shape

(18528,)

In [30]:
# Check the container types of the IDs and the predictions
type(valid_pass), type(pred_valid_y)

(numpy.ndarray, numpy.ndarray)

In [31]:
# Result frame: predictions in a single column `y`, indexed by the saved IDs
result_df = pd.DataFrame(data=pred_valid_y, index=valid_pass, columns=['y'])

In [32]:
# Write the result as a CSV without a header row (the ID index is written as the first column)
result_df.to_csv("./RF_5.csv", header=False)