### ランダムフォレスト　パラメータチューニング

In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import tree


In [2]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('./pd_train.pk2')

In [3]:
train_pkl.shape

(891, 15)

In [4]:
# Age, Fare は削除
# ダミー変数の先頭は削除
train_pkl.drop(['Age', 'Fare', 'Sex_female', 'Embarked_C', 'Pclass_1'], inplace=True, axis=1)

In [5]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [6]:
# ターゲットと特徴量の分割
train_X = train.iloc[:, 1:].values
train_y = train.Survived.values

### 検証するパラメータの設定

In [7]:
grid_param = {
    'n_estimators': [10, 30, 50, 70, 90, 100, 110, 120],
    'criterion': ['gini', 'entropy'],
    'max_depth': [i for i in range(1, 11)],
    'min_samples_split': [i for i in range(2, 11)],
    'min_samples_leaf': [i for i in range(1, 11)],
    'random_state': [42]
}


### グリッドサーチ

In [10]:
gs = GridSearchCV(estimator=RandomForestClassifier(**grid_param), param_grid=grid_param, scoring='accuracy', cv=5, return_train_score=False)

In [11]:
gs.fit(train_X, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion=['gini', 'entropy'],
                                              max_depth=[1, 2, 3, 4, 5, 6, 7, 8,
                                                         9, 10],
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=[1, 2, 3, 4, 5,
                                                                6, 7, 8, 9,
                                                                10],
                                 

### 一番良かったパラメータで学習

In [12]:
gs.best_score_

0.8356544863587118

In [13]:
gs.best_params_

{'criterion': 'gini',
 'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 110,
 'random_state': 42}

In [14]:
RF = RandomForestClassifier(**gs.best_params_)

In [15]:
RF = RF.fit(train_X, train_y)

In [16]:
RF.feature_importances_

array([0.05461628, 0.04266538, 0.08226985, 0.1278167 , 0.49934481,
       0.0085531 , 0.03051996, 0.02759766, 0.12661626])

In [17]:
sorted(
    zip(map(lambda x: round(x, 3), RF.feature_importances_), train.iloc[:, 1:].columns),
    reverse=True)

[(0.499, 'Sex_male'),
 (0.128, 'Fare_bin'),
 (0.127, 'Pclass_3'),
 (0.082, 'Age_bin'),
 (0.055, 'SibSp'),
 (0.043, 'Parch'),
 (0.031, 'Embarked_S'),
 (0.028, 'Pclass_2'),
 (0.009, 'Embarked_Q')]

In [18]:
# 訓練済みの決定木を視覚化
# dot_data = tree.export_graphviz(RF, out_file=None,
#                                feature_names=train.iloc[:, 1:].columns,
#                                class_names=train.Survived.name,
#                                rounded=True,
#                                filled=True,
#                                special_characters=True)

### テストデータで予測

In [19]:
# ターゲットと特徴量の分割
test_x = test.iloc[:, 1:].values
test_y = test.Survived.values

In [20]:
test_x.shape, test_y.shape

((179, 9), (179,))

In [21]:
pred_y = RF.predict(test_x)

In [22]:
confusion_matrix(test_y, pred_y)

array([[96,  9],
       [23, 51]], dtype=int64)

In [23]:
accuracy_score(test_y, pred_y)

0.8212290502793296

### 検証データで予測

In [24]:
# 検証データ読み込み
valid = pd.read_pickle('./pd_test.pk2')

In [25]:
valid.shape

(418, 15)

In [26]:
# ID の保存
valid_pass = valid.PassengerId.values

In [27]:
valid_X = valid.iloc[:, 1:]

In [28]:
valid_X.describe()

Unnamed: 0,Age,SibSp,Parch,Fare,Age_bin,Fare_bin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,29.599282,0.447368,0.392344,35.576535,2.476077,1.023923,0.363636,0.636364,0.244019,0.110048,0.645933,0.255981,0.222488,0.521531
std,12.70377,0.89676,0.981429,55.850103,1.301411,2.213844,0.481622,0.481622,0.430019,0.313324,0.478803,0.436934,0.416416,0.500135
min,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,14.4542,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
75%,35.75,1.0,0.0,31.471875,3.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
max,76.0,8.0,9.0,512.3292,7.0,20.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
valid_X.drop(['Age', 'Fare', 'Sex_female', 'Embarked_C', 'Pclass_1'], inplace=True, axis=1)

In [30]:
valid_X.shape, train_X.shape

((418, 9), (712, 9))

In [31]:
pred_valid_y = RF.predict(valid_X)

In [32]:
pred_valid_y.shape

(418,)

In [33]:
type(valid_pass), type(pred_valid_y)

(numpy.ndarray, numpy.ndarray)

In [34]:
result_df = pd.DataFrame(pred_valid_y, valid_pass, columns=['Survived'])

In [35]:
result_df.to_csv("./RF_3.csv", index_label='PassengerId')