### ランダムフォレスト　パラメータチューニング

In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, make_scorer, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import tree


  return f(*args, **kwds)


In [2]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('./train.pk1')

In [3]:
train_pkl.shape

(850, 11)

In [4]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [5]:
# ターゲットと特徴量の分割
train_X = train.iloc[:, :-1].values
train_y = train.disease.values

### 検証するパラメータの設定

In [6]:
grid_param = {
    'n_estimators': [60, 80, 100, 120],
    'criterion': ['gini', 'entropy'],
    'max_depth': [i for i in range(1, 11, 2)],
    'min_samples_split': [i for i in range(2, 11, 2)],
    'min_samples_leaf': [i for i in range(1, 11, 2)],
    'random_state': [42]
}


In [45]:
# grid_param = {
#     'n_estimators': [60, 60, 80, 90, 100],
#     'criterion': ['mse'],
#     'max_depth': [5, 6, 7, 9],
#     'min_samples_split': [i for i in range(2, 11, 2)],
#     'min_samples_leaf': [i for i in range(1, 11, 2)],
#     'random_state': [42]
# }


In [46]:
# def rmse(y, y_pred):
#     ret = np.sqrt(mean_squared_error(y, y_pred))
#     print("RMSE=", ret)
#     return ret

### グリッドサーチ

In [7]:
gs = GridSearchCV(estimator=RandomForestClassifier(**grid_param), param_grid=grid_param, scoring='accuracy', cv=5, return_train_score=False)

In [8]:
gs.fit(train_X, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion=['gini', 'entropy'],
                                              max_depth=[1, 3, 5, 7, 9],
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=[1, 3, 5, 7, 9],
                                              min_samples_split=[2, 4, 6, 8,
                                                                 10],
                                              min_weight_fractio...
                                   

### 一番良かったパラメータで学習

In [10]:
gs.best_score_

0.8558823529411764

In [11]:
gs.best_params_

{'criterion': 'entropy',
 'max_depth': 9,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 100,
 'random_state': 42}

In [12]:
RF = RandomForestClassifier(**gs.best_params_)

In [13]:
RF = RF.fit(train_X, train_y)

In [14]:
RF.feature_importances_

array([0.03277241, 0.20789753, 0.12240468, 0.10384528, 0.14630638,
       0.16503197, 0.05649086, 0.05460433, 0.10350093, 0.00714564])

In [15]:
sorted(
    zip(map(lambda x: round(x, 3), RF.feature_importances_), train.iloc[:, :-1].columns),
    reverse=True)

[(0.208, 'T_Bil'),
 (0.165, 'AST_GOT'),
 (0.146, 'ALT_GPT'),
 (0.122, 'D_Bil'),
 (0.104, 'ALP'),
 (0.104, 'AG_ratio'),
 (0.056, 'TP'),
 (0.055, 'Alb'),
 (0.033, 'Age'),
 (0.007, 'Gender_Male')]

In [16]:
# 訓練済みの決定木を視覚化
# dot_data = tree.export_graphviz(RF, out_file=None,
#                                feature_names=train.iloc[:, 1:].columns,
#                                class_names=train.Survived.name,
#                                rounded=True,
#                                filled=True,
#                                special_characters=True)

### テストデータで予測

In [17]:
# ターゲットと特徴量の分割
test_x = test.iloc[:, :-1].values
test_y = test.disease.values

In [18]:
test_x.shape, test_y.shape

((170, 10), (170,))

In [19]:
pred_y = RF.predict(test_x)

In [20]:
confusion_matrix(test_y, pred_y)

array([[89,  4],
       [14, 63]], dtype=int64)

In [21]:
accuracy_score(test_y, pred_y)

0.8941176470588236

### 検証データで予測

In [22]:
# 検証データ読み込み
valid = pd.read_pickle('./test.pk1')

In [23]:
valid.shape

(350, 10)

In [31]:
# ID の保存
# valid_pass = valid.id.values

In [24]:
# valid_X = valid.iloc[:, 1:]
valid_X = valid.copy()

In [25]:
valid_X.describe()

Unnamed: 0,Age,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,Gender_Male
count,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
mean,46.702857,1.631836,0.565918,282.23407,32.363522,61.545425,7.089844,3.5625,1.154297,0.814286
std,16.166318,2.857422,1.74707,201.307434,83.888397,114.916496,0.87793,0.607422,0.248047,0.389433
min,6.0,0.609863,0.053864,175.747528,7.862773,11.278741,4.96875,2.296875,0.668945,0.0
25%,32.0,0.787109,0.147705,214.211426,13.551174,20.84291,6.724609,3.12793,1.008789,1.0
50%,48.0,0.844971,0.193848,220.738617,16.449139,25.971273,6.931641,3.621094,1.216797,1.0
75%,61.0,0.973877,0.23645,231.839767,22.760056,52.746082,7.553711,3.739258,1.288086,1.0
max,75.0,27.046875,17.703125,2101.145752,860.919067,705.777161,8.75,5.007812,1.804688,1.0


In [26]:
valid_X.shape, train_X.shape

((350, 10), (680, 10))

In [27]:
pred_valid_y = RF.predict(valid_X)

In [28]:
pred_valid_y.shape

(350,)

In [30]:
# type(valid_pass), type(pred_valid_y)

In [31]:
result_df = pd.DataFrame(pred_valid_y)

In [32]:
result_df.to_csv("./RF_2.csv", header=False)