## ランダムフォレスト　デフォルト

In [20]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import tree


In [2]:
# Load the labelled training data from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
# NOTE(review): the extension is '.pk1' (digit one), not '.pkl' — confirm it matches the file on disk.
train_pkl = pd.read_pickle('./train.pk1')

In [3]:
# (rows, columns) of the loaded training frame.
train_pkl.shape

(850, 11)

In [4]:
# Summary statistics per column, to sanity-check ranges and spot outliers.
train_pkl.describe()

Unnamed: 0,Age,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,Gender_Male,disease
count,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0
mean,46.610588,1.607422,0.606445,272.211334,41.978474,61.01812,7.03125,3.517578,1.144531,0.825882,0.444706
std,16.453672,2.443359,1.568359,200.348511,145.927368,110.125389,0.84082,0.560059,0.227295,0.379434,0.497226
min,10.0,0.585938,0.034851,163.261841,3.924518,11.283497,4.859375,2.181641,0.626953,0.0,0.0
25%,32.0,0.781372,0.144928,213.991409,13.644659,21.239157,6.730469,3.138672,1.004883,1.0,0.0
50%,48.0,0.835693,0.194092,220.092506,16.638,27.056026,6.914062,3.621094,1.205078,1.0,0.0
75%,62.0,1.197021,0.335388,229.455933,23.056081,56.461569,7.535156,3.712891,1.288086,1.0,1.0
max,78.0,23.015625,17.6875,2108.483643,1423.186523,814.439392,8.742188,5.015625,1.821289,1.0,1.0


## 訓練データとテストデータに分割

In [5]:
# Hold out 20% of the labelled rows as a test set; the fixed seed keeps the split reproducible.
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [6]:
# Separate the features (every column except the last) from the target column `disease`.
train_y = train["disease"].to_numpy()
train_X = train.iloc[:, :-1].to_numpy()

## 訓練データで訓練

In [21]:
# Random forest with the library's default hyperparameters; only the seed is fixed.
RF = RandomForestClassifier(random_state=42)

In [22]:
# Train on the training split. The fit() result is re-bound to the same name RF.
RF = RF.fit(train_X, train_y)

In [23]:
# Importance of each feature, in the column order of train_X.
RF.feature_importances_

array([0.03471026, 0.20540482, 0.11689485, 0.10670835, 0.16172461,
       0.16754528, 0.05551939, 0.05348461, 0.09044863, 0.0075592 ])

In [24]:
# Rank the features by importance (rounded to 3 decimals), highest first.
# Names are taken from the feature columns only (the last column, the target, is
# excluded) so they line up one-to-one with RF.feature_importances_ instead of
# relying on zip() silently dropping the extra `disease` label.
sorted(
    zip((round(importance, 3) for importance in RF.feature_importances_),
        train.columns[:-1]),
    reverse=True)

[(0.205, 'T_Bil'),
 (0.168, 'AST_GOT'),
 (0.162, 'ALT_GPT'),
 (0.117, 'D_Bil'),
 (0.107, 'ALP'),
 (0.09, 'AG_ratio'),
 (0.056, 'TP'),
 (0.053, 'Alb'),
 (0.035, 'Age'),
 (0.008, 'Gender_Male')]

In [25]:
# Visualize a trained decision tree (currently disabled).
# NOTE(review): this snippet looks stale — it references `train.Survived`, while the
# target column used in this notebook is `disease`, and `feature_names` skips the
# first column rather than the last one. export_graphviz presumably also needs a single
# tree (e.g. one element of RF.estimators_) rather than the whole forest — confirm
# before re-enabling.
# dot_data = tree.export_graphviz(RF, out_file=None,
#                                feature_names=train.iloc[:, 1:].columns,
#                                class_names=train.Survived.name,
#                                rounded=True,
#                                filled=True,
#                                special_characters=True)

## テストデータで実行

In [26]:
# Separate the held-out split the same way: target column `disease`, features = all other columns.
test_y = test["disease"].to_numpy()
test_x = test.iloc[:, :-1].to_numpy()

In [27]:
# Feature matrix and label vector must have the same number of rows.
test_x.shape, test_y.shape

((170, 10), (170,))

In [28]:
# Predict class labels for the held-out rows.
pred_y = RF.predict(test_x)

In [29]:
# One prediction per held-out row.
pred_y.shape

(170,)

In [30]:
# Eyeball the first ten true labels against the first ten predictions.
test_y[:10], pred_y[:10]

(array([1, 0, 0, 0, 1, 1, 1, 0, 0, 1], dtype=int64),
 array([0, 0, 0, 0, 1, 1, 1, 0, 0, 1], dtype=int64))

In [31]:
# Confusion matrix: rows are the true classes, columns are the predicted classes.
confusion_matrix(test_y, pred_y)

array([[89,  4],
       [13, 64]], dtype=int64)

In [33]:
# Fraction of held-out rows classified correctly.
accuracy_score(test_y, pred_y)

0.9

## 検証データで実行

In [34]:
# Load the unlabelled validation data.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
valid = pd.read_pickle('./test.pk1')

In [35]:
# (rows, columns) of the validation frame.
valid.shape

(350, 10)

In [22]:
# Save the IDs (disabled).
# NOTE(review): presumably left over from a dataset with an `id` column — confirm `valid` has one before re-enabling.
# valid_pass = valid.id.values

In [36]:
# Work on a copy so the loaded `valid` frame stays untouched.
valid_X = valid.copy()

In [37]:
# Summary statistics, for comparison against the training data's distribution.
valid_X.describe()

Unnamed: 0,Age,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,Gender_Male
count,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
mean,46.702857,1.631836,0.565918,282.23407,32.363522,61.545425,7.089844,3.5625,1.154297,0.814286
std,16.166318,2.857422,1.74707,201.307434,83.888397,114.916496,0.87793,0.607422,0.248047,0.389433
min,6.0,0.609863,0.053864,175.747528,7.862773,11.278741,4.96875,2.296875,0.668945,0.0
25%,32.0,0.787109,0.147705,214.211426,13.551174,20.84291,6.724609,3.12793,1.008789,1.0
50%,48.0,0.844971,0.193848,220.738617,16.449139,25.971273,6.931641,3.621094,1.216797,1.0
75%,61.0,0.973877,0.23645,231.839767,22.760056,52.746082,7.553711,3.739258,1.288086,1.0
max,75.0,27.046875,17.703125,2101.145752,860.919067,705.777161,8.75,5.007812,1.804688,1.0


In [38]:
# The column counts of the validation features and the training features should match.
valid_X.shape, train_X.shape

((350, 10), (680, 10))

In [39]:
# Predict labels for the validation data.
# The model was fitted on a bare ndarray (train_X), so pass an ndarray here as well
# rather than a DataFrame. Selecting by the training feature names (every column of
# `train` except the last, the target) guarantees the same column order as in training
# and fails loudly with a KeyError if a feature column is missing.
feature_cols = train.columns[:-1]
pred_valid_y = RF.predict(valid_X[feature_cols].values)

In [40]:
# One prediction per validation row.
pred_valid_y.shape

(350,)

In [42]:
# Wrap the predictions in a single-column frame; the default integer index serves as the row id.
result_df = pd.DataFrame(pred_valid_y)

In [43]:
# Preview the first rows before writing them out.
result_df.head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [44]:
# Write `index,prediction` rows with no header line.
result_df.to_csv("./RF_1.csv", header=False)