## XGB（デフォルトから reg_lambda=3, max_depth=7 に変更）

In [1]:
import numpy as np
import pandas as pd

import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error, make_scorer, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split


In [2]:
# Load the training data from a pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
train_pkl = pd.read_pickle('./train.pk3')

In [3]:
# Dimensions (rows, columns) of the loaded training frame.
train_pkl.shape

(850, 10)

In [None]:
# Drop the three lowest-ranked features, which contributed little to the decision:
# TP, Alb, Age.

In [4]:
# Remove the TP, Alb and Age columns; all other columns are kept.
# Not idempotent: re-running this cell raises KeyError because the columns
# are already gone from train_pkl.
train_pkl = train_pkl.drop(columns=['TP', 'Alb', 'Age'])

In [5]:
# Check the shape after dropping TP, Alb and Age.
train_pkl.shape

(850, 7)

In [None]:
# Drop the features that are highly correlated with another feature and have
# the lower importance: ALT_GPT, D_Bil.

In [6]:
# Remove the ALT_GPT and D_Bil columns; all other columns are kept.
# Not idempotent: re-running this cell raises KeyError because the columns
# are already gone from train_pkl.
train_pkl = train_pkl.drop(columns=['ALT_GPT', 'D_Bil'])

In [7]:
# Check the shape after dropping ALT_GPT and D_Bil.
train_pkl.shape

(850, 5)

In [8]:
# Preview the first rows of the reduced training frame.
train_pkl.head()

Unnamed: 0,T_Bil,ALP,AST_GOT,AG_ratio,disease
0,0.787109,220.178696,21.729246,1.006836,0
1,1.003906,221.218414,64.752319,0.751465,1
2,0.657227,320.770538,30.61318,0.774902,0
3,0.906738,369.278168,54.510086,0.988281,1
4,1.735352,222.782028,170.010178,1.026367,0


## 訓練データとテストデータに分割

In [87]:
# Hold out 20% of the rows as a test partition; the fixed random_state makes
# the split repeatable.
# NOTE(review): the split is not stratified on the target column.
train, test = train_test_split(
    train_pkl,
    test_size=0.2,
    random_state=42,
)

In [88]:
# Separate the training partition into features and target.
# The target is selected by name rather than by position, so the result does
# not depend on `disease` being the last column of the frame.
train_X = train.drop(columns='disease')
train_y = train['disease'].values

In [89]:
# Preview the first five rows of the training features.
train_X.head(5)

Unnamed: 0,T_Bil,ALP,AST_GOT,AG_ratio
332,2.066406,213.935623,47.154297,0.759766
383,0.817871,214.644638,21.059477,1.291016
281,0.791992,358.339508,25.77248,1.319336
2,0.657227,320.770538,30.61318,0.774902
231,1.714844,215.885971,60.709866,1.21875


## 訓練データで訓練

In [225]:
# Gradient-boosted tree classifier with two explicitly set hyperparameters:
# max_depth (maximum tree depth) and reg_lambda (L2 regularisation on weights).
# Everything else is left at the library defaults.
XGB = XGBClassifier(
    max_depth=7,
    reg_lambda=3,
)

In [226]:
# Train on the training partition. fit() returns the estimator itself, so no
# re-assignment is needed; the trailing semicolon suppresses the cell output.
XGB.fit(train_X, train_y);

In [227]:
# Importance scores of the fitted model, one entry per feature column.
XGB.feature_importances_

array([0.44323507, 0.11708284, 0.29659826, 0.14308383], dtype=float32)

In [228]:
# Rank the features by importance, highest first.
# Column names come from train_X, the exact frame that was passed to fit(),
# so every score is paired with the column it was computed for.
sorted(
    ((round(score, 3), name)
     for score, name in zip(XGB.feature_importances_, train_X.columns)),
    reverse=True,
)

[(0.443, 'T_Bil'), (0.297, 'AST_GOT'), (0.143, 'AG_ratio'), (0.117, 'ALP')]

## テストデータで実行

In [229]:
# Separate the held-out test partition into features and target.
# The target is selected by name rather than by position, so the result does
# not depend on `disease` being the last column of the frame.
test_x = test.drop(columns='disease')
test_y = test['disease'].values

In [230]:
# Shapes of the test features and test target; the row counts should match.
test_x.shape, test_y.shape

((170, 4), (170,))

In [231]:
# Predict labels for the held-out test features with the fitted model.
pred_y = XGB.predict(test_x)

In [232]:
# Confusion matrix on the held-out test partition
# (rows: true class, columns: predicted class).
confusion_matrix(y_true=test_y, y_pred=pred_y)

array([[86,  7],
       [11, 66]], dtype=int64)

In [233]:
# Fraction of held-out test rows whose predicted label equals the true label.
accuracy_score(y_true=test_y, y_pred=pred_y)

0.8941176470588236

## 検証データで実行

In [21]:
# Load the validation data from a pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
valid = pd.read_pickle('./test.pk3')

In [22]:
# Dimensions (rows, columns) of the loaded validation frame.
valid.shape

(350, 9)

In [23]:
# Confirm what kind of object the pickle file contained.
type(valid)

pandas.core.frame.DataFrame

In [24]:
# Save the IDs (disabled: the assignment below is commented out).
# valid_pass = valid.id.values

In [25]:
# Work on an independent deep copy of the validation frame so that `valid`
# itself is left untouched; no leading column is sliced off here.
valid_X = valid.copy(deep=True)

In [26]:
# Compare the shape of the validation features with the training features;
# the column counts must match before the model can be applied.
valid_X.shape, train_X.shape

((350, 9), (680, 4))

In [None]:
# Drop the three lowest-ranked features, which contributed little to the decision:
# TP, Alb, Age.

In [27]:
# Remove the TP, Alb and Age columns, mirroring the drop applied to the training data.
# Not idempotent: re-running this cell raises KeyError because the columns
# are already gone from valid_X.
valid_X = valid_X.drop(columns=['TP', 'Alb', 'Age'])

In [28]:
# Check the shape after dropping TP, Alb and Age.
valid_X.shape

(350, 6)

In [29]:
# Drop the features that are highly correlated with another feature and have
# the lower importance: ALT_GPT, D_Bil.

In [30]:
# Remove the ALT_GPT and D_Bil columns, mirroring the drop applied to the training data.
# Not idempotent: re-running this cell raises KeyError because the columns
# are already gone from valid_X.
valid_X = valid_X.drop(columns=['ALT_GPT', 'D_Bil'])

In [31]:
# Check the shape after dropping ALT_GPT and D_Bil.
valid_X.shape

(350, 4)

In [33]:
# Preview the first rows of the reduced validation features.
valid_X.head()

Unnamed: 0,T_Bil,ALP,AST_GOT,AG_ratio
0,0.801758,214.448685,19.496649,1.288086
1,0.834961,234.13681,16.733311,1.030273
2,0.791992,216.039902,20.695866,1.27832
3,0.833984,200.901123,20.102516,1.275391
4,0.69043,208.705841,25.096685,1.280273


In [34]:
# Predict labels for the validation features with the fitted model.
pred_valid_y = XGB.predict(valid_X)

In [35]:
# One prediction per validation row is expected.
pred_valid_y.shape

(350,)

In [36]:
# Disabled type check; valid_pass is never assigned because the cell that
# would save the IDs is commented out.
# type(valid_pass), type(pred_valid_y)

In [37]:
# Wrap the predictions in a DataFrame that carries valid_X's row index, so each
# prediction stays attached to the row it was computed from instead of being
# relabelled with a fresh 0..n-1 index.
result_df = pd.DataFrame(pred_valid_y, index=valid_X.index)

In [38]:
# Write the predictions to CSV without a header row. The frame's index is
# written as the first column (index=True is the default, spelled out here).
result_df.to_csv("./XGB_5.csv", header=False, index=True)