## XGBデフォルト

In [1]:
import numpy as np
import pandas as pd

import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error, make_scorer, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split


In [185]:
# Load the training data from a pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
train_pkl = pd.read_pickle('../001/train.pk1')

In [186]:
train_pkl.shape

(850, 11)

In [187]:
# AST_GOT と ALT_GPT の積を特徴量として追加（T_Bil と D_Bil の組み合わせは下のセルでコメントアウトされており未使用）

In [188]:
# Disabled experiment: combined bilirubin feature (note: this is a sum, not a product).
# train_pkl['TD_Bil'] = train_pkl['T_Bil'] + train_pkl['D_Bil']
# Interaction feature: element-wise product of AST_GOT and ALT_GPT.
train_pkl['AST__ALT_GOT'] = train_pkl['AST_GOT'] * train_pkl['ALT_GPT']

In [189]:
# 判断にあまり使用されなかった下位３つ (TP, Alb, Age) を削除する案
# ※ 現在は無効：下のセルの drop はコメントアウトされており、列は削除されない

In [190]:
# train_pkl = train_pkl.drop(['TP', 'Alb', 'Age'], axis=1)

In [191]:
train_pkl.shape

(850, 12)

In [192]:
# 相関が高い組のうち、判断の重要度が低いほう (ALT_GPT, D_Bil) を削除する案
# ※ 現在は無効：下のセルの drop はコメントアウトされており、列は削除されない

In [193]:
# train_pkl = train_pkl.drop(['ALT_GPT', 'D_Bil'], axis=1)

In [194]:
train_pkl.shape

(850, 12)

In [195]:
train_pkl.head()

Unnamed: 0,Age,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,Gender_Male,disease,AST__ALT_GOT
0,59,0.787109,0.150513,220.178696,13.467617,21.729246,6.816406,3.111328,1.006836,1,0,292.641174
1,69,1.003906,0.195679,221.218414,51.033463,64.752319,6.890625,3.050781,0.751465,1,1,3304.535156
2,65,0.657227,0.081299,320.770538,12.625011,30.61318,5.949219,2.488281,0.774902,1,0,386.49176
3,65,0.906738,0.214233,369.278168,34.347599,54.510086,6.96875,3.613281,0.988281,1,1,1872.290527
4,22,1.735352,0.197754,222.782028,20.572891,170.010178,5.835938,3.068359,1.026367,0,0,3497.60083


In [196]:
# Move the target column `disease` to the last position, keeping every other
# column in its current order. Selecting by name instead of hard-coded positions
# makes this cell idempotent: re-running it no longer swaps the last two columns
# back, and it keeps working if columns are added or removed upstream.
train_pkl = train_pkl[[col for col in train_pkl.columns if col != 'disease'] + ['disease']]

In [198]:
train_pkl.head()

Unnamed: 0,Age,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,Gender_Male,AST__ALT_GOT,disease
0,59,0.787109,0.150513,220.178696,13.467617,21.729246,6.816406,3.111328,1.006836,1,292.641174,0
1,69,1.003906,0.195679,221.218414,51.033463,64.752319,6.890625,3.050781,0.751465,1,3304.535156,1
2,65,0.657227,0.081299,320.770538,12.625011,30.61318,5.949219,2.488281,0.774902,1,386.49176,0
3,65,0.906738,0.214233,369.278168,34.347599,54.510086,6.96875,3.613281,0.988281,1,1872.290527,1
4,22,1.735352,0.197754,222.782028,20.572891,170.010178,5.835938,3.068359,1.026367,0,3497.60083,0


## 訓練データとテストデータに分割

In [199]:
# Hold out 20% of the rows as a test set; the seed is fixed so the split is repeatable.
train, test = train_test_split(
    train_pkl,
    test_size=0.2,
    random_state=42,
)

In [200]:
# Split the training rows into features and target.
# The target is removed by name rather than by position (`iloc[:, :-1]`), so it
# cannot leak into the features if `disease` is not the last column.
train_X = train.drop('disease', axis=1)
train_y = train['disease'].values

In [201]:
train_X.head(5)

Unnamed: 0,Age,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,Gender_Male,AST__ALT_GOT
332,44,2.066406,0.631348,213.935623,14.563273,47.154297,7.503906,2.302734,0.759766,1,686.720947
383,36,0.817871,0.197021,214.644638,15.622564,21.059477,7.511719,3.630859,1.291016,1,329.003021
281,20,0.791992,0.082642,358.339508,12.924613,25.77248,8.648438,4.324219,1.319336,1,333.099335
2,65,0.657227,0.081299,320.770538,12.625011,30.61318,5.949219,2.488281,0.774902,1,386.49176
231,29,1.714844,0.51123,215.885971,27.66971,60.709866,5.953125,3.117188,1.21875,1,1679.824341


## 訓練データで訓練

In [292]:
# Build the classifier with a maximum tree depth of 9.
# The previous call also passed `lambda_l1=300, lamba_l2=15`. Those are not
# XGBoost parameters (XGBoost names its L1/L2 terms `reg_alpha` / `reg_lambda`,
# and `lamba_l2` is additionally misspelled), so they are not applied by the
# model and are removed here.
# TODO: if regularisation is wanted, pass `reg_alpha=` / `reg_lambda=` and re-tune
# the values; the old numbers were never in effect and should not be copied blindly.
XGB = XGBClassifier(max_depth=9)

In [293]:
# Fit on the training features and labels; the value returned by fit() is bound back to XGB.
XGB = XGB.fit(train_X, train_y)

In [294]:
XGB.feature_importances_

array([0.02373545, 0.13246605, 0.05340776, 0.0432181 , 0.04356578,
       0.05033004, 0.04938363, 0.03052111, 0.05881911, 0.02625548,
       0.48829755], dtype=float32)

In [295]:
# Pair each importance (rounded to 3 decimals) with its feature name and list
# the pairs from most to least important.
sorted(
    (
        (round(score, 3), name)
        for score, name in zip(XGB.feature_importances_, train_X.columns)
    ),
    reverse=True,
)

[(0.488, 'AST__ALT_GOT'),
 (0.132, 'T_Bil'),
 (0.059, 'AG_ratio'),
 (0.053, 'D_Bil'),
 (0.05, 'AST_GOT'),
 (0.049, 'TP'),
 (0.044, 'ALT_GPT'),
 (0.043, 'ALP'),
 (0.031, 'Alb'),
 (0.026, 'Gender_Male'),
 (0.024, 'Age')]

## テストデータで実行

In [296]:
# Split the held-out rows into features and target.
# The target is removed by name rather than by position (`iloc[:, :-1]`), so it
# cannot leak into the features if `disease` is not the last column.
test_x = test.drop('disease', axis=1)
test_y = test['disease'].values

In [297]:
test_x.shape, test_y.shape

((170, 11), (170,))

In [298]:
# Predict class labels for the held-out test features.
pred_y = XGB.predict(test_x)

In [299]:
confusion_matrix(test_y, pred_y)

array([[89,  4],
       [14, 63]], dtype=int64)

In [300]:
accuracy_score(test_y, pred_y)

0.8941176470588236

## 検証データで実行

In [None]:
# Load the validation data from a pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
valid = pd.read_pickle('./test.pk3')

In [None]:
valid.shape

In [None]:
type(valid)

In [None]:
# ID の保存
# valid_pass = valid.id.values

In [None]:
# Earlier variant (disabled): skip the first column when building the features.
# valid_X = valid.iloc[:, 1:]
# Work on a copy so later column changes do not modify `valid`.
valid_X = valid.copy()

In [None]:
valid_X.shape, train_X.shape

In [None]:
# 判断にあまり使用しなかった下位３つ削除
# TP, Alb, Age を削除

In [None]:
valid_X = valid_X.drop(['TP', 'Alb', 'Age'], axis=1)

In [None]:
valid_X.shape

In [None]:
# 相関の高く、判断の重要度の低いほうを削除
# ALT_GPT, D_Bil

In [None]:
valid_X = valid_X.drop(['ALT_GPT', 'D_Bil'], axis=1)

In [None]:
valid_X.shape

In [None]:
valid_X.head()

In [None]:
# Predict labels for the validation features with the fitted model.
pred_valid_y = XGB.predict(valid_X)

In [None]:
pred_valid_y.shape

In [None]:
# type(valid_pass), type(pred_valid_y)

In [None]:
# Wrap the predictions in a DataFrame (it gets a default integer index).
result_df = pd.DataFrame(pred_valid_y)

In [None]:
# Write the predictions to CSV without a header row; the DataFrame index is
# written as the first column (to_csv writes the index by default).
result_df.to_csv("./XGB_5.csv", header=False)