In [1]:
#ライブラリ
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn import metrics

#検証方法：クロスバリデーション
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

#モデル
from xgboost import XGBClassifier

In [5]:
#データ準備
train =pd.read_table('data/train.tsv', index_col=0)
test = pd.read_table('data/test.tsv',index_col=0)
sample = pd.read_table('data/sample_submit.tsv' , index_col=0, header=None)

In [6]:
#データの確認と前処理　テストデータ
print(train.shape)
train.head()

(445, 8)


Unnamed: 0_level_0,survived,pclass,sex,age,sibsp,parch,fare,embarked
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
7,0,3,male,2.0,3,1,21.075,S
9,1,2,female,14.0,1,0,30.0708,C
11,1,1,female,58.0,0,0,26.55,S


In [7]:
#説明変数と目的変数に分ける
train_x = train.drop('survived',axis = 1)
y = train.iloc[:,0]
print(train_x.shape)
print(y.shape)

(445, 7)
(445,)


In [6]:
#trainデータ：one-hot-encoding
train_ohe = pd.get_dummies(train_x,dummy_na=True)
train_ohe.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,1,35.0,1,0,53.1,1,0,0,0,0,1,0
4,3,35.0,0,0,8.05,0,1,0,0,0,1,0
7,3,2.0,3,1,21.075,0,1,0,0,0,1,0
9,2,14.0,1,0,30.0708,1,0,0,1,0,0,0
11,1,58.0,0,0,26.55,1,0,0,0,0,1,0


#trainデータ：欠損値の補完
#インピュータークラスのインスタンス化と（列平均）の学習
imp = SimpleImputer()
imp.fit(train_ohe)

#学習済みimputerの適用：各列欠損値の置換
train_fin = pd.DataFrame(imp.transform(train_ohe) , columns = train_ohe.columns.values)

#結果表示
display(train_fin.iloc[10:20,1])

In [7]:
#testデータ：one_hot_encoding
test_ohe = pd.get_dummies(test,dummy_na=True)
test_ohe.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,sex_female,sex_male,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3,22.0,1,0,7.25,0,1,0,0,0,1,0
1,1,38.0,1,0,71.2833,1,0,0,1,0,0,0
2,3,26.0,0,0,7.925,1,0,0,0,0,1,0
5,3,,0,0,8.4583,0,1,0,0,1,0,0
6,1,54.0,0,0,51.8625,0,1,0,0,0,1,0


In [8]:
#columnsのデータ項目を確認
cols_train = set(train_ohe.columns.values)
cols_test = set(test_ohe.columns.values)

#trainにあってtestにないデータ項目
diff1 = cols_train -cols_test
print('trainのみ：%s' % diff1)

#testにあってtrainにないデータ項目
diff2 = cols_test - cols_train
print('testのみ：%s' % diff2)

trainのみ：set()
testのみ：set()


In [9]:
#holdoutを利用してtrainデータを分割
x_train , x_test , y_train,y_test = train_test_split(train_ohe , y , test_size=0.2)

In [47]:
#XGBoostClassifierのインスタンス作成
model = XGBClassifier(n_estimators = 500 , learning_rate = 0.01 ,max_depth = 7, early_stopping_rounds = 10)

#モデルフィッティング
eval_set = [(x_test,y_test)]
model.fit(x_train , y_train , eval_set=eval_set , verbose=True)

[0]	validation_0-logloss:0.68889
[1]	validation_0-logloss:0.68483
[2]	validation_0-logloss:0.68084
[3]	validation_0-logloss:0.67695
[4]	validation_0-logloss:0.67310
[5]	validation_0-logloss:0.66937
[6]	validation_0-logloss:0.66576
[7]	validation_0-logloss:0.66222
[8]	validation_0-logloss:0.65875
[9]	validation_0-logloss:0.65532
[10]	validation_0-logloss:0.65200
[11]	validation_0-logloss:0.64875
[12]	validation_0-logloss:0.64557
[13]	validation_0-logloss:0.64241
[14]	validation_0-logloss:0.63936
[15]	validation_0-logloss:0.63638
[16]	validation_0-logloss:0.63345
[17]	validation_0-logloss:0.63055
[18]	validation_0-logloss:0.62774
[19]	validation_0-logloss:0.62500
[20]	validation_0-logloss:0.62231
[21]	validation_0-logloss:0.61968
[22]	validation_0-logloss:0.61706
[23]	validation_0-logloss:0.61431
[24]	validation_0-logloss:0.61183
[25]	validation_0-logloss:0.60917
[26]	validation_0-logloss:0.60656
[27]	validation_0-logloss:0.60404
[28]	validation_0-logloss:0.60153
[29]	validation_0-loglos

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=10, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=7, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=500,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [46]:
model.best_score

0.4958624300662051

In [37]:
y_pred = model.predict(x_test)
metrics.accuracy_score(y_test,y_pred)

0.797752808988764

#提出用関数
pred = model.predict_proba(test_ohe)[:,1]

sample[1] = pred
sample.to_csv(f'submit/xgboost2.tsv',sep='\t', header=None)