In [35]:
import numpy as np
import pandas as pd

import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split


In [36]:
# Load the training DataFrame from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files that
# were produced locally or come from a trusted source.
train_pkl = pd.read_pickle('./pd_train.pk2')

In [37]:
# Inspect the loaded frame's dimensions as (rows, columns)
train_pkl.shape

(891, 15)

In [38]:
# Drop the raw Age and Fare columns (the binned Age_bin / Fare_bin columns
# stay as features) and the first level of each dummy-encoded group, so the
# remaining indicator columns are not redundant.
# Rebinding the result instead of using inplace=True keeps the drop an
# ordinary expression and avoids mutating the loaded frame behind any other
# reference to it.
train_pkl = train_pkl.drop(
    columns=['Age', 'Fare', 'Sex_female', 'Embarked_C', 'Pclass_1'],
)

In [39]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
train, test = train_test_split(
    train_pkl,
    test_size=0.2,
    random_state=42,
)

In [40]:
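# Split into target and features (NumPy-array variant, disabled in favor of
# the DataFrame version in the next cell)
# train_X = train.iloc[:, 1:].values
# train_y = train.Survived.values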

In [41]:
# Split the training set into target and features.
# Columns are selected by name rather than by position, so the target can
# never end up among the features if the column order of the frame changes.
train_X = train.drop(columns=['Survived'])
train_y = train['Survived']

In [42]:
# Instantiate the XGBoost classifier; no arguments are passed, so every
# hyperparameter keeps the library default
XGB = XGBClassifier()

In [43]:
# Train on the training split; the value returned by fit() is bound back to XGB
XGB = XGB.fit(train_X, train_y)

In [44]:
# Raw importance scores of the fitted model, one value per feature column
XGB.feature_importances_

array([0.06961414, 0.01772963, 0.04255553, 0.0355866 , 0.50837743,
       0.02491442, 0.05818443, 0.03021946, 0.21281832], dtype=float32)

In [45]:
# Rank the features: pair each importance (rounded to 3 decimals) with its
# column name and list the pairs from most to least important.
sorted(
    (
        (round(importance, 3), column)
        for importance, column in zip(XGB.feature_importances_, train.iloc[:, 1:].columns)
    ),
    reverse=True,
)

[(0.508, 'Sex_male'),
 (0.213, 'Pclass_3'),
 (0.07, 'SibSp'),
 (0.058, 'Embarked_S'),
 (0.043, 'Age_bin'),
 (0.036, 'Fare_bin'),
 (0.03, 'Pclass_2'),
 (0.025, 'Embarked_Q'),
 (0.018, 'Parch')]

In [15]:
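# Split into target and features (NumPy-array variant, disabled in favor of
# the DataFrame version in the next cell)
# test_x = test.iloc[:, 1:].values
# test_y = test.Survived.values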

In [46]:
# Split the hold-out set into target and features.
# Columns are selected by name rather than by position, so the target can
# never end up among the features if the column order of the frame changes.
test_x = test.drop(columns=['Survived'])
test_y = test['Survived']

In [47]:
# Check that the hold-out features and target have the same number of rows
test_x.shape, test_y.shape

((179, 9), (179,))

In [48]:
# Predict class labels for the hold-out rows
pred_y = XGB.predict(test_x)

In [49]:
# Confusion matrix on the hold-out set (rows: true class, columns: predicted class)
confusion_matrix(test_y, pred_y)

array([[95, 10],
       [23, 51]], dtype=int64)

In [50]:
# Fraction of hold-out rows whose predicted label matches the true label
accuracy_score(test_y, pred_y)

0.8156424581005587

In [51]:
# Load the frame to be scored for the submission file (stored as pd_test.pk2).
# NOTE(review): unpickling can execute arbitrary code — only load files that
# were produced locally or come from a trusted source.
valid = pd.read_pickle('./pd_test.pk2')

In [52]:
# Inspect the loaded frame's dimensions as (rows, columns)
valid.shape

(418, 15)

In [54]:
# Keep the passenger IDs as an array; they are used later as the index of
# the result frame
valid_pass = valid.PassengerId.values

In [55]:
# Feature frame for scoring: every column except the passenger identifier.
# drop() selects the column by name and returns an independent DataFrame
# rather than a slice of `valid`, so later modifications of valid_X cannot
# trigger pandas' SettingWithCopyWarning.
valid_X = valid.drop(columns=['PassengerId'])

In [56]:
# Remove the same columns that were dropped from the training frame so the
# feature set matches what the model was fitted on.
# The result is rebound instead of using inplace=True: an in-place drop on a
# frame obtained by slicing another frame triggers SettingWithCopyWarning,
# whereas rebinding is safe either way.
valid_X = valid_X.drop(
    columns=['Age', 'Fare', 'Sex_female', 'Embarked_C', 'Pclass_1'],
)

In [57]:
# Compare dimensions: the column counts must agree for the fitted model to
# accept valid_X
valid_X.shape, train_X.shape

((418, 9), (712, 9))

In [58]:
# valid_X_2 = valid_X.rename(columns={'SibSp':'f0'})

In [59]:
# valid_X_2.rename(columns={'Parch':'f1', 'Age_bin':'f2', 'Fare_bin':'f3', 'Sex_male':'f4', 'Embarked_Q':'f5', 'Embarked_S':'f6', 'Pclass_2':'f7', 'Pclass_3':'f8'}, inplace=True)

In [60]:
# Predict class labels for the rows to be submitted
pred_valid_y = XGB.predict(valid_X)

In [61]:
# Check the number of predictions produced
pred_valid_y.shape

(418,)

In [62]:
# Confirm the container types of the IDs and the predictions before
# assembling the result frame
type(valid_pass), type(pred_valid_y)

(numpy.ndarray, numpy.ndarray)

In [63]:
# Assemble the result frame: the predictions form the single 'Survived'
# column and the passenger IDs form the index.
result_df = pd.DataFrame(
    data=pred_valid_y,
    index=valid_pass,
    columns=['Survived'],
)

In [64]:
# Write the predictions to CSV; the index column is written under the
# header 'PassengerId'
result_df.to_csv("./XGB_1.csv", index_label='PassengerId')