## XGBデフォルト

In [1]:
import numpy as np
import pandas as pd

import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error, confusion_matrix 
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split


In [3]:
# Load the training data from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
train_pkl = pd.read_pickle('./train.pk4')

In [4]:
# Sanity-check the loaded frame's (rows, columns).
train_pkl.shape

(72326, 10)

## 訓練データとテストデータに分割

In [5]:
# Drop the object-dtype columns; they are removed before any model training.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it no longer raises KeyError once
# the columns are already gone.
train_pkl = train_pkl.drop(columns=['grade', 'purpose'], errors='ignore')

In [6]:
# Hold out 20% of the rows as a test set; random_state fixes the split so it is reproducible.
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [7]:
# Separate the target from the features.
# The target is the `loan_status` column; the features are every column
# except the first and the last, selected by position.
train_y = train['loan_status']
train_X = train.iloc[:, 1:-1]

## 訓練データで訓練

In [8]:
# Confirm that features and target have the same number of rows.
train_X.shape, train_y.shape

((57860, 6), (57860,))

In [9]:
# Instantiate the classifier with no arguments, i.e. with the library's default hyperparameters.
XGB = XGBClassifier()

In [10]:
# Fit on the training split; the name is rebound to whatever fit() returns.
XGB = XGB.fit(train_X, train_y)

In [11]:
# Importance scores of the fitted model, one value per column of train_X.
XGB.feature_importances_

array([0.03817088, 0.17598003, 0.6316628 , 0.07727711, 0.04400125,
       0.03290791], dtype=float32)

In [12]:
# Pair each importance (rounded to 3 decimals) with its feature name and
# list the pairs from most to least important.
sorted(
    (
        (round(score, 3), column)
        for score, column in zip(XGB.feature_importances_, train_X.columns)
    ),
    reverse=True,
)

[(0.632, 'interest_rate'),
 (0.176, 'term'),
 (0.077, 'employment_length'),
 (0.044, 'credit_score'),
 (0.038, 'loan_amnt'),
 (0.033, 'application_type')]

## テストデータで実行

In [13]:
# Separate the target from the features for the held-out split, using the
# same positional column selection (all but first and last) as for training.
test_y = test['loan_status']
test_x = test.iloc[:, 1:-1]

In [14]:
# Check that feature and target row counts match the test split.
test_x.shape, test_y.shape, test.shape

((14466, 6), (14466,), (14466, 8))

In [15]:
# Predict class labels for the held-out test features.
pred_y = XGB.predict(test_x)

In [16]:
# Confusion matrix on the test split: rows are true labels, columns are predicted labels.
confusion_matrix(test_y, pred_y)

array([[2351, 3590],
       [1203, 7322]], dtype=int64)

In [17]:
# Sum of the true labels vs. sum of the predicted labels
# (i.e. the number of positives in each, assuming 0/1 labels — TODO confirm).
test_y.sum(), pred_y.sum()

(8525, 10912)

In [18]:
# Accuracy on the held-out test split.
accuracy_score(test_y, pred_y)

0.6686713673441173

In [19]:
# Precision on the held-out test split.
precision_score(test_y, pred_y)

0.6710043988269795

In [20]:
# Recall on the held-out test split.
recall_score(test_y, pred_y)

0.8588856304985337

In [21]:
# F1 score on the held-out test split.
f1_score(test_y, pred_y)

0.7534084478057315

## 検証データで実行

In [22]:
# Load the validation data (the rows that predictions are generated for below).
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
valid = pd.read_pickle('./test.pk4')

In [23]:
# Sanity-check the loaded frame's (rows, columns).
valid.shape

(26900, 9)

In [24]:
# Keep the IDs so they can be used later as the index of the prediction output.
valid_pass = valid.id.values

In [25]:
# Work on a copy so the loaded `valid` frame itself is not modified.
valid_X = valid.copy()

In [26]:
# Build the feature frame by dropping the ID and the two columns that were
# also removed from the training data.
# Deriving it from `valid` (instead of dropping in place on valid_X) keeps
# this cell idempotent: re-running it does not raise KeyError.
valid_X = valid.drop(columns=['id', 'grade', 'purpose'])

In [27]:
# The number of feature columns must match between validation and training data.
valid_X.shape, train_X.shape

((26900, 6), (57860, 6))

In [28]:
# Predict class labels for the validation features with the fitted model.
pred_valid_y = XGB.predict(valid_X)

In [29]:
# Expect one prediction per validation row.
pred_valid_y.shape

(26900,)

In [30]:
# type(valid_pass), type(pred_valid_y)

In [31]:
# One-column frame of predictions ('y'), indexed by the saved IDs.
result_df = pd.DataFrame({'y': pred_valid_y}, index=valid_pass)

In [32]:
# Write the predictions to CSV without a header row; the index (the IDs) is
# written as the first column because to_csv's `index` option is left at its default.
result_df.to_csv("./XGB_4_1.csv", header=False)