In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import lightgbm as lgb

from sklearn.metrics import mean_squared_error, confusion_matrix 
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, metrics


In [3]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('./train.pk3')

In [4]:
train_pkl.shape

(92305, 10)

In [5]:
# object 型は削除
train_pkl.drop(['grade', 'purpose'], axis=1, inplace=True)

In [6]:
train_pkl.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,employment_length,credit_score,application_type,loan_status
21860,21860,600.756791,3,15.113792,10,657.604689,0,0
32628,32628,525.614417,3,22.525596,10,660.326798,0,0
174851,174851,651.834264,3,18.455408,2,686.820106,0,1
235341,235341,618.351551,3,11.915945,10,664.691527,0,0
94315,94315,679.590903,3,17.419674,10,656.75438,0,0


In [7]:
# train_pk2 = train_pkl.copy()
train_pkl.shape

(92305, 8)

## 訓練データとテストデータに分割

In [8]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [9]:
# ターゲットと特徴量の分割
train_X = train.iloc[:, 1:-1]
train_y = train.loan_status

In [10]:
# ターゲットと特徴量の分割
test_X = test.iloc[:, 1:-1]
test_y = test.loan_status

In [11]:
# LightGBMのモデルへデータセットの初期化セット
train_data = lgb.Dataset(train_X, label=train_y)
test_data =  lgb.Dataset(test_X, label=test_y, reference=train_data)

## LightGBM

In [None]:
# パラメータ
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'verbose': 2
}

In [None]:
# 訓練
gbm = lgb.train(
    params,
    train_data,
    valid_sets=test_data,
    num_boost_round=1000,
    verbose_eval=50,
    early_stopping_rounds=100
)


In [12]:
# scikit-learn経由で
gbm2 = lgb.LGBMClassifier(objective='binary', n_estimators=1000, seed=42)

In [13]:
gbm2.fit(train_X, train_y, eval_set=[(test_X, test_y)], verbose=50)

[50]	valid_0's binary_logloss: 0.614471
[100]	valid_0's binary_logloss: 0.614606
[150]	valid_0's binary_logloss: 0.615265
[200]	valid_0's binary_logloss: 0.615996
[250]	valid_0's binary_logloss: 0.616699
[300]	valid_0's binary_logloss: 0.617534
[350]	valid_0's binary_logloss: 0.618131
[400]	valid_0's binary_logloss: 0.6188
[450]	valid_0's binary_logloss: 0.619497
[500]	valid_0's binary_logloss: 0.620539
[550]	valid_0's binary_logloss: 0.621543
[600]	valid_0's binary_logloss: 0.622468
[650]	valid_0's binary_logloss: 0.623611
[700]	valid_0's binary_logloss: 0.624704
[750]	valid_0's binary_logloss: 0.625762
[800]	valid_0's binary_logloss: 0.626803
[850]	valid_0's binary_logloss: 0.627386
[900]	valid_0's binary_logloss: 0.628264
[950]	valid_0's binary_logloss: 0.629189
[1000]	valid_0's binary_logloss: 0.629771


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1000, n_jobs=-1, num_leaves=31, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=42,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [14]:
# preds = gbm2.predict(test_X, num_iteration=gbm2.best_iteration)
preds = gbm2.predict(test_X)


In [15]:
preds = gbm2.predict_proba(test_X)

In [16]:
y_pred = np.where(preds < 0.5, 0, 1)

In [17]:
pred_y = np.argmax(y_pred, axis=1)

In [18]:
confusion_matrix(test_y, pred_y)

array([[6935, 2994],
       [3778, 4754]], dtype=int64)

In [19]:
test_y.sum(), pred_y.sum()

(8532, 7748)

In [20]:
accuracy_score(test_y, pred_y)

0.6331726342018309

In [21]:
precision_score(test_y, pred_y)

0.6135776974703149

In [22]:
recall_score(test_y, pred_y)

0.5571964369432724

In [23]:
f1_score(test_y, pred_y)

0.584029484029484

In [None]:
roc_auc_score(test_y, pred_y)

In [None]:
fpr, tpr, thr = roc_curve(test_y, pred_y)

In [None]:
auc = metrics.auc(fpr, tpr)

In [None]:
auc

In [None]:
plt.plot(fpr, tpr, label="ROC Curve %.2f"%auc)
plt.legend()
plt.xlabel('FPR')
plt.ylabel("TPR")
plt.grid()
plt.show()

## 検証データで実行

In [24]:
# 検証データ読み込み
valid = pd.read_pickle('./test.pk3')

In [25]:
# ID の保存
valid_pass = valid.id.values

In [26]:
# valid_X = valid.iloc[:, 1:]
valid_X = valid.copy()

In [27]:
valid_X.drop(['id', 'grade', 'purpose'], axis=1, inplace=True)

In [28]:
valid_X.shape, train_X.shape

((26900, 6), (73844, 6))

In [29]:
pred_valid_y = gbm2.predict(valid_X)

In [30]:
pred_valid_y.shape

(26900,)

In [31]:
preds = gbm2.predict_proba(valid_X)

In [32]:
y_pred = np.where(preds < 0.5, 0, 1)

In [33]:
pred_y = np.argmax(y_pred, axis=1)

In [34]:
# type(valid_pass), type(pred_valid_y)

In [35]:
result_df = pd.DataFrame(pred_y, valid_pass, columns=['y'])

In [36]:
result_df.to_csv("./LGBM_3_1.csv", header=False)

In [None]:
result_df