In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import lightgbm as lgb

from sklearn.metrics import mean_squared_error, confusion_matrix 
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, metrics


In [2]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('../001/train.pk1')

In [3]:
train_pkl.shape

(242150, 10)

In [4]:
# object 型は削除
train_pkl.drop(['grade', 'purpose'], axis=1, inplace=True)

In [5]:
train_pkl.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,employment_length,credit_score,application_type,loan_status
0,0,609.296068,3,8.421982,0,714.061803,0,0
1,1,1183.266999,5,10.286776,10,697.706701,0,1
2,2,695.783256,3,14.723425,1,656.419357,0,0
3,3,738.392546,3,14.260708,0,657.906852,0,0
4,4,1642.400654,5,25.217452,10,662.972297,0,0


In [6]:
# train_pk2 = train_pkl.copy()
train_pkl.shape

(242150, 8)

## 訓練データとテストデータに分割

In [7]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [8]:
# ターゲットと特徴量の分割
train_X = train.iloc[:, 1:-1]
train_y = train.loan_status

In [9]:
# ターゲットと特徴量の分割
test_X = test.iloc[:, 1:-1]
test_y = test.loan_status

In [None]:
# LightGBMのモデルへデータセットの初期化セット
train_data = lgb.Dataset(train_X, label=train_y)
test_data =  lgb.Dataset(test_X, label=test_y, reference=train_data)

## LightGBM

In [None]:
# パラメータ
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'verbose': 2
}

In [None]:
# 訓練
gbm = lgb.train(
    params,
    train_data,
    valid_sets=test_data,
    num_boost_round=1000,
    verbose_eval=50,
    early_stopping_rounds=100
)


In [10]:
# scikit-learn経由で
gbm2 = lgb.LGBMClassifier(objective='binary', n_estimators=1000, seed=42)

In [11]:
gbm2.fit(train_X, train_y, eval_set=[(test_X, test_y)], verbose=50)

[50]	valid_0's binary_logloss: 0.421087
[100]	valid_0's binary_logloss: 0.421024
[150]	valid_0's binary_logloss: 0.421117
[200]	valid_0's binary_logloss: 0.421441
[250]	valid_0's binary_logloss: 0.421689
[300]	valid_0's binary_logloss: 0.421997
[350]	valid_0's binary_logloss: 0.422384
[400]	valid_0's binary_logloss: 0.422841
[450]	valid_0's binary_logloss: 0.423102
[500]	valid_0's binary_logloss: 0.423315
[550]	valid_0's binary_logloss: 0.423545
[600]	valid_0's binary_logloss: 0.423878
[650]	valid_0's binary_logloss: 0.424244
[700]	valid_0's binary_logloss: 0.424572
[750]	valid_0's binary_logloss: 0.424771
[800]	valid_0's binary_logloss: 0.425156
[850]	valid_0's binary_logloss: 0.425546
[900]	valid_0's binary_logloss: 0.425966
[950]	valid_0's binary_logloss: 0.426373
[1000]	valid_0's binary_logloss: 0.42677


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1000, n_jobs=-1, num_leaves=31, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=42,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [14]:
# preds = gbm2.predict(test_X, num_iteration=gbm2.best_iteration)
preds = gbm2.predict(test_X)


In [15]:
preds = gbm2.predict_proba(test_X)

In [16]:
y_pred = np.where(preds < 0.5, 0, 1)

In [17]:
pred_y = np.argmax(y_pred, axis=1)

In [18]:
confusion_matrix(test_y, pred_y)

array([[39309,   591],
       [ 7762,   768]], dtype=int64)

In [19]:
test_y.sum(), pred_y.sum()

(8530, 1359)

In [20]:
accuracy_score(test_y, pred_y)

0.8275242618211852

In [21]:
precision_score(test_y, pred_y)

0.565121412803532

In [22]:
recall_score(test_y, pred_y)

0.09003516998827667

In [23]:
f1_score(test_y, pred_y)

0.15532409748205075

In [None]:
roc_auc_score(test_y, pred_y)

In [None]:
fpr, tpr, thr = roc_curve(test_y, pred_y)

In [None]:
auc = metrics.auc(fpr, tpr)

In [None]:
auc

In [None]:
plt.plot(fpr, tpr, label="ROC Curve %.2f"%auc)
plt.legend()
plt.xlabel('FPR')
plt.ylabel("TPR")
plt.grid()
plt.show()

## 検証データで実行

In [24]:
# 検証データ読み込み
valid = pd.read_pickle('./test.pk1')

In [25]:
# ID の保存
valid_pass = valid.id.values

In [26]:
# valid_X = valid.iloc[:, 1:]
valid_X = valid.copy()

In [30]:
valid_X.drop(['id', 'grade', 'purpose'], axis=1, inplace=True)

In [31]:
valid_X.shape, train_X.shape

((26900, 6), (193720, 6))

In [32]:
pred_valid_y = gbm2.predict(valid_X)

In [33]:
pred_valid_y.shape

(26900,)

In [34]:
preds = gbm2.predict_proba(valid_X)

In [35]:
y_pred = np.where(preds < 0.5, 0, 1)

In [36]:
pred_y = np.argmax(y_pred, axis=1)

In [37]:
# type(valid_pass), type(pred_valid_y)

In [39]:
result_df = pd.DataFrame(pred_y, valid_pass, columns=['y'])

In [40]:
result_df.to_csv("./LGBM_1.csv", header=False)

In [None]:
result_df