In [1]:
import gc
import time
from datetime import date
import pandas as pd
#import pandas_profiling as pdp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from lightgbm import LGBMClassifier

## データ読込

In [4]:
# Read the train and test CSVs, then stack them into one frame with a fresh
# 0..n-1 index so the preprocessing below is applied to both at once.
df_train = pd.read_csv("./dataset/input/train.csv")
df_test = pd.read_csv("./dataset/input/test.csv")
# Number of training rows; used later to split the combined frame again.
len_train = len(df_train)
df_all = pd.concat([df_train, df_test], sort=False, ignore_index=True)

## データ探索

## データ前処理

In [5]:
# Log transform
# Applied to the columns whose histograms showed a skewed distribution.
# balance and pdays are shifted by their own minimum before the +1 so the
# argument of the log is always >= 1; the raw columns are dropped afterwards.
df_all = df_all.assign(
    log_balance=np.log(df_all["balance"] - df_all["balance"].min() + 1),
    log_duration=np.log(df_all["duration"] + 1),
    log_campaign=np.log(df_all["campaign"] + 1),
    log_pdays=np.log(df_all["pdays"] - df_all["pdays"].min() + 1),
).drop(columns=["balance", "duration", "campaign", "pdays"])

In [6]:
# Convert month from its three-letter string to a number (jan=1 ... dec=12).
month_dict = dict(zip(
    ("jan", "feb", "mar", "apr", "may", "jun",
     "jul", "aug", "sep", "oct", "nov", "dec"),
    range(1, 13),
))
df_all["month_int"] = df_all["month"].map(month_dict)

In [7]:
# Display the combined train+test frame after the log and month transforms.
df_all

Unnamed: 0,id,age,job,marital,education,default,housing,loan,contact,day,month,previous,poutcome,y,log_balance,log_duration,log_campaign,log_pdays,month_int
0,0,31,services,married,secondary,no,yes,no,cellular,21,nov,0,other,0.0,9.859640,4.624973,1.386294,6.214608,11
1,1,29,entrepreneur,single,tertiary,no,no,no,cellular,22,aug,0,unknown,1.0,10.817275,5.068904,1.098612,6.556778,8
2,2,35,management,married,tertiary,no,yes,no,cellular,11,nov,0,failure,0.0,9.857444,5.863631,0.693147,6.719013,11
3,3,31,technician,married,secondary,no,yes,yes,unknown,16,may,0,failure,0.0,11.570902,6.490724,1.098612,4.804021,5
4,4,48,unemployed,married,primary,no,yes,no,telephone,3,apr,0,unknown,0.0,10.796571,5.181784,0.693147,5.616771,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45145,18045,49,self-employed,married,tertiary,no,yes,no,cellular,6,jul,0,failure,,11.563666,4.624973,1.098612,6.037871,7
45146,18046,34,blue-collar,married,secondary,no,yes,no,cellular,12,may,0,unknown,,10.504218,5.846439,0.693147,6.705639,5
45147,18047,34,admin.,single,secondary,no,yes,no,unknown,16,may,0,unknown,,11.523945,4.804021,1.098612,5.918894,5
45148,18048,31,technician,single,secondary,no,yes,no,unknown,15,may,0,unknown,,11.189008,5.846439,1.098612,3.761200,5


In [11]:
# Build a "2020-<month>-<day>" string per row (the year is fixed to 2020 and
# month/day are not zero-padded) in a copy of df_all with the extra column.
data_time = df_all.assign(
    ymd_str="2020-" + df_all["month_int"].astype(str) + "-" + df_all["day"].astype(str)
)

In [12]:
# Display the frame with the added ymd_str column.
data_time

Unnamed: 0,id,age,job,marital,education,default,housing,loan,contact,day,month,previous,poutcome,y,log_balance,log_duration,log_campaign,log_pdays,month_int,ymd_str
0,0,31,services,married,secondary,no,yes,no,cellular,21,nov,0,other,0.0,9.859640,4.624973,1.386294,6.214608,11,2020-11-21
1,1,29,entrepreneur,single,tertiary,no,no,no,cellular,22,aug,0,unknown,1.0,10.817275,5.068904,1.098612,6.556778,8,2020-8-22
2,2,35,management,married,tertiary,no,yes,no,cellular,11,nov,0,failure,0.0,9.857444,5.863631,0.693147,6.719013,11,2020-11-11
3,3,31,technician,married,secondary,no,yes,yes,unknown,16,may,0,failure,0.0,11.570902,6.490724,1.098612,4.804021,5,2020-5-16
4,4,48,unemployed,married,primary,no,yes,no,telephone,3,apr,0,unknown,0.0,10.796571,5.181784,0.693147,5.616771,4,2020-4-3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45145,18045,49,self-employed,married,tertiary,no,yes,no,cellular,6,jul,0,failure,,11.563666,4.624973,1.098612,6.037871,7,2020-7-6
45146,18046,34,blue-collar,married,secondary,no,yes,no,cellular,12,may,0,unknown,,10.504218,5.846439,0.693147,6.705639,5,2020-5-12
45147,18047,34,admin.,single,secondary,no,yes,no,unknown,16,may,0,unknown,,11.523945,4.804021,1.098612,5.918894,5,2020-5-16
45148,18048,31,technician,single,secondary,no,yes,no,unknown,15,may,0,unknown,,11.189008,5.846439,1.098612,3.761200,5,2020-5-15


In [None]:
# Drop the columns that are no longer needed.
df_all = df_all.drop(columns=["month", "day", "month_int"])
# The date-string frame is not used by any later cell, so release it.
# (No `index` variable is created by this notebook, so there is nothing else
# to delete here; deleting an undefined name would raise NameError.)
del data_time

In [11]:
# One-hot encode the categorical columns.
cat_cols = [
    "job", "marital", "education", "default",
    "housing", "loan", "contact", "poutcome",
]
df_dummy = pd.get_dummies(df_all.loc[:, cat_cols])

In [12]:
# Combine the columns needed for the analysis: numeric features and the
# target `y`, followed by the one-hot encoded columns.
# NOTE(review): a "datetime_int" feature was noted here but is not built by
# any visible cell and is not included -- confirm whether it is still planned.
df_tmp = df_all.loc[
    :, ["age", "log_balance", "log_duration", "log_campaign", "log_pdays", "y"]
]
df = pd.concat([df_tmp, df_dummy], axis=1)

In [13]:
# Display the modelling table (numeric features, target, one-hot columns).
df

Unnamed: 0,age,log_balance,log_duration,log_campaign,log_pdays,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,housing_yes,loan_no,loan_yes,contact_cellular,contact_telephone,contact_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,31,9.859640,4.624973,1.386294,6.214608,0.0,0,0,0,0,...,1,1,0,1,0,0,0,1,0,0
1,29,10.817275,5.068904,1.098612,6.556778,1.0,0,0,1,0,...,0,1,0,1,0,0,0,0,0,1
2,35,9.857444,5.863631,0.693147,6.719013,0.0,0,0,0,0,...,1,1,0,1,0,0,1,0,0,0
3,31,11.570902,6.490724,1.098612,4.804021,0.0,0,0,0,0,...,1,0,1,0,0,1,1,0,0,0
4,48,10.796571,5.181784,0.693147,5.616771,0.0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45145,49,11.563666,4.624973,1.098612,6.037871,,0,0,0,0,...,1,1,0,1,0,0,1,0,0,0
45146,34,10.504218,5.846439,0.693147,6.705639,,0,1,0,0,...,1,1,0,1,0,0,0,0,0,1
45147,34,11.523945,4.804021,1.098612,5.918894,,1,0,0,0,...,1,1,0,0,0,1,0,0,0,1
45148,31,11.189008,5.846439,1.098612,3.761200,,0,0,0,0,...,1,1,0,0,0,1,0,0,0,1


## モデリング

LightGBM を試す  
Kaggle の [Home Credit Default Risk](https://www.kaggle.com/ogrellier/good-fun-with-ligthgbm/code) の kernel を参考にして実装

In [14]:
# Make sure automatic garbage collection is switched on before training.
gc.enable()

In [15]:
# Cross-validation splitter: 5 shuffled folds with a fixed seed, so repeated
# calls to folds.split(...) on the same data yield the same partitions.
folds = KFold(
    n_splits=5,
    shuffle=True,
    random_state=546789,
)

In [16]:
# The first len_train rows of df came from train.csv, the remainder from
# test.csv; separate them again and pull the target out of the features.
data = df.iloc[:len_train].drop(columns=["y"])
test = df.iloc[len_train:].drop(columns=["y"])
y = df["y"].iloc[:len_train]

In [17]:
def train_model(data_, test_, y_, folds_):
    """Train one LightGBM classifier per fold and collect the results.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Training features; every column is used as a feature.
    test_ : pandas.DataFrame
        Test features; must contain all columns of ``data_``.
    y_ : pandas.Series
        Target values, positionally aligned with the rows of ``data_``.
    folds_ : splitter
        Object exposing ``split(data_)`` and ``n_splits`` (e.g. ``KFold``).

    Returns
    -------
    oof_preds : numpy.ndarray
        Out-of-fold positive-class probability for every row of ``data_``.
    sub_preds : numpy.ndarray
        Positive-class probability for every row of ``test_``, averaged over
        the fold models.
    feature_importance_df : pandas.DataFrame
        One row per (feature, fold) with columns ``feature``, ``importance``
        and ``fold`` (1-based).
    """
    oof_preds = np.zeros(data_.shape[0])
    sub_preds = np.zeros(test_.shape[0])

    feats = list(data_.columns)

    # Collected per fold and concatenated once after the loop.
    fold_importances = []

    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(data_)):
        trn_x, trn_y = data_[feats].iloc[trn_idx], y_.iloc[trn_idx]
        val_x, val_y = data_[feats].iloc[val_idx], y_.iloc[val_idx]

        clf = LGBMClassifier(
            n_estimators=4000,
            learning_rate=0.03,
            num_leaves=30,
            colsample_bytree=.8,
            subsample=.9,
            max_depth=7,
            reg_alpha=.1,
            reg_lambda=.1,
            min_split_gain=.01,
            min_child_weight=2,
            silent=-1,
            verbose=-1,
        )

        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords
        # (and the `silent` constructor argument) were removed in LightGBM 4.0;
        # switch to the early_stopping / log_evaluation callbacks when upgrading.
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric='auc', verbose=100, early_stopping_rounds=100  #30
               )

        # Predict with the early-stopped iteration rather than all 4000 trees.
        oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_[feats], num_iteration=clf.best_iteration_)[:, 1] / folds_.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        fold_importances.append(fold_importance_df)

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()

    # Score against the target that was passed in, not a notebook-level global.
    print('Full AUC score %.6f' % roc_auc_score(y_, oof_preds))

    if fold_importances:
        feature_importance_df = pd.concat(fold_importances, axis=0)
    else:
        feature_importance_df = pd.DataFrame()

    return oof_preds, sub_preds, feature_importance_df

In [18]:
def display_importances(feature_importance_df_):
    """Plot the top features by mean importance and save the figure.

    ``feature_importance_df_`` needs ``feature`` and ``importance`` columns.
    The (at most) 50 features with the highest mean importance are kept and
    drawn as a horizontal bar plot, which is written to
    ``lgbm_importances.png``.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
    )
    top_features = mean_importance.sort_values(by="importance", ascending=False).head(50).index

    # Keep every per-fold row belonging to one of the selected features.
    is_top = feature_importance_df_["feature"].isin(top_features)
    best_features = feature_importance_df_.loc[is_top]
    ordered = best_features.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8,10))
    sns.barplot(x="importance", y="feature", data=ordered)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

In [19]:
def display_roc_curve(y_, oof_preds_, folds_idx_):
    """Plot per-fold and overall ROC curves and save them to ``roc_curve.png``.

    Parameters
    ----------
    y_ : pandas.Series
        True labels, indexed positionally via ``.iloc``.
    oof_preds_ : numpy.ndarray
        Out-of-fold scores aligned with ``y_``.
    folds_idx_ : iterable of (train_idx, val_idx)
        Fold index pairs; only the validation indices are used.
    """
    plt.figure(figsize=(6,6))
    scores = []
    for n_fold, (_, val_idx) in enumerate(folds_idx_):
        # ROC curve and AUC restricted to this fold's validation rows.
        fpr, tpr, _ = roc_curve(y_.iloc[val_idx], oof_preds_[val_idx])
        score = roc_auc_score(y_.iloc[val_idx], oof_preds_[val_idx])
        scores.append(score)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.4f)' % (n_fold + 1, score))

    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Luck', alpha=.8)

    # Curve over all out-of-fold predictions; the +/- term is the standard
    # deviation of the per-fold AUCs.
    fpr, tpr, _ = roc_curve(y_, oof_preds_)
    score = roc_auc_score(y_, oof_preds_)
    # Raw string: "\p" is not a valid escape sequence in a normal literal.
    plt.plot(fpr, tpr, color='b',
             label=r'Avg ROC (AUC = %0.4f $\pm$ %0.4f)' % (score, np.std(scores)),
             lw=2, alpha=.8)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('LightGBM ROC Curve')
    plt.legend(loc="lower right")
    plt.tight_layout()

    plt.savefig('roc_curve.png')

In [20]:
def display_precision_recall(y_, oof_preds_, folds_idx_):
    """Plot per-fold and overall precision-recall curves and save the figure.

    The figure is written to ``recall_precision_curve.png`` with recall on
    the x axis and precision on the y axis.

    Parameters
    ----------
    y_ : pandas.Series
        True labels, indexed positionally via ``.iloc``.
    oof_preds_ : numpy.ndarray
        Out-of-fold scores aligned with ``y_``.
    folds_idx_ : iterable of (train_idx, val_idx)
        Fold index pairs; only the validation indices are used.
    """
    plt.figure(figsize=(6,6))

    scores = []
    for n_fold, (_, val_idx) in enumerate(folds_idx_):
        # Precision-recall curve (not ROC) for this fold's validation rows.
        precision, recall, _ = precision_recall_curve(y_.iloc[val_idx], oof_preds_[val_idx])
        score = average_precision_score(y_.iloc[val_idx], oof_preds_[val_idx])
        scores.append(score)
        # x = recall, y = precision, matching the axis labels below.
        plt.plot(recall, precision, lw=1, alpha=0.3, label='PR fold %d (AP = %0.4f)' % (n_fold + 1, score))

    # Curve over all out-of-fold predictions; the +/- term is the standard
    # deviation of the per-fold average-precision scores.
    precision, recall, _ = precision_recall_curve(y_, oof_preds_)
    score = average_precision_score(y_, oof_preds_)
    # Raw string: "\p" is not a valid escape sequence in a normal literal.
    plt.plot(recall, precision, color='b',
             label=r'Avg PR (AP = %0.4f $\pm$ %0.4f)' % (score, np.std(scores)),
             lw=2, alpha=.8)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('LightGBM Recall / Precision')
    plt.legend(loc="best")
    plt.tight_layout()

    plt.savefig('recall_precision_curve.png')

In [21]:
# Run the cross-validated training: out-of-fold predictions for the training
# rows, fold-averaged predictions for the test rows, per-fold importances.
oof_preds, test_preds, importances = train_model(
    data_=data,
    test_=test,
    y_=y,
    folds_=folds,
)

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.868044	training's binary_logloss: 0.202359	valid_1's auc: 0.798953	valid_1's binary_logloss: 0.221859
[200]	training's auc: 0.898021	training's binary_logloss: 0.186319	valid_1's auc: 0.805678	valid_1's binary_logloss: 0.218619
[300]	training's auc: 0.918183	training's binary_logloss: 0.174882	valid_1's auc: 0.805694	valid_1's binary_logloss: 0.218492
Early stopping, best iteration is:
[229]	training's auc: 0.904428	training's binary_logloss: 0.18279	valid_1's auc: 0.806505	valid_1's binary_logloss: 0.2182
Fold  1 AUC : 0.806505
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.861908	training's binary_logloss: 0.201048	valid_1's auc: 0.835596	valid_1's binary_logloss: 0.226713
[200]	training's auc: 0.893711	training's binary_logloss: 0.184892	valid_1's auc: 0.839618	valid_1's binary_logloss: 0.222617
[300]	training's auc: 0.913708	training's binary_logloss: 0.17414

In [22]:
# Pair each test id with its predicted probability and write the result as a
# two-column CSV without a header row or index.
df_submission = df_test.loc[:, ["id"]].assign(y=test_preds)
df_submission.to_csv(
    "./dataset/output/pred2.csv",
    index=False,
    header=None,
    encoding="utf-8",
)

In [19]:
# Draw the diagnostic plots. The fold indices are materialised once; because
# `folds` has a fixed random_state they match the splits used for training.
folds_idx = list(folds.split(data))
display_importances(importances)
display_roc_curve(y, oof_preds, folds_idx)
display_precision_recall(y, oof_preds, folds_idx)