In [14]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np 
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


In [15]:

# Number of stratified cross-validation folds used by the model cells.
NUM_FOLD = 5

# Training and test tables, indexed by their 'id' column.
train = pd.read_csv('../dataset/train.csv', index_col='id')
test = pd.read_csv('../dataset/test.csv', index_col='id')
# Extra labelled table; read without index_col, so it keeps pandas' default index.
original = pd.read_csv('../dataset/original.csv')
# Submission template; its 'loan_status' column is filled with predictions later.
sample_submission = pd.read_csv('../dataset/sample_submission.csv')



In [16]:

# Names of the categorical feature columns, taken from the test frame so the
# target is never included. 'category' is selected alongside 'object' so that
# re-running this cell after the conversion below still finds the same columns
# (with 'object' alone a second run would yield an empty list).
cat_cols = list(test.select_dtypes(include=['object', 'category']).columns)

frames = [train, test, original]

for df in frames:
    print(df.columns)

# Cast every categorical column to ONE shared CategoricalDtype across all three
# frames. Casting each frame independently lets pandas infer the levels per
# frame: if a level is missing from one frame the integer codes no longer line
# up between train and test, and pd.concat of frames whose categories differ
# silently falls back to object dtype. Levels are sorted, which is also the
# order pandas infers on its own, so matching frames are encoded as before.
for col in cat_cols:
    as_text = [df[col].astype('str') for df in frames]
    levels = sorted(set().union(*(values.unique() for values in as_text)))
    shared_dtype = pd.CategoricalDtype(categories=levels)
    for df, text_col in zip(frames, as_text):
        df[col] = text_col.astype(shared_dtype)


Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file',
       'cb_person_cred_hist_length', 'loan_status'],
      dtype='object')
Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file',
       'cb_person_cred_hist_length'],
      dtype='object')
Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')


In [17]:

# Split each labelled frame into its target column and the remaining features.
y = train['loan_status']
X = train.drop(columns=['loan_status'])

y_original = original['loan_status']
X_original = original.drop(columns=['loan_status'])




In [20]:

# Stratified K-fold cross-validation of an XGBoost classifier.
# Each fold trains on the fold's training rows plus the whole `original` table,
# is scored by ROC AUC on the held-out rows, and predicts the test set; the
# per-fold test predictions are averaged and written to the submission file.
val_scores = []          # ROC AUC on the held-out part of each fold
test_preds_model = []    # per-fold positive-class probabilities for `test`

skf = StratifiedKFold(n_splits=NUM_FOLD, shuffle=True, random_state=1)

params_xgb = {
    'enable_categorical': True,   # consume the pandas 'category' columns directly
    'random_state': 1,
    'n_estimators': 10000,
    'learning_rate': 0.01,
    'colsample_bytree': 0.6,
    'reg_lambda': 0.01,
    'max_depth': 4,
    'max_bin': 5000,
    'subsample': 0.95,
    'reg_alpha': 0.1,
}

for Fold, (train_index, val_index) in enumerate(skf.split(X, y)):

    # skf.split yields POSITIONAL indices, so both X and y must be sliced with
    # .iloc. Plain y[...] would look the integers up as 'id' labels, which is
    # only correct when the ids happen to be exactly 0..n-1.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Augment the training part only; validation rows stay untouched so the
    # fold score is measured on `train` rows alone.
    X_train = pd.concat([X_train, X_original], axis=0)
    y_train = pd.concat([y_train, y_original])

    model = XGBClassifier(**params_xgb)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred = model.predict_proba(X_val)[:, 1]

    roc_auc_score_ = roc_auc_score(y_val, y_pred)

    print(f'Fold {Fold}: roc_auc_score= {roc_auc_score_:.5f}')

    val_scores.append(roc_auc_score_)

    test_preds_model.append(model.predict_proba(test)[:, 1])

# Average the per-fold test predictions element-wise.
test_preds_model = np.mean(test_preds_model, axis=0)

print(f'mean validation roc_auc_score = {np.mean(val_scores):.5f}')
print(f'std validation roc_auc_score = {np.std(val_scores):.5f}')


# Assigned by position: assumes sample_submission rows are in the same order
# as `test` — TODO confirm.
sample_submission['loan_status'] = test_preds_model
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head()


Fold 0: roc_auc_score= 0.96612
Fold 1: roc_auc_score= 0.96461
Fold 2: roc_auc_score= 0.96702
Fold 3: roc_auc_score= 0.96587
Fold 4: roc_auc_score= 0.96444
mean validation roc_auc_score = 0.96561
std validation roc_auc_score = 0.00097


Unnamed: 0,id,loan_status
0,58645,0.999589
1,58646,0.031124
2,58647,0.518716
3,58648,0.007152
4,58649,0.048913


In [21]:
from lightgbm import LGBMClassifier

# Stratified K-fold cross-validation of a LightGBM classifier, using the same
# fold split (same seed) as the XGBoost cell. Each fold trains on the fold's
# training rows plus the whole `original` table, is scored by ROC AUC on the
# held-out rows, and predicts the test set; the per-fold test predictions are
# averaged and written to the submission file.
val_scores = []          # ROC AUC on the held-out part of each fold
test_preds_model = []    # per-fold positive-class probabilities for `test`

skf = StratifiedKFold(n_splits=NUM_FOLD, shuffle=True, random_state=1)

params_lgb = {
    'objective': 'binary',
    'metric': 'auc',
    'random_state': 42,
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 6,
    'num_leaves': 31,
    # NOTE(review): LightGBM applies row subsampling only when subsample_freq
    # (bagging_freq) is > 0, and the sklearn wrapper defaults it to 0, so this
    # value likely has no effect as written — confirm intent before adding
    # 'subsample_freq', since enabling it changes the trained model.
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}


for Fold, (train_index, val_index) in enumerate(skf.split(X, y)):

    # skf.split yields POSITIONAL indices, so both X and y must be sliced with
    # .iloc. Plain y[...] would look the integers up as 'id' labels, which is
    # only correct when the ids happen to be exactly 0..n-1.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Augment the training part only; validation rows stay untouched so the
    # fold score is measured on `train` rows alone.
    X_train = pd.concat([X_train, X_original], axis=0)
    y_train = pd.concat([y_train, y_original])

    model = LGBMClassifier(**params_lgb)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred = model.predict_proba(X_val)[:, 1]

    roc_auc_score_ = roc_auc_score(y_val, y_pred)

    print(f'Fold {Fold}: roc_auc_score= {roc_auc_score_:.5f}')

    val_scores.append(roc_auc_score_)

    test_preds_model.append(model.predict_proba(test)[:, 1])

# Average the per-fold test predictions element-wise.
test_preds_model = np.mean(test_preds_model, axis=0)

print(f'mean validation roc_auc_score = {np.mean(val_scores):.5f}')
print(f'std validation roc_auc_score = {np.std(val_scores):.5f}')


# Assigned by position: assumes sample_submission rows are in the same order
# as `test` — TODO confirm.
sample_submission['loan_status'] = test_preds_model
# NOTE: this is the same 'submission.csv' path the XGBoost cell writes to, so
# running both cells leaves only the LightGBM predictions on disk.
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head()


[LightGBM] [Info] Number of positive: 13788, number of negative: 65709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001037 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 895
[LightGBM] [Info] Number of data points in the train set: 79497, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.173441 -> initscore=-1.561437
[LightGBM] [Info] Start training from score -1.561437
Fold 0: roc_auc_score= 0.96281
[LightGBM] [Info] Number of positive: 13788, number of negative: 65709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000569 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 916
[LightGBM] [Info] Number of data points in the train set: 79497, number of used fe

Unnamed: 0,id,loan_status
0,58645,0.999528
1,58646,0.02652
2,58647,0.490827
3,58648,0.007731
4,58649,0.05205
