In [37]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

In [3]:
# Import data
train = pd.read_csv('.././data/train.csv')
test = pd.read_csv('.././data/test.csv')

In [5]:
# Target encoder
def target_encoder(df_train, df_test, cols):
    df = pd.concat([df_train, df_test], axis=0, sort=True)
    for col in cols:
        label_map = {}
        for val in df[col].unique():
            label_map.update({val: len(df[df[col]==val])/len(df)})
        df_train[col] = df_train[col].map(label_map)
        df_test[col] = df_test[col].map(label_map)
    return df_train, df_test
 
train, test = target_encoder(train, test, ['SEX', 'EDUCATION', 'MARRIAGE'])

In [6]:
train.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
0,25150,360000,0.603733,0.467667,0.532133,25,-1,-1,-1,-1,...,3435,8870,2020,12590,3479,3446,8870,2020,27043,0
1,13470,240000,0.603733,0.352833,0.532133,46,2,2,-2,-1,...,2240,1681,2267,0,0,2240,0,2267,3074,0
2,3092,320000,0.603733,0.467667,0.4553,41,0,0,0,0,...,35409,35036,35939,5000,2000,2000,3000,3000,1500,0
3,13973,50000,0.603733,0.467667,0.532133,24,-1,-1,-2,-2,...,0,0,0,0,0,0,0,0,0,1
4,10567,80000,0.396267,0.1639,0.4553,52,-1,0,0,0,...,20561,21261,21457,1500,1500,1500,1028,876,1449,0


In [8]:
# Split data into train, validation and hold-out
drop_cols = ['ID']
X, y = train.drop(drop_cols + ['default_payment_next_month'], axis=1), train['default_payment_next_month']
X_train, X_split, y_train, y_split = train_test_split(X, y, test_size=0.3, shuffle=True, stratify=y)
X_val, X_hold, y_val, y_hold = train_test_split(X_split, y_split, test_size=0.5, shuffle=True, stratify=y_split)

print("Training data: {}".format(X_train.shape))
print("Validation data: {}".format(X_val.shape))
print("Hold-out data: {}".format(X_hold.shape))

Training data: (14700, 23)
Validation data: (3150, 23)
Hold-out data: (3150, 23)


# **XGBoost**

In [34]:
# Build model
d_train = xgb.DMatrix(data=X_train.values, label=y_train.values)
d_val = xgb.DMatrix(data=X_val.values, label=y_val.values)
d_hold = xgb.DMatrix(data=X_hold.values, label=y_hold.values)

sc_weight = train['default_payment_next_month'].value_counts()[0]/train['default_payment_next_month'].value_counts()[1]

params = {
    'objective': 'binary:logistic',
    'eval_metric': ['auc', 'logloss'],
    'max_depth': 6,
    'learning_rate': 0.002,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'scale_pos_weight': sc_weight,
    'grow_policy': 'lossguide',
    'max_leaves': 31,
    'verbosity': 1
}

clf = xgb.train(
    params, d_train,
    num_boost_round = 5000,
    evals = [(d_train, 'Train'), (d_val, 'Val')],
    verbose_eval = 100
)

[0]	Train-auc:0.791202	Train-logloss:0.692619	Val-auc:0.744748	Val-logloss:0.692684
[100]	Train-auc:0.808576	Train-logloss:0.647567	Val-auc:0.764253	Val-logloss:0.653539
[200]	Train-auc:0.81501	Train-logloss:0.614524	Val-auc:0.769073	Val-logloss:0.625573
[300]	Train-auc:0.817222	Train-logloss:0.589814	Val-auc:0.770164	Val-logloss:0.605558
[400]	Train-auc:0.820208	Train-logloss:0.57059	Val-auc:0.771784	Val-logloss:0.590718
[500]	Train-auc:0.823203	Train-logloss:0.555706	Val-auc:0.773613	Val-logloss:0.579816
[600]	Train-auc:0.824688	Train-logloss:0.544419	Val-auc:0.776109	Val-logloss:0.571759
[700]	Train-auc:0.825931	Train-logloss:0.535509	Val-auc:0.777398	Val-logloss:0.565479
[800]	Train-auc:0.827581	Train-logloss:0.527923	Val-auc:0.778043	Val-logloss:0.56091
[900]	Train-auc:0.82965	Train-logloss:0.521404	Val-auc:0.778268	Val-logloss:0.557541
[1000]	Train-auc:0.832099	Train-logloss:0.515565	Val-auc:0.778659	Val-logloss:0.554766
[1100]	Train-auc:0.834058	Train-logloss:0.510726	Val-auc:0.

In [35]:
hold_probs = clf.predict(d_hold)
hold_preds = np.array([0 if i < 0.5 else 1 for i in hold_probs])
print(classification_report(y_hold, hold_preds))

print("\nROC AUC score: {:.5f}".format(roc_auc_score(y_hold, hold_preds)))

              precision    recall  f1-score   support

           0       0.88      0.80      0.84      2453
           1       0.47      0.62      0.53       697

    accuracy                           0.76      3150
   macro avg       0.67      0.71      0.68      3150
weighted avg       0.79      0.76      0.77      3150


ROC AUC score: 0.70880


In [43]:
# Predict on test data
test_ids = test['ID'].values
X_test = test.drop(drop_cols, axis=1)
d_test = xgb.DMatrix(data=X_test.values)

test_probs = clf.predict(d_test)
test_preds = np.array([0 if i < 0.55 else 1 for i in test_probs])
sub_df = pd.DataFrame(np.vstack((test_ids, test_preds)).T, columns=['ID', 'default_payment_next_month'])
sub_df.to_csv('.././submission/sub3_XGB.csv', index=False)

# **LightGBM**

In [18]:
# Build model
d_train = lgb.Dataset(data=X_train, label=y_train)
d_val = lgb.Dataset(data=X_val, label=y_val)
d_hold = lgb.Dataset(data=X_hold, label=y_hold)

sc_weight = train['default_payment_next_month'].value_counts()[0]/train['default_payment_next_month'].value_counts()[1]

params = {
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 11,
    'learning_rate': 0.001,
    'bagging_fraction': 1.0,
    'feature_fraction': 0.9,
    'scale_pos_weight': 3.0,
    'max_leaves': 31,
    'verbosity': 1
}

clf = lgb.train(
    params, d_train,
    num_boost_round = 2000,
    valid_sets = [d_train, d_val],
    verbose_eval = 100
)

[100]	training's auc: 0.808629	valid_1's auc: 0.774178
[200]	training's auc: 0.812458	valid_1's auc: 0.775081
[300]	training's auc: 0.815849	valid_1's auc: 0.777056
[400]	training's auc: 0.817695	valid_1's auc: 0.777316
[500]	training's auc: 0.819242	valid_1's auc: 0.777329
[600]	training's auc: 0.820775	valid_1's auc: 0.777313
[700]	training's auc: 0.822172	valid_1's auc: 0.777101
[800]	training's auc: 0.823703	valid_1's auc: 0.77695
[900]	training's auc: 0.825157	valid_1's auc: 0.777079
[1000]	training's auc: 0.82715	valid_1's auc: 0.777868
[1100]	training's auc: 0.828877	valid_1's auc: 0.778396
[1200]	training's auc: 0.830579	valid_1's auc: 0.778723
[1300]	training's auc: 0.832283	valid_1's auc: 0.779135
[1400]	training's auc: 0.833896	valid_1's auc: 0.779614
[1500]	training's auc: 0.835392	valid_1's auc: 0.780169
[1600]	training's auc: 0.83696	valid_1's auc: 0.780388
[1700]	training's auc: 0.838428	valid_1's auc: 0.780561
[1800]	training's auc: 0.839822	valid_1's auc: 0.78058
[1900

In [19]:
hold_probs = clf.predict(X_hold)
hold_preds = np.array([0 if i < 0.5 else 1 for i in hold_probs])
print(classification_report(y_hold, hold_preds))

print("\nROC AUC score: {:.5f}".format(roc_auc_score(y_hold, hold_preds)))

              precision    recall  f1-score   support

           0       0.87      0.86      0.87      2453
           1       0.53      0.56      0.55       697

    accuracy                           0.79      3150
   macro avg       0.70      0.71      0.71      3150
weighted avg       0.80      0.79      0.80      3150


ROC AUC score: 0.71078


In [20]:
# Predict on test data
test_ids = test['ID'].values
X_test = test.drop(drop_cols, axis=1)

test_probs = clf.predict(X_test)
test_preds = np.array([0 if i < 0.55 else 1 for i in test_probs])
sub_df = pd.DataFrame(np.vstack((test_ids, test_preds)).T, columns=['ID', 'default_payment_next_month'])
sub_df.to_csv('.././submission/sub10_LGB.csv', index=False)

# **CatBoost**

In [31]:
# Build model
sc_weight = train['default_payment_next_month'].value_counts()[0]/train['default_payment_next_month'].value_counts()[1]

clf = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    iterations=5000,
    learning_rate=0.001,
    depth=6,
    scale_pos_weight=sc_weight,
    max_leaves=31
)

clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose_eval=500)

0:	test: 0.7519059	test1: 0.7425772	best: 0.7425772 (0)	total: 85.2ms	remaining: 7m 6s
500:	test: 0.7812326	test1: 0.7706545	best: 0.7706545 (500)	total: 23.5s	remaining: 3m 31s
1000:	test: 0.7843602	test1: 0.7727189	best: 0.7727189 (1000)	total: 49.7s	remaining: 3m 18s
1500:	test: 0.7877433	test1: 0.7754121	best: 0.7754151 (1499)	total: 1m 16s	remaining: 2m 57s
2000:	test: 0.7911382	test1: 0.7776686	best: 0.7776780 (1998)	total: 1m 41s	remaining: 2m 32s
2500:	test: 0.7939135	test1: 0.7790896	best: 0.7790919 (2499)	total: 2m 7s	remaining: 2m 7s
3000:	test: 0.7963897	test1: 0.7803489	best: 0.7803489 (2999)	total: 2m 33s	remaining: 1m 42s
3500:	test: 0.7984848	test1: 0.7811774	best: 0.7811792 (3499)	total: 2m 59s	remaining: 1m 16s
4000:	test: 0.8004481	test1: 0.7816973	best: 0.7816997 (3999)	total: 3m 25s	remaining: 51.2s
4500:	test: 0.8021096	test1: 0.7821587	best: 0.7821587 (4499)	total: 3m 50s	remaining: 25.5s
4999:	test: 0.8036403	test1: 0.7825258	best: 0.7825299 (4989)	total: 4m 15s

<catboost.core.CatBoostClassifier at 0x7f68ef0b5828>

In [32]:
hold_probs = clf.predict(X_hold)
hold_preds = np.array([0 if i < 0.5 else 1 for i in hold_probs])
print(classification_report(y_hold, hold_preds))

print("\nROC AUC score: {:.5f}".format(roc_auc_score(y_hold, hold_preds)))

              precision    recall  f1-score   support

           0       0.89      0.79      0.83      2453
           1       0.46      0.64      0.54       697

    accuracy                           0.75      3150
   macro avg       0.67      0.71      0.69      3150
weighted avg       0.79      0.75      0.77      3150


ROC AUC score: 0.71477


In [33]:
# Predict on test data
test_ids = test['ID'].values
X_test = test.drop(drop_cols, axis=1)

test_probs = clf.predict(X_test)
test_preds = np.array([0 if i < 0.55 else 1 for i in test_probs])
sub_df = pd.DataFrame(np.vstack((test_ids, test_preds)).T, columns=['ID', 'default_payment_next_month'])
sub_df.to_csv('.././submission/sub12_CAT.csv', index=False)