In [4]:
import numpy as np
import pandas as pd
import importlib
import sys

from pytorch_tabnet.tab_model import TabNetClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from data_processing import timer, one_hot_encoder, application_train_test, bureau_and_balance, previous_applications, pos_cash, installments_payments, credit_card_balance

from collections import defaultdict, Counter

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import os
import wget
from pathlib import Path

import gc
import time
from contextlib import contextmanager
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from matplotlib import pyplot as plt
%matplotlib inline

In [5]:
# Hyper-parameters shared by every LGBMClassifier built in this notebook.
LIGHTGBM_PARAMS = {
    'boosting_type': 'goss',
    'n_estimators': 2000,
    'learning_rate': 0.005134,
    'num_leaves': 54,
    'max_depth': 10,
    'subsample_for_bin': 240000,
    'reg_alpha': 0.436193,
    'reg_lambda': 0.479169,
    'colsample_bytree': 0.508716,
    'min_split_gain': 0.024766,
    'subsample': 1,
    'is_unbalance': False,
    # NOTE(review): 'silent' is a legacy sklearn-wrapper flag; newer LightGBM
    # releases may no longer accept it -- confirm against the installed version.
    'silent': -1,
    'verbose': 1,
    'random_state': 1234,
    'metric': 'auc',
    # The thread count is given through 'n_jobs' only: 'nthread' is an alias of
    # the same LightGBM setting, so supplying both is redundant.
    'n_jobs': 4,
}

In [6]:
# Hyper-parameters shared by every XGBClassifier built in this notebook.
# The evaluation metric is deliberately absent here: each fit() call in this
# notebook requests AUC through its own eval_metric argument, and 'metric'
# (the LightGBM spelling) is not a parameter XGBoost recognises.
XGBOOST_PARAMS = {
    "learning_rate": 0.01,
    "n_estimators": 2000,
    "max_depth": 4,
    "min_child_weight": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": 'binary:logistic',
    "scale_pos_weight": 2.5,
    "reg_lambda": 1.2,
    "seed": 1234,
}

In [7]:
def eval_model(df, random_state=1234):
    """Fit LightGBM, XGBoost and TabNet on ``df`` and report their test AUC.

    Infinite values are converted to NaN and every NaN is then filled with 0.
    The rows are split 80/10/10 into train/validation/test. The validation
    split is handed to each model as an evaluation set; the test split is used
    only for the reported score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``TARGET`` column (used as the label) and an
        ``SK_ID_CURR`` column; every other column is used as a feature.
    random_state : int or None, default 1234
        Seed passed to both ``train_test_split`` calls so repeated runs score
        the models on the same rows. Pass ``None`` for a fresh random split.

    Returns
    -------
    dict
        Test-set ROC AUC keyed by ``"lightgbm"``, ``"xgboost"`` and
        ``"tabnet"``. The same scores are also printed.

    Raises
    ------
    KeyError
        If ``TARGET`` or ``SK_ID_CURR`` is missing from ``df``.
    """
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)
    X = df.drop(['TARGET', 'SK_ID_CURR'], axis=1).values
    y = df["TARGET"].values

    train_ratio = 0.8
    validation_ratio = 0.1
    test_ratio = 0.1

    # First carve off the training rows, then divide the remainder between
    # validation and test according to their relative ratios.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1 - train_ratio, random_state=random_state)
    X_val, X_test, y_val, y_test = train_test_split(
        X_test, y_test,
        test_size=test_ratio / (test_ratio + validation_ratio),
        random_state=random_state)

    eval_sets = [(X_train, y_train), (X_val, y_val)]
    scores = {}

    clf_lgbm = LGBMClassifier(**LIGHTGBM_PARAMS)
    clf_lgbm.fit(X_train,
        y_train,
        eval_set=eval_sets,
        eval_metric='auc',
        verbose=1,
        early_stopping_rounds=200)
    # Second predict_proba column is used as the positive-class score.
    scores["lightgbm"] = roc_auc_score(y_test, clf_lgbm.predict_proba(X_test)[:, 1])
    print("LightGBM AUC score is:", scores["lightgbm"])

    clf_xgb = XGBClassifier(**XGBOOST_PARAMS)
    clf_xgb.fit(X_train,
        y_train,
        eval_set=eval_sets,
        eval_metric='auc',
        verbose=1,
        early_stopping_rounds=200)
    scores["xgboost"] = roc_auc_score(y_test, clf_xgb.predict_proba(X_test)[:, 1])
    print("XGBoost AUC score is:", scores["xgboost"])

    clf_tabnet = TabNetClassifier(optimizer_fn=torch.optim.Adam)
    # Only 2 epochs when the CI environment variable is set, 25 otherwise.
    max_epochs = 25 if not os.getenv("CI", False) else 2
    clf_tabnet.fit(
        X_train=X_train, y_train=y_train,
        eval_set=eval_sets,
        eval_name=['train', 'valid'],
        eval_metric=['auc'],
        max_epochs=max_epochs, patience=20,
        batch_size=1024, virtual_batch_size=128,
        num_workers=0,
        weights=1,  # NOTE(review): presumably enables TabNet's class-balanced sampling -- confirm in pytorch-tabnet docs
        drop_last=False
    )
    scores["tabnet"] = roc_auc_score(y_test, clf_tabnet.predict_proba(X_test)[:, 1])
    print("TabNet AUC score is:", scores["tabnet"])

    return scores

# Application

In [8]:
# Load the base application table that every experiment starts from.
# NOTE(review): num_rows=None presumably means "no row cap" -- confirm in data_processing.
num_rows = None
df = application_train_test(num_rows)

Train samples: 307511


In [9]:
# Turn +/-inf into NaN first so that a single fillna(0) covers both cases.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

In [10]:
# Labels, then the feature matrix: every column except the label and the row id.
y = df["TARGET"].values
X = df.drop(columns=['TARGET', 'SK_ID_CURR']).values

In [31]:
train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.1
# Seed the shuffles (same value as the seed in the model parameter dicts) so the
# split, and therefore the AUCs below, can be reproduced on a re-run.
split_seed = 1234

# First carve off the training rows, then divide the remainder between
# validation and test according to their relative ratios.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 - train_ratio, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=split_seed,
)

In [32]:
# Fresh, unfitted LightGBM classifier configured from the shared LIGHTGBM_PARAMS dict.
clf_lgbm = LGBMClassifier(**LIGHTGBM_PARAMS)

In [None]:
# AUC is tracked on both the training and the validation split, with early
# stopping after 200 rounds.
lgbm_eval_sets = [(X_train, y_train), (X_val, y_val)]
clf_lgbm.fit(
    X_train,
    y_train,
    eval_set=lgbm_eval_sets,
    eval_metric='auc',
    verbose=1,
    early_stopping_rounds=200,
)

In [34]:
# Score the held-out test split; the second predict_proba column is used as the
# positive-class score.
y_pred_lgmb = clf_lgbm.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred_lgmb)

0.7576280833106248

In [35]:
# Fresh, unfitted XGBoost classifier configured from the shared XGBOOST_PARAMS dict.
clf_xgb = XGBClassifier(**XGBOOST_PARAMS)

In [None]:
# AUC is tracked on both the training and the validation split, with early
# stopping after 200 rounds.
xgb_eval_sets = [(X_train, y_train), (X_val, y_val)]
clf_xgb.fit(
    X_train,
    y_train,
    eval_set=xgb_eval_sets,
    eval_metric='auc',
    verbose=1,
    early_stopping_rounds=200,
)

In [37]:
# Score the held-out test split; the second predict_proba column is used as the
# positive-class score.
y_pred_xgb = clf_xgb.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred_xgb)

0.7571485844005286

In [38]:
# Fresh TabNet classifier; Adam is passed explicitly as the optimizer factory.
clf_tabnet = TabNetClassifier(optimizer_fn=torch.optim.Adam)

Device used : cpu


In [39]:
# Keep training short (2 epochs) when the CI environment variable is set; 25 otherwise.
max_epochs = 2 if os.getenv("CI", False) else 25

In [None]:
# Named evaluation splits; the dict's insertion order gives eval_name/eval_set.
tabnet_eval = {'train': (X_train, y_train), 'valid': (X_val, y_val)}
clf_tabnet.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=list(tabnet_eval.values()),
    eval_name=list(tabnet_eval.keys()),
    eval_metric=['auc'],
    max_epochs=max_epochs,
    patience=20,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

In [41]:
# Score the held-out test split; the second predict_proba column is used as the
# positive-class score.
y_pred_tabnet = clf_tabnet.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred_tabnet)

0.7364908788226617

# Credit card balance

In [2]:
num_rows = None
df = application_train_test(num_rows)

# Attach the credit-card features: df's SK_ID_CURR column is matched against
# the index of the credit-card frame, keeping every application row.
with timer("Process credit card balance"):
    cc_features = credit_card_balance(num_rows)
    print("Credit card balance df shape:", cc_features.shape)
    df = df.join(cc_features, on='SK_ID_CURR', how='left')
    # Drop the intermediate frame right away to release its memory.
    del cc_features
    gc.collect()

Train samples: 307511
Credit card balance df shape: (103558, 141)
Process credit card balance - done in 20s


In [3]:
# Turn +/-inf into NaN first so that a single fillna(0) covers both cases.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

In [4]:
# Labels, then the feature matrix: every column except the label and the row id.
y = df["TARGET"].values
X = df.drop(columns=['TARGET', 'SK_ID_CURR']).values

In [5]:
train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.1
# Seed the shuffles (same value as the seed in the model parameter dicts) so the
# split, and therefore the AUCs below, can be reproduced on a re-run.
split_seed = 1234

# First carve off the training rows, then divide the remainder between
# validation and test according to their relative ratios.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 - train_ratio, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=split_seed,
)

In [9]:
# Fresh, unfitted LightGBM classifier configured from the shared LIGHTGBM_PARAMS dict.
clf_lgbm = LGBMClassifier(**LIGHTGBM_PARAMS)

In [None]:
# AUC is tracked on both the training and the validation split, with early
# stopping after 200 rounds.
lgbm_eval_sets = [(X_train, y_train), (X_val, y_val)]
clf_lgbm.fit(
    X_train,
    y_train,
    eval_set=lgbm_eval_sets,
    eval_metric='auc',
    verbose=1,
    early_stopping_rounds=200,
)

In [11]:
# Score the held-out test split; the second predict_proba column is used as the
# positive-class score.
y_pred_lgmb = clf_lgbm.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred_lgmb)

0.7649790136960927

In [12]:
# Fresh, unfitted XGBoost classifier configured from the shared XGBOOST_PARAMS dict.
clf_xgb = XGBClassifier(**XGBOOST_PARAMS)

In [None]:
# AUC is tracked on both the training and the validation split, with early
# stopping after 200 rounds.
xgb_eval_sets = [(X_train, y_train), (X_val, y_val)]
clf_xgb.fit(
    X_train,
    y_train,
    eval_set=xgb_eval_sets,
    eval_metric='auc',
    verbose=1,
    early_stopping_rounds=200,
)

In [14]:
# Score the held-out test split; the second predict_proba column is used as the
# positive-class score.
y_pred_xgb = clf_xgb.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred_xgb)

0.7643793080006586

In [15]:
# Fresh TabNet classifier; Adam is passed explicitly as the optimizer factory.
clf_tabnet = TabNetClassifier(optimizer_fn=torch.optim.Adam)

Device used : cpu


  return torch._C._cuda_getDeviceCount() > 0


In [16]:
# Keep training short (2 epochs) when the CI environment variable is set; 25 otherwise.
max_epochs = 2 if os.getenv("CI", False) else 25

In [None]:
# Named evaluation splits; the dict's insertion order gives eval_name/eval_set.
tabnet_eval = {'train': (X_train, y_train), 'valid': (X_val, y_val)}
clf_tabnet.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=list(tabnet_eval.values()),
    eval_name=list(tabnet_eval.keys()),
    eval_metric=['auc'],
    max_epochs=max_epochs,
    patience=20,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

In [18]:
# Score the held-out test split; the second predict_proba column is used as the
# positive-class score.
y_pred_tabnet = clf_tabnet.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred_tabnet)

0.728675276184152

# All data

In [13]:
num_rows = None
df = application_train_test(num_rows)

# One entry per auxiliary table, in join order:
# (timer label, loader function, label printed in front of the frame's shape).
aux_tables = [
    ("Process bureau and bureau_balance", bureau_and_balance, "Bureau df shape:"),
    ("Process previous_applications", previous_applications, "Previous applications df shape:"),
    ("Process POS-CASH balance", pos_cash, "Pos-cash balance df shape:"),
    ("Process installments payments", installments_payments, "Installments payments df shape:"),
    ("Process credit card balance", credit_card_balance, "Credit card balance df shape:"),
]

for timer_label, load_table, shape_label in aux_tables:
    with timer(timer_label):
        aux = load_table(num_rows)
        print(shape_label, aux.shape)
        # df's SK_ID_CURR column is matched against the auxiliary frame's index;
        # the left join keeps every application row.
        df = df.join(aux, how='left', on='SK_ID_CURR')
        # Release the intermediate frame before loading the next table.
        del aux
        gc.collect()

Train samples: 307511
Bureau df shape: (305811, 116)
Process bureau and bureau_balance - done in 29s
Pos-cash balance df shape: (337252, 18)
Process POS-CASH balance - done in 13s
Installments payments df shape: (339587, 26)
Process installments payments - done in 41s
Credit card balance df shape: (103558, 141)
Process credit card balance - done in 19s


In [14]:
# Turn +/-inf into NaN first so that a single fillna(0) covers both cases.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

In [15]:
# Labels, then the feature matrix: every column except the label and the row id.
y = df["TARGET"].values
X = df.drop(columns=['TARGET', 'SK_ID_CURR']).values

In [16]:
train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.1
# Seed the shuffles (same value as the seed in the model parameter dicts) so the
# split, and therefore the AUCs below, can be reproduced on a re-run.
split_seed = 1234

# First carve off the training rows, then divide the remainder between
# validation and test according to their relative ratios.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 - train_ratio, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=split_seed,
)

In [17]:
# Fresh, unfitted LightGBM classifier configured from the shared LIGHTGBM_PARAMS dict.
clf_lgbm = LGBMClassifier(**LIGHTGBM_PARAMS)

In [None]:
# AUC is tracked on both the training and the validation split, with early
# stopping after 200 rounds.
lgbm_eval_sets = [(X_train, y_train), (X_val, y_val)]
clf_lgbm.fit(
    X_train,
    y_train,
    eval_set=lgbm_eval_sets,
    eval_metric='auc',
    verbose=1,
    early_stopping_rounds=200,
)

In [64]:
# Score the held-out test split; the second predict_proba column is used as the
# positive-class score.
y_pred_lgmb = clf_lgbm.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred_lgmb)

0.7833630731456633

In [20]:
# Fresh, unfitted XGBoost classifier configured from the shared XGBOOST_PARAMS dict.
clf_xgb = XGBClassifier(**XGBOOST_PARAMS)

In [None]:
# AUC is tracked on both the training and the validation split, with early
# stopping after 200 rounds.
xgb_eval_sets = [(X_train, y_train), (X_val, y_val)]
clf_xgb.fit(
    X_train,
    y_train,
    eval_set=xgb_eval_sets,
    eval_metric='auc',
    verbose=1,
    early_stopping_rounds=200,
)

In [65]:
# Score the held-out test split; the second predict_proba column is used as the
# positive-class score.
y_pred_xgb = clf_xgb.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred_xgb)

0.7817701318236424

In [23]:
# Fresh TabNet classifier; Adam is passed explicitly as the optimizer factory.
clf_tabnet = TabNetClassifier(optimizer_fn=torch.optim.Adam)

Device used : cpu


  return torch._C._cuda_getDeviceCount() > 0


In [24]:
# Keep training short (2 epochs) when the CI environment variable is set; 25 otherwise.
max_epochs = 2 if os.getenv("CI", False) else 25

In [None]:
# Named evaluation splits; the dict's insertion order gives eval_name/eval_set.
tabnet_eval = {'train': (X_train, y_train), 'valid': (X_val, y_val)}
clf_tabnet.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=list(tabnet_eval.values()),
    eval_name=list(tabnet_eval.keys()),
    eval_metric=['auc'],
    max_epochs=max_epochs,
    patience=20,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

In [66]:
# Score the held-out test split; the second predict_proba column is used as the
# positive-class score.
y_pred_tabnet = clf_tabnet.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred_tabnet)

0.7555494629759629