In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

In [None]:
# Read the raw CSV and drop the ID column, which is not used anywhere below.
data = (
    pd.read_csv('../input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv')
    .drop(columns=['ID'])
)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would only append a stray "None".
data.info()
# Missing values per column, then the (rows, columns) shape.
display(data.isnull().sum(), data.shape)

In [None]:
# Summary statistics for the SEX..MARRIAGE column block together with PAY_0.
profile_cols = pd.concat([data.loc[:, 'SEX':'MARRIAGE'], data[['PAY_0']]], axis=1)
display(profile_cols.describe())

In [None]:
# sns.distplot is deprecated; histplot is its replacement. stat='density'
# keeps the y-axis on the density scale that distplot used when kde=True.
fig, ax = plt.subplots(figsize=(14, 6))
sns.histplot(data.LIMIT_BAL, kde=True, stat='density', bins=220, ax=ax)
ax.set_title('Distribution of LIMIT_BAL')
plt.show()

In [None]:
# Pairwise correlations across the BILL_AMT1..PAY_AMT6 column block,
# drawn as an annotated heatmap.
amount_corr = data.loc[:, 'BILL_AMT1':'PAY_AMT6'].corr()
plt.figure(figsize=(20, 16))
sns.heatmap(amount_corr, annot=True)

SEX: Gender (1=male, 2=female)

EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)

MARRIAGE: Marital status (1=married, 2=single, 3=others)

PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above)

In [None]:
# List the distinct codes that actually occur in each coded column so they
# can be compared with the data dictionary above.
for coded_column in ('SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0'):
    display(sorted(data[coded_column].unique()))

In [None]:
# Code 0 is not listed in the data dictionary for EDUCATION or MARRIAGE:
# show how many rows carry it and a sample of those rows.
for column, code in (('EDUCATION', 0), ('MARRIAGE', 0)):
    matches = data[data[column] == code]
    display(matches[column].count())
    display(matches.head())

# PAY_0: frequency of every status code, then a sample of the -2 rows.
display(data.PAY_0.value_counts(sort=False))
display(data.loc[data.PAY_0.eq(-2)].head())

In [None]:
# Fold the undocumented MARRIAGE code 0 into 3 ("others"), and the
# undocumented/unknown EDUCATION codes 0, 5 and 6 into 4 ("others").
data.loc[data.MARRIAGE.eq(0), 'MARRIAGE'] = 3
data.loc[data.EDUCATION.isin([0, 5, 6]), 'EDUCATION'] = 4

In [None]:
# Rename first: the original order sliced from 'PAY_0' and then renamed that
# column away, so re-running the cell raised a KeyError. rename() silently
# ignores keys that are already gone, which makes this order re-runnable.
data.rename(columns={'PAY_0': 'PAY_1', 'default.payment.next.month': 'default'}, inplace=True)

# Collapse the negative status codes (-2 and -1) into 0 for every repayment
# status column, addressed by name instead of by a positional column slice.
pay_columns = [f'PAY_{month}' for month in range(1, 7)]
data[pay_columns] = data[pay_columns].replace([-2, -1], 0)

In [None]:
# Re-encode SEX from {1, 2} to {0, 1}.
# The guard makes the cell safe to re-run: without it a second execution
# would map the already-recoded 1s to 0 and leave every row at 0.
if data.SEX.eq(2).any():
    data['SEX'] = data.SEX.replace({1: 0, 2: 1})

In [None]:
# Summary statistics of AGE (binned into categories in the next step).
display(data['AGE'].describe())

In [None]:
# Bin AGE into three equal-frequency groups labelled 1..3.
# assign() overwrites an existing 'age_category' column, so the cell can be
# re-run; join() raised on the overlapping column name the second time.
data = data.assign(age_category=pd.qcut(data.AGE, 3, labels=[1, 2, 3]))

In [None]:
def _default_counts(column):
    """Count rows per (column value, default flag), one output column per default value."""
    return data.groupby([column, 'default']).size().unstack()


sex_def, edu_def, mar_def, age_def = map(
    _default_counts, ['SEX', 'EDUCATION', 'MARRIAGE', 'age_category']
)

In [None]:
# One stacked bar chart per grouping: bar height is the group size, split by
# the default flag.
sex_def.plot.bar(stacked=True)
edu_def.plot.bar(stacked=True)
mar_def.plot.bar(stacked=True)
age_def.plot.bar(stacked=True)

In [None]:
from sklearn.model_selection import train_test_split

# Target is the 'default' flag; features are every other column except raw AGE
# (the binned age_category column stays in).
y = data['default']
X = data.drop(columns=['default', 'AGE'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Shapes of the train and test feature frames before encoding.
display(*(part.shape for part in (X_train, X_test)))

In [None]:
# Columns that are one-hot encoded below: the demographic codes, the six
# repayment status columns and the binned age.
cat_columns = [
    'SEX', 'EDUCATION', 'MARRIAGE',
    *[f'PAY_{month}' for month in range(1, 7)],
    'age_category',
]

In [None]:
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore': a rare level that ends up only in the test split is
# encoded as all zeros instead of making transform() raise.
oht = OneHotEncoder(handle_unknown='ignore')


def _one_hot(frame, encoded):
    """Return `frame` with `cat_columns` replaced by the one-hot matrix `encoded`.

    `encoded` is the sparse output of `oht` for `frame[cat_columns]`. The dummy
    columns are named '<column>_<level>' from the fitted categories, so the
    result has only string column names, and its index is reset to 0..n-1.
    """
    dummy_names = [f'{column}_{level}'
                   for column, levels in zip(cat_columns, oht.categories_)
                   for level in levels]
    dummies = pd.DataFrame(encoded.toarray(), columns=dummy_names, index=frame.index)
    return pd.concat([frame.drop(columns=cat_columns), dummies], axis=1).reset_index(drop=True)


# Fit the encoder on the training split only, then reuse it for the test split.
X_train = _one_hot(X_train, oht.fit_transform(X_train[cat_columns]))
X_test = _one_hot(X_test, oht.transform(X_test[cat_columns]))

In [None]:
# Shapes of the train and test feature frames after one-hot encoding.
display(*(part.shape for part in (X_train, X_test)))

In [None]:
from xgboost import XGBClassifier

# Baseline: XGBoost classifier with library-default hyperparameters.
baseline_kwargs = {'use_label_encoder': False, 'eval_metric': 'auc'}
cl = XGBClassifier(**baseline_kwargs).fit(X_train, y_train)
cl

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC measures ranking quality, so it must be fed scores, not hard labels:
# predict() returns 0/1 classes, which reduces the curve to a single
# threshold. Use the predicted probability of the positive class instead.
roc_auc_score(y_test, cl.predict_proba(X_test)[:, 1])

In [None]:
from datetime import datetime

def timer(start_time=None):
    """Two-step stopwatch.

    Called with no (or a falsy) argument, returns the current datetime to be
    used as the start mark. Called with a start mark, prints the time elapsed
    since then as hours / minutes / seconds and returns None.
    """
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    thour, remainder = divmod(elapsed_seconds, 3600)
    tmin, tsec = divmod(remainder, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [None]:
# Shell out to nvidia-smi to show the GPU status of this machine; the
# hyperparameter search below requests tree_method 'gpu_hist'.
!nvidia-smi

In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

# Base estimator for the search. `silent` was removed from XGBoost in favour
# of `verbosity`, and `nthread` is the deprecated alias of `n_jobs`.
cl = XGBClassifier(objective='binary:logistic',
                   verbosity=0, use_label_encoder=False, eval_metric='auc', n_jobs=-1)

# Candidate values sampled by the randomized search.
params = {
        'tree_method': ['gpu_hist'],
        'n_estimators': [100, 200, 350, 500, 600, 700, 1000],
        'min_child_weight': [1, 3, 5, 7, 10],
        'gamma': [0.3, 0.5, 1, 1.5, 2, 5],
        'subsample': [0.3, 0.6, 0.8, 1.0],
        'colsample_bytree': [0.3, 0.5, 0.6, 0.8, 1.0],
        'max_depth': [2, 3, 4, 5],
        'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5],
        }


folds = 5
param_comb = 5

# Seed both the fold shuffling and the parameter sampling so that a
# Restart & Run All reproduces the same search.
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

# Pass the splitter object itself: skf.split(...) is a one-shot generator, so
# a second fit() on the same search object would find no folds left.
random_search = RandomizedSearchCV(cl, param_distributions=params, n_iter=param_comb,
                                   scoring='roc_auc', n_jobs=-1, cv=skf,
                                   verbose=3, random_state=42)

start_time = timer(None)
random_search.fit(X_train, y_train)
timer(start_time)

In [None]:
# Show the estimator with the best cross-validated roc_auc found by the search above.
random_search.best_estimator_

In [None]:
# Hyperparameters copied from one run of the randomized search. Only the
# settings that matter are kept: the pasted repr also carried the removed
# `silent` flag, a conflicting n_jobs=2 / nthread=-1 pair, and repr-only
# artefacts (interaction_constraints='', monotone_constraints='()',
# validate_parameters=1) that merely restate library defaults.
best_cl = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    use_label_encoder=False,
    tree_method='gpu_hist',
    verbosity=0,
    base_score=0.5,        # kept explicit so the starting score stays as pasted
    n_estimators=350,
    max_depth=4,
    learning_rate=0.2,
    min_child_weight=10,
    gamma=5,
    subsample=0.8,
    colsample_bytree=0.5,
    random_state=0,
)

best_cl.fit(X_train, y_train)

In [None]:
# Score with the positive-class probability: feeding hard 0/1 predictions to
# roc_auc_score evaluates a single threshold rather than the ranking.
roc_auc_score(y_test, best_cl.predict_proba(X_test)[:, 1])

In [None]:
import xgboost as xgb

# Hold out part of the training data for early stopping. Stopping on the test
# set (as before) picks the best iteration using the very data the final score
# is reported on, which biases that score upwards.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

dtrain = xgb.DMatrix(X_fit, y_fit)
dvalid = xgb.DMatrix(X_valid, y_valid)
dtest = xgb.DMatrix(X_test, y_test)

# xgb.train applies early stopping to the LAST entry of the watchlist.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# Native-API parameters. 'n_estimators' belongs to the sklearn wrapper and is
# not read by xgb.train (the round count is the positional 2000 below), and
# 'silent' has been replaced by 'verbosity', so both are dropped.
params = {
    'objective': 'binary:logistic',
    'max_depth': 4,
    'eval_metric': 'auc',
    'colsample_bytree': 0.5,
    'subsample': 0.8,
    'min_child_weight': 10,
    'gamma': 5,
    'eta': 0.15,
    'verbosity': 0
}

xgb_model = xgb.train(params, dtrain, 2000, watchlist, early_stopping_rounds=300,  maximize=True, verbose_eval=100)

In [None]:
# best_iteration is a 0-based index, so the best model consists of
# best_iteration + 1 boosting rounds; passing it straight to ntree_limit
# dropped the best round. iteration_range is the replacement for the removed
# ntree_limit argument (available from XGBoost 1.4).
roc_auc_score(y_test, xgb_model.predict(
                        xgb.DMatrix(X_test),
                        iteration_range=(0, xgb_model.best_iteration + 1)))