In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

import xgboost as xgb

from xgboost import XGBClassifier


import gc


from sklearn.preprocessing import MinMaxScaler, LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

from sklearn.feature_selection import VarianceThreshold



In [2]:
print('Importing data...')

data = pd.read_csv(r'data/application_train.csv')

test = pd.read_csv(r'data/application_test.csv')

prev = pd.read_csv(r'data/previous_application.csv')

buro = pd.read_csv(r'data/bureau.csv')

buro_balance = pd.read_csv(r'data/bureau_balance.csv')

lgbm_submission = pd.read_csv(r'data/sample_submission.csv')

Importing data...


In [3]:
#Separate target variable
y = data['TARGET']
del data['TARGET']

#Feature engineering
#One-hot encoding of categorical features in data and test sets

categorical_features = [col for col in data.columns if data[col].dtype == 'object']
one_hot_df = pd.concat([data,test])
one_hot_df = pd.get_dummies(one_hot_df, columns=categorical_features)


data = one_hot_df.iloc[:data.shape[0],:]
test = one_hot_df.iloc[data.shape[0]:,]

In [4]:
#Pre-processing buro_balance

print('Pre-processing buro_balance...')

buro_grouped_size = buro_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].size()

buro_grouped_max = buro_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].max()

buro_grouped_min = buro_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].min()

buro_grouped_mean = buro_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].mean()


buro_counts = buro_balance.groupby('SK_ID_BUREAU')['STATUS'].value_counts(normalize = False)
buro_counts_unstacked = buro_counts.unstack('STATUS')
buro_counts_unstacked.columns = ['STATUS_0', 'STATUS_1','STATUS_2','STATUS_3','STATUS_4','STATUS_5','STATUS_C','STATUS_X',]
buro_counts_unstacked['MONTHS_COUNT'] = buro_grouped_size
buro_counts_unstacked['MONTHS_MIN'] = buro_grouped_min
buro_counts_unstacked['MONTHS_MAX'] = buro_grouped_max
buro_counts_unstacked['MONTHS_MEAN'] = buro_grouped_mean



buro1 = buro.join(buro_counts_unstacked.fillna(0), how='left', on='SK_ID_BUREAU')

Pre-processing buro_balance...


In [5]:
buro.shape

(1716428, 17)

In [6]:
#Pre-processing previous_application

print('Pre-processing previous_application...')

#One-hot encoding of categorical features in previous application data set

prev_cat_features = [pcol for pcol in prev.columns if prev[pcol].dtype == 'object']

prev = pd.get_dummies(prev, columns=prev_cat_features)

avg_prev = prev.groupby('SK_ID_CURR').mean()
avg_prev2 = prev.groupby('SK_ID_CURR').min()
avg_prev3 = prev.groupby('SK_ID_CURR').max()

cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()

avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']

del avg_prev['SK_ID_PREV'], avg_prev2['SK_ID_PREV'], avg_prev3['SK_ID_PREV']

Pre-processing previous_application...


In [7]:
#Pre-processing buro

print('Pre-processing buro...')

#One-hot encoding of categorical features in buro data set

buro_cat_features = [bcol for bcol in buro1.columns if buro1[bcol].dtype == 'object']

buro1 = pd.get_dummies(buro, columns=buro_cat_features)

avg_buro = buro1.groupby('SK_ID_CURR').mean()
avg_buro2 = buro1.groupby('SK_ID_CURR').min()
avg_buro3 = buro1.groupby('SK_ID_CURR').max()

avg_buro['buro_count'] = buro1[['SK_ID_BUREAU', 'SK_ID_CURR']].groupby('SK_ID_CURR').count()['SK_ID_BUREAU']

del avg_buro['SK_ID_BUREAU'], avg_buro2['SK_ID_BUREAU'], avg_buro3['SK_ID_BUREAU']

Pre-processing buro...


In [8]:
#Pre-processing POS_CASH

print('Pre-processing POS_CASH...')
POS_CASH  = pd.read_csv(r'data/POS_CASH_balance.csv')

le = LabelEncoder()

POS_CASH['NAME_CONTRACT_STATUS'] = le.fit_transform(POS_CASH['NAME_CONTRACT_STATUS'].astype(str))

nunique_status = POS_CASH[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').nunique()

nunique_status2 = POS_CASH[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').max()

nunique_status3 = POS_CASH[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').min()

POS_CASH['NUNIQUE_STATUS'] = nunique_status['NAME_CONTRACT_STATUS']
POS_CASH['NUNIQUE_STATUS2'] = nunique_status2['NAME_CONTRACT_STATUS']
POS_CASH['NUNIQUE_STATUS3'] = nunique_status3['NAME_CONTRACT_STATUS']

POS_CASH.drop(['SK_ID_PREV', 'NAME_CONTRACT_STATUS'], axis=1, inplace=True)

Pre-processing POS_CASH...


In [9]:
#Pre-processing credit_card

print('Pre-processing credit_card...')

credit_card  = pd.read_csv(r'data/credit_card_balance.csv')

credit_card['NAME_CONTRACT_STATUS'] = le.fit_transform(credit_card['NAME_CONTRACT_STATUS'].astype(str))

nunique_status = credit_card[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').nunique()

nunique_status2 = credit_card[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').max()

nunique_status3 = credit_card[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').min()

nunique_status4 = credit_card[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').mean()

credit_card['NUNIQUE_STATUS'] = nunique_status['NAME_CONTRACT_STATUS']

credit_card['NUNIQUE_STATUS2'] = nunique_status2['NAME_CONTRACT_STATUS']

credit_card['NUNIQUE_STATUS3'] = nunique_status3['NAME_CONTRACT_STATUS']

credit_card['NUNIQUE_STATUS4'] = nunique_status4['NAME_CONTRACT_STATUS']

credit_card.drop(['SK_ID_PREV', 'NAME_CONTRACT_STATUS'], axis=1, inplace=True)

Pre-processing credit_card...


In [10]:
#Pre-processing payments

print('Pre-processing payments...')

payments = pd.read_csv(r'data/installments_payments.csv')

payments['PAYMENT_PERC'] = payments['AMT_PAYMENT'] / payments['AMT_INSTALMENT']
payments['PAYMENT_DIFF'] = payments['AMT_INSTALMENT'] - payments['AMT_PAYMENT']

# Days past due and days before due (no negative values)
payments['DPD'] = payments['DAYS_ENTRY_PAYMENT'] - payments['DAYS_INSTALMENT']
payments['DBD'] = payments['DAYS_INSTALMENT'] - payments['DAYS_ENTRY_PAYMENT']
payments['DPD'] = payments['DPD'].apply(lambda x: x if x > 0 else 0)
payments['DBD'] = payments['DBD'].apply(lambda x: x if x > 0 else 0)

avg_payments = payments.groupby('SK_ID_CURR').mean()

avg_payments2 = payments.groupby('SK_ID_CURR').max()

avg_payments3 = payments.groupby('SK_ID_CURR').min()

avg_payments4 = payments.groupby('SK_ID_CURR').sum()

del avg_payments['SK_ID_PREV']

Pre-processing payments...


In [11]:
#Join data bases

print('Joining databases...')

data = data.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')

test = test.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')



data = data.merge(right=avg_buro.reset_index(), how='left', on='SK_ID_CURR')

test = test.merge(right=avg_buro.reset_index(), how='left', on='SK_ID_CURR')


data = data.merge(right=avg_buro2.reset_index(), how='left', on='SK_ID_CURR')

test = test.merge(right=avg_buro2.reset_index(), how='left', on='SK_ID_CURR')


data = data.merge(right=avg_buro3.reset_index(), how='left', on='SK_ID_CURR')

test = test.merge(right=avg_buro3.reset_index(), how='left', on='SK_ID_CURR')


data = data.merge(POS_CASH.groupby('SK_ID_CURR').mean().reset_index(), how='left', on='SK_ID_CURR')

test = test.merge(POS_CASH.groupby('SK_ID_CURR').mean().reset_index(), how='left', on='SK_ID_CURR')



data = data.merge(credit_card.groupby('SK_ID_CURR').mean().reset_index(), how='left', on='SK_ID_CURR')

test = test.merge(credit_card.groupby('SK_ID_CURR').mean().reset_index(), how='left', on='SK_ID_CURR')



data = data.merge(right=avg_payments.reset_index(), how='left', on='SK_ID_CURR')

test = test.merge(right=avg_payments.reset_index(), how='left', on='SK_ID_CURR')



data = data.merge(right=avg_payments2.reset_index(), how='left', on='SK_ID_CURR')

test = test.merge(right=avg_payments2.reset_index(), how='left', on='SK_ID_CURR')



data = data.merge(right=avg_payments3.reset_index(), how='left', on='SK_ID_CURR')

test = test.merge(right=avg_payments3.reset_index(), how='left', on='SK_ID_CURR')


data = data.merge(right=avg_payments4.reset_index(), how='left', on='SK_ID_CURR')

test = test.merge(right=avg_payments4.reset_index(), how='left', on='SK_ID_CURR')

Joining databases...


In [12]:
print('Some feature engineering...')

data['DAYS_EMPLOYED_PER_AGE'] = data['DAYS_EMPLOYED']/data['DAYS_BIRTH']
data['AMT_CREDIT_PER_INCOME'] = data['AMT_INCOME_TOTAL']/data['AMT_CREDIT_x']
data['INCOME_PER_PERSON'] = data['AMT_INCOME_TOTAL'] / data['CNT_FAM_MEMBERS']
data['ANNUITY_INCOME_PERC'] = data['AMT_ANNUITY'] / data['AMT_INCOME_TOTAL']
data['PAYMENT_RATE'] = data['AMT_ANNUITY'] / data['AMT_CREDIT_x']

test['DAYS_EMPLOYED_PER_AGE'] = test['DAYS_EMPLOYED']/test['DAYS_BIRTH']
test['AMT_CREDIT_PER_INCOME'] = test['AMT_INCOME_TOTAL']/test['AMT_CREDIT_x']
test['INCOME_PER_PERSON'] = test['AMT_INCOME_TOTAL'] / test['CNT_FAM_MEMBERS']
test['ANNUITY_INCOME_PERC'] = test['AMT_ANNUITY'] / test['AMT_INCOME_TOTAL']
test['PAYMENT_RATE'] = test['AMT_ANNUITY'] / test['AMT_CREDIT_x']

#data['APP_CREDIT_PERC'] = data['AMT_APPLICATION']/data['AMT_ANNUITY_x']

data['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
data['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
data['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
data['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
data['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
data['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)

test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
test['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
test['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
test['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
test['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
test['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)

Some feature engineering...


In [13]:
#Remove features with many missing values

print('Removing features with more than 80% missing...')

test = test[test.columns[data.isnull().mean() < 0.85]]

data = data[data.columns[data.isnull().mean() < 0.85]]

Removing features with more than 80% missing...


In [14]:
excluded_feats = ['SK_ID_CURR']

features = [f_ for f_ in data.columns if f_ not in excluded_feats]

In [22]:
from sklearn.utils.class_weight import compute_class_weight

weights = compute_class_weight('balanced', np.unique(y), y)

In [23]:
weights

array([ 0.54390914,  6.19357503])

In [15]:
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import KFold

In [25]:
from lightgbm import LGBMClassifier

folds = KFold(n_splits=5, shuffle = True)
oof_preds_lgbm = np.zeros(data.shape[0])
sub_preds_lgbm = np.zeros(test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(data)):
    trn_x, trn_y = data[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = data[features].iloc[val_idx], y.iloc[val_idx]
    
    clf = LGBMClassifier(
            is_unbalance = True, 
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=30,
            colsample_bytree=.8,
            subsample=.9,
            max_depth=8,
            reg_alpha=.1,
            reg_lambda=.1,
            min_split_gain=.01,
            min_child_weight=2,
            silent=-1,
            verbose=-1,
        )
    
    clf.fit(trn_x, trn_y, eval_set =  [(trn_x, trn_y), (val_x, val_y)], eval_metric='auc', verbose=100, early_stopping_rounds=100)
    
    oof_preds_lgbm[val_idx] = clf.predict_proba(val_x)[:, 1]

    sub_preds_lgbm += clf.predict_proba(test[features])[:, 1] / folds.n_splits

    

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds_lgbm[val_idx])))

    del clf, trn_x, trn_y, val_x, val_y

    gc.collect()
    
print('Full AUC score %.6f using LGBM' % roc_auc_score(y, oof_preds_lgbm))   

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.772096	valid_1's auc: 0.758585
[200]	training's auc: 0.794025	valid_1's auc: 0.773256
[300]	training's auc: 0.808033	valid_1's auc: 0.780151
[400]	training's auc: 0.818558	valid_1's auc: 0.783209
[500]	training's auc: 0.827663	valid_1's auc: 0.7851
[600]	training's auc: 0.835681	valid_1's auc: 0.786117
[700]	training's auc: 0.843287	valid_1's auc: 0.786719
[800]	training's auc: 0.850339	valid_1's auc: 0.786943
[900]	training's auc: 0.856819	valid_1's auc: 0.787268
[1000]	training's auc: 0.863	valid_1's auc: 0.787491
[1100]	training's auc: 0.868782	valid_1's auc: 0.787438
Early stopping, best iteration is:
[1002]	training's auc: 0.86311	valid_1's auc: 0.787493
Fold  1 AUC : 0.787493
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.772491	valid_1's auc: 0.753039
[200]	training's auc: 0.795304	valid_1's auc: 0.768511
[300]	training's auc: 0.809172	valid_1's auc: 0.7

In [26]:
print('Full AUC score %.6f using bagging XGB and LGBM' % roc_auc_score(y, oof_preds_lgbm))   

Full AUC score 0.786329 using bagging XGB and LGBM


In [27]:
test['TARGET'] = sub_preds_lgbm
test['TARGET'].max()

0.95547940510565998

In [28]:
test[['SK_ID_CURR','TARGET']].to_csv('lgb_submission_esi.csv', index=False, float_format='%.8f')

In [None]:
folds = KFold(n_splits=4, shuffle = True)
oof_preds = np.zeros(data.shape[0])
sub_preds = np.zeros(test.shape[0])
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(data)):
    trn_x, trn_y = data[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = data[features].iloc[val_idx], y.iloc[val_idx]
    
    clf = XGBClassifier(
         objective = 'binary:logistic', 

        booster = "gbtree",

        eval_metric = 'auc', 

        nthread = 4,

        eta = 0.05,

        gamma = 0,

        max_depth = 6, 

        subsample = 0.7, 

        colsample_bytree = 0.7, 

        colsample_bylevel = 0.675,

        min_child_weight = 22,

        alpha = 0,

        random_state = 42, 

        nrounds = 3000 , 
        )
    
    clf.fit(trn_x, trn_y, eval_set =  [(trn_x, trn_y), (val_x, val_y)], verbose=10, early_stopping_rounds=30)
    
    oof_preds[val_idx] = clf.predict_proba(val_x)[:, 1]

    sub_preds += clf.predict_proba(test[features])[:, 1] / folds.n_splits

    

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))

    del clf, trn_x, trn_y, val_x, val_y

    gc.collect()
    
print('Full AUC score %.6f using XGB' % roc_auc_score(y, oof_preds))   


In [None]:
from keras.models import Sequential, Model
from keras.layers import Input, Flatten, Convolution1D, Dense, MaxPool1D, BatchNormalization, Embedding


loan = Input(shape=(data.shape[1],))

embed = Embedding(input_dim = data.shape[0], 
                    output_dim = 256)(loan)

conv = Convolution1D(96,5,
                       strides = 2, 
                       activation = 'relu')(embed)

pool = MaxPool1D(3,
                    strides = 2,
                   )(conv)

flatten = Flatten()(pool)

fc1 = Dense(512, 
               activation = 'relu'
               )(flatten)

fc2 =  Dense(1,
                activation = 'sigmoid')(fc1)

model = Model(loan, fc2)

In [None]:
model.summary()

In [None]:
model.compile(loss = 'binary_crossentropy', 
             optimizer = 'adam', 
              metrics = ['accuracy']) 

In [None]:
model.fit(data,y,
          batch_size=128, 
          validation_split=0.2, 
         epochs = 10 )

In [None]:
data_dl = np.zeros((data.shape[0],data.shape[1],1))
data_dl[:,:,0] = data