## Pos-cash balance time series feature extraction
Train GRU network on pos-cash balance time series data. Save prediction to be used as features in final training.

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
import gc

import os
print(os.listdir("../input"))
    
gc.enable()

['download_command.txt', 'sample_submission.csv.zip', 'installments_payments.csv.zip', 'application_test.csv.zip', 'HomeCredit_columns_description.csv', 'previous_application.csv.zip', 'bureau_balance.csv.zip', 'POS_CASH_balance.csv.zip', 'credit_card_balance.csv.zip', 'application_train.csv.zip', 'bureau.csv.zip']


Read pos-cash balance and create features.

In [2]:
pos = pd.read_csv('../input/POS_CASH_balance.csv.zip')
pos = pd.concat([pos, pd.get_dummies(pos['NAME_CONTRACT_STATUS'], prefix='NAME_CONTRACT_STATUS')], axis=1)
pos['CNT_INSTALMENT']/=10
pos['CNT_INSTALMENT_FUTURE']/=10
del pos['NAME_CONTRACT_STATUS']

Read target from main table.

In [4]:
data_app = pd.read_csv('../input/application_train.csv.zip',usecols=['SK_ID_CURR','TARGET'])
data_test = pd.read_csv('../input/application_test.csv.zip',usecols=['SK_ID_CURR'])
data_app.shape, data_test.shape

((307511, 2), (48744, 1))

In [5]:
trn_id = data_app['SK_ID_CURR'].loc[data_app.SK_ID_CURR.isin(pos.SK_ID_CURR)]
test_id = data_test['SK_ID_CURR'].loc[data_test['SK_ID_CURR'].isin(pos.SK_ID_CURR)]
trn_id.shape, test_id.shape

((289444,), (47808,))

Split train and test set. Group by ID and month to create time series.

In [6]:
pos_trn = pos.loc[pos.SK_ID_CURR.isin(trn_id)]
pos_test = pos.loc[pos.SK_ID_CURR.isin(test_id)]
num_aggregations = {
    'SK_ID_PREV': ['count'],
    'CNT_INSTALMENT': ['sum', 'max', 'mean'],
    'CNT_INSTALMENT_FUTURE': ['sum', 'max', 'mean'],
    'NAME_CONTRACT_STATUS_Approved': ['sum'],
    'NAME_CONTRACT_STATUS_Canceled': ['sum'],
    'NAME_CONTRACT_STATUS_Completed': ['sum'],
    'NAME_CONTRACT_STATUS_Demand': ['sum'],
    'NAME_CONTRACT_STATUS_Returned to the store': ['sum'],
    'NAME_CONTRACT_STATUS_Signed': ['sum'],
    'NAME_CONTRACT_STATUS_XNA': ['sum'],
    'SK_DPD': ['sum', 'mean'],
    'SK_DPD_DEF': ['sum', 'mean']
}
pos_trn = pos_trn.groupby(['SK_ID_CURR','MONTHS_BALANCE']).agg(num_aggregations)
pos_test = pos_test.groupby(['SK_ID_CURR','MONTHS_BALANCE']).agg(num_aggregations)
pos_trn.columns = pd.Index([e[0] + "_" + e[1].upper() for e in pos_trn.columns.tolist()])
pos_test.columns = pd.Index([e[0] + "_" + e[1].upper() for e in pos_test.columns.tolist()])
pos_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SK_ID_PREV_COUNT,CNT_INSTALMENT_SUM,CNT_INSTALMENT_MAX,CNT_INSTALMENT_MEAN,CNT_INSTALMENT_FUTURE_SUM,CNT_INSTALMENT_FUTURE_MAX,CNT_INSTALMENT_FUTURE_MEAN,NAME_CONTRACT_STATUS_Approved_SUM,NAME_CONTRACT_STATUS_Canceled_SUM,NAME_CONTRACT_STATUS_Completed_SUM,NAME_CONTRACT_STATUS_Demand_SUM,NAME_CONTRACT_STATUS_Returned to the store_SUM,NAME_CONTRACT_STATUS_Signed_SUM,NAME_CONTRACT_STATUS_XNA_SUM,SK_DPD_SUM,SK_DPD_MEAN,SK_DPD_DEF_SUM,SK_DPD_DEF_MEAN
SK_ID_CURR,MONTHS_BALANCE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
100001,-96,1,0.4,0.4,0.4,0.2,0.2,0.2,0,0,0,0,0,0,0,0,0.0,0,0.0
100001,-95,1,0.4,0.4,0.4,0.1,0.1,0.1,0,0,0,0,0,0,0,7,7.0,7,7.0
100001,-94,1,0.4,0.4,0.4,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0,0.0
100001,-93,1,0.4,0.4,0.4,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0.0,0,0.0
100001,-57,1,0.4,0.4,0.4,0.4,0.4,0.4,0,0,0,0,0,0,0,0,0.0,0,0.0


Convert dataframe to 3D array (n_sample * n_time_step * n_features) for GRU network training.

In [7]:
train_x = pos_trn.to_panel().to_xarray().values
train_x = train_x.swapaxes(0,1).swapaxes(1,2)
test_x = pos_test.to_panel().to_xarray().values
test_x = test_x.swapaxes(0,1).swapaxes(1,2)
train_x[np.isnan(train_x)]=-9
test_x[np.isnan(test_x)]=-9
train_y = data_app['TARGET'].loc[data_app.SK_ID_CURR.isin(trn_id)]
train_x.shape, test_x.shape, train_y.shape

Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  """Entry point for launching an IPython kernel.
Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  This is separate from the ipykernel package so we can avoid doing imports until


((289444, 96, 18), (47808, 96, 18), (289444,))

Define GRU model. Use callback to evaluate auc metric.

In [8]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, GRU
from keras.regularizers import l2
from keras.optimizers import RMSprop, Adam

def build_model(time_step, n_features):
    model = Sequential()
    model.add(GRU(8, input_shape=(time_step, n_features))) #unit: #of neurons in each LSTM cell? input_shape=(time_step, n_features)
    model.add(Dense(1,activation='sigmoid'))
    return model

from keras.callbacks import Callback
from keras.callbacks import EarlyStopping
import logging

class IntervalEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == (self.interval-1):
            y_pred = self.model.predict(self.X_val, verbose=0)[:,0]
            score = roc_auc_score(self.y_val, y_pred)
            print('roc score',score)

Using TensorFlow backend.


Training...

In [9]:
# Run a 5 fold
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
oof_preds = np.zeros(train_x.shape[0])
sub_preds = np.zeros(test_x.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_x, train_y)):
    trn_x, val_x = train_x[trn_idx], train_x[val_idx]
    trn_y, val_y = train_y.values[trn_idx], train_y.values[val_idx]
    ival = IntervalEvaluation(validation_data=(val_x, val_y), interval=5)
    
    model = build_model(trn_x.shape[1],trn_x.shape[2])
    model.compile(loss='binary_crossentropy', optimizer=Adam(decay=0.0005))
    model.fit(trn_x, trn_y,
              validation_data= [val_x, val_y],
              epochs=20, batch_size=5000, 
              class_weight = {0:1,1:10},
              callbacks=[ival], verbose=5)
    
    oof_preds[val_idx] = model.predict(val_x)[:,0]
    sub_preds += model.predict(test_x)[:,0] / folds.n_splits
    
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
  
    del model, trn_x, trn_y, val_x, val_y
    gc.collect()

Train on 231554 samples, validate on 57890 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
roc score 0.54927356333507
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
roc score 0.5629433833075327
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
roc score 0.5693661412483946
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
roc score 0.5763580726270465
Fold  1 AUC : 0.576358
Train on 231555 samples, validate on 57889 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
roc score 0.5338057600780024
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
roc score 0.5566578609576867
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
roc score 0.5793328456063923
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
roc score 0.599449726471827
Fold  2 AUC : 0.599450
Train on 231555 samples, validate on 57889 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
roc score 0.5642138293761063
Epoch 6/20
Epoch 7/20
Epoc

Save model prediction to disk.

In [10]:
pos_score_train = pd.DataFrame({'pos_score':oof_preds}, index=trn_id)
pos_score_test = pd.DataFrame({'pos_score':sub_preds}, index=test_id)             
pos_score_train.to_csv('../output/pos_score_train.csv')
pos_score_test.to_csv('../output/pos_score_test.csv')