In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import gc

In [57]:
# load data
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

In [58]:
def show_all_pandas(df, nrow=None, ncol=None):
    with pd.option_context('display.max_rows', nrow, 'display.max_columns', ncol):
        display(df)

In [59]:
train_df.fillna('missing', inplace=True)

In [60]:
test_df.fillna('missing', inplace=True)

In [61]:
train_df['DisbursalDate'] = pd.to_datetime(train_df['DisbursalDate'], dayfirst=True)

In [62]:
test_df['DisbursalDate'] = pd.to_datetime(test_df['DisbursalDate'], dayfirst=True)

In [63]:
train_df['Date.of.Birth'] = pd.to_datetime(train_df['Date.of.Birth'], dayfirst=True)
train_df['Date.of.Birth'] = train_df['Date.of.Birth'].apply(lambda x: x.replace(year=x.year-100) if x.year > 2000 else x)

In [64]:
test_df['Date.of.Birth'] = pd.to_datetime(test_df['Date.of.Birth'], dayfirst=True)
test_df['Date.of.Birth'] = test_df['Date.of.Birth'].apply(lambda x: x.replace(year=x.year-100) if x.year > 2000 else x)

In [65]:
train_df['cus_age'] = np.floor((train_df['DisbursalDate'] - train_df['Date.of.Birth']).dt.days / 365.25).astype(np.int8)
test_df['cus_age'] = np.floor((test_df['DisbursalDate'] - test_df['Date.of.Birth']).dt.days / 365.25).astype(np.int8)

In [66]:
train_df.loc[(train_df['PERFORM_CNS.SCORE.DESCRIPTION'].str[0:3] == 'Not'), 'PERFORM_CNS.SCORE.DESCRIPTION'] = "Not Scored"
test_df.loc[(test_df['PERFORM_CNS.SCORE.DESCRIPTION'].str[0:3] == 'Not'), 'PERFORM_CNS.SCORE.DESCRIPTION'] = "Not Scored"

In [67]:
train_df['PERFORM_CNS.SCORE.DESCRIPTION'].unique()

array(['No Bureau History Available', 'I-Medium Risk', 'L-Very High Risk',
       'A-Very Low Risk', 'Not Scored', 'D-Very Low Risk',
       'M-Very High Risk', 'B-Very Low Risk', 'C-Very Low Risk',
       'E-Low Risk', 'H-Medium Risk', 'F-Low Risk', 'K-High Risk',
       'G-Low Risk', 'J-High Risk'], dtype=object)

In [68]:
bue_map = {
            'Not Scored': 0,
            'No Bureau History Available' : 1,
            'A-Very Low Risk': 2,
            'B-Very Low Risk': 3,
            'C-Very Low Risk': 4,
            'D-Very Low Risk': 5,
            'E-Low Risk': 6,
            'F-Low Risk': 7,
            'G-Low Risk': 8,
            'H-Medium Risk': 9,
            'I-Medium Risk': 10,
            'J-High Risk': 11,
            'K-High Risk': 12,
            'L-Very High Risk': 13,
            'M-Very High Risk': 14,
}

In [69]:
train_df['PERFORM_CNS.SCORE.DESCRIPTION'] = train_df['PERFORM_CNS.SCORE.DESCRIPTION'].map(bue_map)

In [70]:
test_df['PERFORM_CNS.SCORE.DESCRIPTION'] = test_df['PERFORM_CNS.SCORE.DESCRIPTION'].map(bue_map)

In [71]:
train_df['CREDIT.HISTORY.LENGTH'] = train_df['CREDIT.HISTORY.LENGTH'].str.split().apply(lambda x: (int(x[0][0:-3]) * 12) + int(x[1][0:-3]))
test_df['CREDIT.HISTORY.LENGTH'] = test_df['CREDIT.HISTORY.LENGTH'].str.split().apply(lambda x: (int(x[0][0:-3]) * 12) + int(x[1][0:-3]))
train_df['CREDIT.HISTORY.LENGTH'] = train_df['CREDIT.HISTORY.LENGTH'] / 12
test_df['CREDIT.HISTORY.LENGTH'] = test_df['CREDIT.HISTORY.LENGTH'] / 12

In [72]:
train_df['AVERAGE.ACCT.AGE'] = train_df['AVERAGE.ACCT.AGE'].str.split().apply(lambda x: (int(x[0][0:-3]) * 12) + int(x[1][0:-3]))
test_df['AVERAGE.ACCT.AGE'] = test_df['AVERAGE.ACCT.AGE'].str.split().apply(lambda x: (int(x[0][0:-3]) * 12) + int(x[1][0:-3]))
train_df['AVERAGE.ACCT.AGE'] = train_df['AVERAGE.ACCT.AGE'] / 12
test_df['AVERAGE.ACCT.AGE'] = test_df['AVERAGE.ACCT.AGE'] / 12

In [73]:
show_all_pandas(train_df.head())

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,DisbursalDate,State_ID,Employee_code_ID,MobileNo_Avl_Flag,Aadhar_flag,PAN_flag,VoterID_flag,Driving_flag,Passport_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRI.DISBURSED.AMOUNT,SEC.NO.OF.ACCTS,SEC.ACTIVE.ACCTS,SEC.OVERDUE.ACCTS,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default,cus_age
0,420825,50578,58400,89.55,67,22807,45,1441,1984-01-01,Salaried,2018-08-03,6,1998,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,34
1,537409,47145,65550,73.23,67,22807,45,1502,1985-07-31,Self employed,2018-09-26,6,1998,1,1,0,0,0,0,598,10,1,1,1,27600,50200,50200,0,0,0,0,0,0,1991,0,0,1,1.916667,1.916667,0,1,33
2,417566,53278,61360,89.63,67,22807,45,1497,1985-08-24,Self employed,2018-08-01,6,1998,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,32
3,624493,57513,66113,88.48,67,22807,45,1501,1993-12-30,Self employed,2018-10-26,6,1998,1,1,0,0,0,0,305,13,3,0,0,0,0,0,0,0,0,0,0,0,31,0,0,0,0.666667,1.25,1,1,24
4,539055,52378,60300,88.39,67,22807,45,1495,1977-12-09,Self employed,2018-09-26,6,1998,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,1,1,40


In [19]:
exclude_feats = ['UniqueID', 'asset_cost', 'supplier_id', 'Current_pincode_ID', 'Date.of.Birth', 'DisbursalDate', 'Employee_code_ID', 'MobileNo_Avl_Flag', 'PERFORM_CNS.SCORE']

features = [col for col in test_df.columns if col not in exclude_feats]

features

['disbursed_amount',
 'ltv',
 'branch_id',
 'manufacturer_id',
 'Employment.Type',
 'State_ID',
 'Aadhar_flag',
 'PAN_flag',
 'VoterID_flag',
 'Driving_flag',
 'Passport_flag',
 'PERFORM_CNS.SCORE.DESCRIPTION',
 'PRI.NO.OF.ACCTS',
 'PRI.ACTIVE.ACCTS',
 'PRI.OVERDUE.ACCTS',
 'PRI.CURRENT.BALANCE',
 'PRI.SANCTIONED.AMOUNT',
 'PRI.DISBURSED.AMOUNT',
 'SEC.NO.OF.ACCTS',
 'SEC.ACTIVE.ACCTS',
 'SEC.OVERDUE.ACCTS',
 'SEC.CURRENT.BALANCE',
 'SEC.SANCTIONED.AMOUNT',
 'SEC.DISBURSED.AMOUNT',
 'PRIMARY.INSTAL.AMT',
 'SEC.INSTAL.AMT',
 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
 'AVERAGE.ACCT.AGE',
 'CREDIT.HISTORY.LENGTH',
 'NO.OF_INQUIRIES',
 'cus_age']

In [20]:
cat_inds = [2, 3, 4, 5, 6, 7, 8]

In [40]:
train_df_ts = train_df.sort_values(['DisbursalDate']).reset_index(drop=True)

In [41]:
show_all_pandas(train_df_ts.head(10))

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,DisbursalDate,State_ID,Employee_code_ID,MobileNo_Avl_Flag,Aadhar_flag,PAN_flag,VoterID_flag,Driving_flag,Passport_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRI.DISBURSED.AMOUNT,SEC.NO.OF.ACCTS,SEC.ACTIVE.ACCTS,SEC.OVERDUE.ACCTS,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default,cus_age
0,417581,49803,68826,74.1,77,21772,86,2293,1990-01-12,Salaried,2018-08-01,4,420,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,1,1,28
1,418698,62635,75886,83.77,136,15523,86,3742,1983-05-27,Salaried,2018-08-01,8,2172,1,1,0,0,0,0,17,0,1,1,0,25200,42000,42000,0,0,0,0,0,0,0,0,1,0,0.166667,0.166667,0,0,35
2,418913,55959,67100,84.95,19,22335,45,1873,1983-01-01,Salaried,2018-08-01,4,1505,1,1,0,0,0,0,737,4,1,1,0,35817,48000,48000,1,0,0,0,0,0,0,0,0,0,2.166667,3.666667,0,0,35
3,418318,41787,53811,79.34,136,15523,86,3723,1985-03-26,Salaried,2018-08-01,8,2172,1,1,0,0,0,0,786,3,37,14,0,8473757,9403861,9316764,0,0,0,0,0,0,33330,0,0,0,2.0,13.5,0,0,33
4,418974,37775,46505,83.86,136,18651,48,3822,1991-01-31,Salaried,2018-08-01,8,140,1,1,0,0,0,0,675,7,4,3,0,1146051,1263207,1254489,0,0,0,0,0,0,14480,0,0,0,2.166667,3.25,0,0,27
5,418419,50078,60741,83.96,61,15897,45,1359,1991-04-01,Salaried,2018-08-01,6,1184,1,1,0,0,0,0,726,5,5,1,0,62166,93000,93000,0,0,0,0,0,0,0,0,0,0,1.583333,3.166667,0,0,27
6,417802,60864,83415,74.08,136,15523,86,3753,1984-06-07,Salaried,2018-08-01,8,2172,1,1,0,0,0,0,743,4,11,6,0,197045,393900,261606,0,0,0,0,0,0,0,0,2,0,0.583333,1.166667,0,0,34
7,418620,56259,64092,88.93,67,22703,86,1511,1994-10-23,Salaried,2018-08-01,6,1845,1,1,0,0,0,0,825,2,2,0,0,0,0,0,0,0,0,0,0,0,3512,0,0,0,0.583333,0.583333,0,0,23
8,418820,35289,58890,67.92,67,22727,45,1509,1975-02-11,Salaried,2018-08-01,6,1845,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,43
9,418128,43894,58320,77.16,160,18696,45,94,1961-01-01,Salaried,2018-08-01,16,716,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,1,57


In [27]:
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import RepeatedStratifiedKFold, TimeSeriesSplit

In [28]:
import catboost as cb

In [23]:
def run_cv_model(train, target, model_fn, params={}, n_repeats=1, n_splits=5, test=None, label='model', cv_verbose = False, model_verbose=False):
    kf = RepeatedStratifiedKFold(n_splits=n_splits, random_state=42, n_repeats = n_repeats)
    fold_splits = kf.split(train, target)
    auc_scores = []
    pred_full_test = np.zeros((test.shape[0], n_splits*n_repeats))
    pred_train = np.zeros(train.shape[0])
    feature_importance_df = pd.DataFrame()
    i = 1
    for dev_index, val_index in fold_splits:
        if cv_verbose:
            print('\nStarted ' + label + ' fold ' + str(i) + '/'+str(n_splits*n_repeats))
            print("#########################")
        if isinstance(train, pd.DataFrame):
            dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
        else:
            dev_X, val_X = train[dev_index], train[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
        params2 = params.copy()
        pred_val_y, pred_test_y, importances, auc = model_fn(dev_X, dev_y, val_X, val_y, test, params2, model_verbose)
        pred_full_test[:, i-1] = pred_test_y
        pred_train[val_index] = pred_val_y
        
        auc_scores.append(auc)
        if cv_verbose:
            print(label + ' cv score {}: AUC {}'.format(i, auc))
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = train.columns.values
        fold_importance_df['importance'] = importances
        fold_importance_df['fold'] = i
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)        
        i += 1
    
    if cv_verbose:
        print("\n\n CV - Report\n")
        print('{} cv AUC scores : {}'.format(label, auc_scores))
        print('{} cv mean AUC score : {}'.format(label, np.mean(auc_scores)))
        print('{} cv std AUC score : {}'.format(label, np.std(auc_scores)))
    results = {'label': label,
               'train': pred_train, 'test': pred_full_test,
                'auc': auc_scores,
               'importance': feature_importance_df}
    return results

In [37]:
def run_cv_model_ts(train, target, model_fn, params={}, n_splits=5, test=None, label='model', cv_verbose = False, model_verbose=False):
    kf = TimeSeriesSplit(n_splits=n_splits)
    fold_splits = kf.split(train, target)
    auc_scores = []
    pred_full_test = np.zeros((test.shape[0], n_splits))
    pred_train = np.zeros(train.shape[0])
    feature_importance_df = pd.DataFrame()
    i = 1
    for dev_index, val_index in fold_splits:
        if cv_verbose:
            print('\nStarted ' + label + ' fold ' + str(i) + '/'+str(n_splits))
            print("#########################")
        if isinstance(train, pd.DataFrame):
            dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
        else:
            dev_X, val_X = train[dev_index], train[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
        params2 = params.copy()
        pred_val_y, pred_test_y, importances, auc = model_fn(dev_X, dev_y, val_X, val_y, test, params2, model_verbose)
        pred_full_test[:, i-1] = pred_test_y
        pred_train[val_index] = pred_val_y
        
        auc_scores.append(auc)
        if cv_verbose:
            print(label + ' cv score {}: AUC {}'.format(i, auc))
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = train.columns.values
        fold_importance_df['importance'] = importances
        fold_importance_df['fold'] = i
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)        
        i += 1
    
    if cv_verbose:
        print("\n\n CV - Report\n")
        print('{} cv AUC scores : {}'.format(label, auc_scores))
        print('{} cv mean AUC score : {}'.format(label, np.mean(auc_scores)))
        print('{} cv std AUC score : {}'.format(label, np.std(auc_scores)))
    results = {'label': label,
               'train': pred_train, 'test': pred_full_test,
                'auc': auc_scores,
               'importance': feature_importance_df}
    return results

In [38]:
def runCatBoostCls(train_X, train_y, test_X, test_y, test_X2, params, model_verbose):
    if model_verbose:
        print('Prep Catboost Classifier')
    cat_ind = params.pop('cat_features')
    val_pool = cb.Pool(test_X.values, test_y.values, cat_features=cat_ind)
    model = cb.CatBoostClassifier()
    model.set_params(** params)
    if model_verbose:
        print('Train Catboost Classifier')
    model.fit(X=train_X, y=train_y, eval_set=val_pool, cat_features=cat_ind, use_best_model=True)
    
    if model_verbose:
        print('Predict validation set')
    pred_test_y = model.predict_proba(test_X)
    pred_test_y = list(pred_test_y[:,1].flatten())
    auc = roc_auc_score(test_y, pred_test_y)
    if model_verbose:
        print('Predict actual test set')
    if test_X2 is not None:
        pred_test_y2 = model.predict_proba(test_X2)
        pred_test_y2 = list(pred_test_y2[:,1].flatten())
    else:
        pred_test_y2 = 0
    print('returning')
    return pred_test_y, pred_test_y2, model.feature_importances_, auc

In [44]:
catboost_params = {'iterations': 3000,
                   'learning_rate': 0.08,
                   'depth': 8,
                   'loss_function': 'Logloss',
                   'eval_metric': 'AUC',
                   'random_seed': 2018,
                   'l2_leaf_reg': 3,
                   'scale_pos_weight': 3,
                   'early_stopping_rounds': 400,
                   'use_best_model': True,
                   'verbose': 100,
                   'cat_features': cat_inds}

results = run_cv_model(train_df[features], train_df['loan_default'], runCatBoostCls, catboost_params, n_splits=5, test=test_df[features], label='CatBoost', cv_verbose=True, model_verbose=True)


Started CatBoost fold 1/5
#########################
Prep Catboost Classifier
Train Catboost Classifier
0:	test: 0.6157366	best: 0.6157366 (0)	total: 555ms	remaining: 27m 44s
100:	test: 0.6577800	best: 0.6577800 (100)	total: 43.2s	remaining: 20m 39s
200:	test: 0.6602210	best: 0.6602210 (200)	total: 1m 24s	remaining: 19m 33s
300:	test: 0.6604096	best: 0.6606512 (260)	total: 2m 5s	remaining: 18m 45s
400:	test: 0.6607388	best: 0.6608212 (396)	total: 2m 47s	remaining: 18m 4s
500:	test: 0.6602864	best: 0.6609824 (414)	total: 3m 28s	remaining: 17m 20s
600:	test: 0.6591961	best: 0.6609824 (414)	total: 4m 10s	remaining: 16m 41s
700:	test: 0.6587684	best: 0.6609824 (414)	total: 4m 53s	remaining: 16m 2s
800:	test: 0.6585668	best: 0.6609824 (414)	total: 5m 34s	remaining: 15m 19s
Stopped by overfitting detector  (400 iterations wait)

bestTest = 0.6609823578
bestIteration = 414

Shrink model to first 415 iterations.
Predict validation set
Predict actual test set
returning
CatBoost cv score 1: AUC 

In [49]:
imports = results['importance'].groupby('feature')['feature', 'importance'].mean().reset_index()
show_all_pandas(imports.sort_values('importance', ascending=False).reset_index(drop=True))

Unnamed: 0,feature,importance
0,branch_id,12.579313
1,ltv,11.461632
2,disbursed_amount,7.38861
3,PERFORM_CNS.SCORE.DESCRIPTION,7.209915
4,cus_age,6.431777
5,State_ID,6.396551
6,manufacturer_id,5.344998
7,PRIMARY.INSTAL.AMT,4.949186
8,Employment.Type,4.518866
9,CREDIT.HISTORY.LENGTH,4.23364


In [50]:
exclude_feats2 = ['UniqueID', 'asset_cost', 'supplier_id', 'Current_pincode_ID', 'Date.of.Birth', 'DisbursalDate', 'Employee_code_ID', 'MobileNo_Avl_Flag', 'VoterID_flag', 'PAN_flag', 'SEC.CURRENT.BALANCE', 'Aadhar_flag', 'SEC.SANCTIONED.AMOUNT', 'Driving_flag', 'SEC.INSTAL.AMT', 'SEC.NO.OF.ACCTS', 'Passport_flag', 'SEC.OVERDUE.ACCTS', 'SEC.DISBURSED.AMOUNT', 'SEC.ACTIVE.ACCTS']

features2 = [col for col in test_df.columns if col not in exclude_feats2]

features2

['disbursed_amount',
 'ltv',
 'branch_id',
 'manufacturer_id',
 'Employment.Type',
 'State_ID',
 'PERFORM_CNS.SCORE',
 'PERFORM_CNS.SCORE.DESCRIPTION',
 'PRI.NO.OF.ACCTS',
 'PRI.ACTIVE.ACCTS',
 'PRI.OVERDUE.ACCTS',
 'PRI.CURRENT.BALANCE',
 'PRI.SANCTIONED.AMOUNT',
 'PRI.DISBURSED.AMOUNT',
 'PRIMARY.INSTAL.AMT',
 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
 'AVERAGE.ACCT.AGE',
 'CREDIT.HISTORY.LENGTH',
 'NO.OF_INQUIRIES',
 'cus_age']

In [51]:
cat_ind2 = [2, 3, 4, 5, 7]

In [53]:
catboost_params2 = {'iterations': 3000,
                   'learning_rate': 0.08,
                   'depth': 8,
                   'loss_function': 'Logloss',
                   'eval_metric': 'AUC',
                   'random_seed': 2018,
                   'l2_leaf_reg': 3,
                   'scale_pos_weight': 3,
                   'early_stopping_rounds': 200,
                   'use_best_model': True,
                   'verbose': 100,
                   'cat_features': cat_ind2}

results2 = run_cv_model(train_df[features2], train_df['loan_default'], runCatBoostCls, catboost_params2, n_splits=5, test=test_df[features2], label='CatBoost', cv_verbose=True, model_verbose=True)


Started CatBoost fold 1/5
#########################
Prep Catboost Classifier
Train Catboost Classifier
0:	test: 0.6186508	best: 0.6186508 (0)	total: 581ms	remaining: 29m 2s
100:	test: 0.6574175	best: 0.6574175 (100)	total: 47.7s	remaining: 22m 48s
200:	test: 0.6598508	best: 0.6598508 (200)	total: 1m 34s	remaining: 21m 52s
300:	test: 0.6606556	best: 0.6606556 (300)	total: 2m 21s	remaining: 21m 5s
400:	test: 0.6602555	best: 0.6607516 (337)	total: 3m 7s	remaining: 20m 14s
500:	test: 0.6598965	best: 0.6607516 (337)	total: 3m 54s	remaining: 19m 31s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.660751607
bestIteration = 337

Shrink model to first 338 iterations.
Predict validation set
Predict actual test set
returning
CatBoost cv score 1: AUC 0.6607516070235795

Started CatBoost fold 2/5
#########################
Prep Catboost Classifier
Train Catboost Classifier
0:	test: 0.6258443	best: 0.6258443 (0)	total: 484ms	remaining: 24m 12s
100:	test: 0.6604530	best: 0.660475

In [75]:
train_df['neg_p_o'] = 0
train_df.loc[(train_df['PRI.CURRENT.BALANCE'] < 0), 'neg_p_o'] = 1
train_df['PRI.CURRENT.BALANCE'] = np.abs(train_df['PRI.CURRENT.BALANCE'])

test_df['neg_p_o'] = 0
test_df.loc[(test_df['PRI.CURRENT.BALANCE'] < 0), 'neg_p_o'] = 1
test_df['PRI.CURRENT.BALANCE'] = np.abs(test_df['PRI.CURRENT.BALANCE'])

train_df['neg_s_o'] = 0
train_df.loc[(train_df['SEC.CURRENT.BALANCE'] < 0), 'neg_s_o'] = 1
train_df['SEC.CURRENT.BALANCE'] = np.abs(train_df['SEC.CURRENT.BALANCE'])

test_df['neg_s_o'] = 0
test_df.loc[(test_df['SEC.CURRENT.BALANCE'] < 0), 'neg_s_o'] = 1
test_df['SEC.CURRENT.BALANCE'] = np.abs(test_df['SEC.CURRENT.BALANCE'])

In [76]:
train_df['sec_acc'] = (train_df['SEC.ACTIVE.ACCTS'] > 0).astype(int)
test_df['sec_acc'] = (test_df['SEC.ACTIVE.ACCTS'] > 0).astype(int)

In [54]:
from sklearn.preprocessing import MinMaxScaler

In [77]:
train_df['disbursed_amount'] = np.log1p(train_df['disbursed_amount'])
test_df['disbursed_amount'] = np.log1p(test_df['disbursed_amount'])

train_df['ltv'] = np.log1p(train_df['ltv'])
test_df['ltv'] = np.log1p(test_df['ltv'])

train_df['AVERAGE.ACCT.AGE'] = np.log1p(train_df['AVERAGE.ACCT.AGE'])
test_df['AVERAGE.ACCT.AGE'] = np.log1p(test_df['AVERAGE.ACCT.AGE'])

train_df['CREDIT.HISTORY.LENGTH'] = np.log1p(train_df['CREDIT.HISTORY.LENGTH'])
test_df['CREDIT.HISTORY.LENGTH'] = np.log1p(test_df['CREDIT.HISTORY.LENGTH'])

train_df['NEW.ACCTS.IN.LAST.SIX.MONTHS'] = np.log1p(train_df['NEW.ACCTS.IN.LAST.SIX.MONTHS'])
test_df['NEW.ACCTS.IN.LAST.SIX.MONTHS'] = np.log1p(test_df['NEW.ACCTS.IN.LAST.SIX.MONTHS'])

train_df['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS'] = np.log1p(train_df['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS'])
test_df['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS'] = np.log1p(test_df['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS'])

train_df['PRIMARY.INSTAL.AMT'] = np.log1p(train_df['PRIMARY.INSTAL.AMT'])
test_df['PRIMARY.INSTAL.AMT'] = np.log1p(test_df['PRIMARY.INSTAL.AMT'])

train_df['SEC.INSTAL.AMT'] = np.log1p(train_df['SEC.INSTAL.AMT'])
test_df['SEC.INSTAL.AMT'] = np.log1p(test_df['SEC.INSTAL.AMT'])

train_df['PRI.NO.OF.ACCTS'] = np.log1p(train_df['PRI.NO.OF.ACCTS'])
test_df['PRI.NO.OF.ACCTS'] = np.log1p(test_df['PRI.NO.OF.ACCTS'])

train_df['PRI.ACTIVE.ACCTS'] = np.log1p(train_df['PRI.ACTIVE.ACCTS'])
test_df['PRI.ACTIVE.ACCTS'] = np.log1p(test_df['PRI.ACTIVE.ACCTS'])

train_df['PRI.OVERDUE.ACCTS'] = np.log1p(train_df['PRI.OVERDUE.ACCTS'])
test_df['PRI.OVERDUE.ACCTS'] = np.log1p(test_df['PRI.OVERDUE.ACCTS'])

train_df['PRI.CURRENT.BALANCE'] = np.log1p(train_df['PRI.CURRENT.BALANCE'])
test_df['PRI.CURRENT.BALANCE'] = np.log1p(test_df['PRI.CURRENT.BALANCE'])

train_df['PRI.SANCTIONED.AMOUNT'] = np.log1p(train_df['PRI.SANCTIONED.AMOUNT'])
test_df['PRI.SANCTIONED.AMOUNT'] = np.log1p(test_df['PRI.SANCTIONED.AMOUNT'])

train_df['PRI.DISBURSED.AMOUNT'] = np.log1p(train_df['PRI.DISBURSED.AMOUNT'])
test_df['PRI.DISBURSED.AMOUNT'] = np.log1p(test_df['PRI.DISBURSED.AMOUNT'])




In [80]:
scale_cols = ['disbursed_amount', 'ltv', 'PERFORM_CNS.SCORE', 'PERFORM_CNS.SCORE.DESCRIPTION', 'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT', 'PRI.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH', 'cus_age']

In [81]:
mm_scal = MinMaxScaler()
mm_scal.fit(train_df[scale_cols])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [83]:
train_df[scale_cols] = mm_scal.transform(train_df[scale_cols])

In [86]:
test_df.fillna(0, inplace=True)

In [87]:
test_df[scale_cols] = mm_scal.transform(test_df[scale_cols])

In [91]:
train_df['NO.OF_INQUIRIES'] = train_df['NO.OF_INQUIRIES'] / 36
test_df['NO.OF_INQUIRIES'] = test_df['NO.OF_INQUIRIES'] / 36

In [92]:
show_all_pandas(train_df.head(20))

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,DisbursalDate,State_ID,Employee_code_ID,MobileNo_Avl_Flag,Aadhar_flag,PAN_flag,VoterID_flag,Driving_flag,Passport_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRI.DISBURSED.AMOUNT,SEC.NO.OF.ACCTS,SEC.ACTIVE.ACCTS,SEC.OVERDUE.ACCTS,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default,cus_age,neg_p_o,neg_s_o,sec_acc
0,420825,0.309634,58400,0.972988,67,22807,45,1441,1984-01-01,Salaried,2018-08-03,6,1998,1,1,0,0,0,0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.313725,0,0,0
1,537409,0.293322,65550,0.88114,67,22807,45,1502,1985-07-31,Self employed,2018-09-26,6,1998,1,1,0,0,0,0,0.67191,0.714286,0.113295,0.139278,0.212746,0.556184,0.522301,0.522301,0,0,0,0,0,0,0.44531,0.0,0.0,0.22767,0.309565,0.290181,0.0,1,0.294118,0,0,0
2,417566,0.321703,61360,0.973396,67,22807,45,1497,1985-08-24,Self employed,2018-08-01,6,1998,1,1,0,0,0,0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.27451,0,0,0
3,624493,0.339454,66113,0.967495,67,22807,45,1501,1993-12-30,Self employed,2018-10-26,6,1998,1,1,0,0,0,0,0.342697,0.928571,0.226589,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.203153,0.0,0.0,0.0,0.147727,0.219831,0.027778,1,0.117647,0,0,0
4,539055,0.317749,60300,0.967029,67,22807,45,1495,1977-12-09,Self employed,2018-09-26,6,1998,1,1,0,0,0,0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,1,0.431373,0,0,0
5,518279,0.327021,61900,0.973549,67,22807,45,1501,1990-09-08,Self employed,2018-09-19,6,1998,1,1,0,0,0,0,0.926966,0.142857,0.179568,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.422419,0.0,0.0,0.0,0.292548,0.297817,0.0,0,0.196078,0,0,0
6,529269,0.28937,61500,0.900587,67,22807,45,1502,1988-06-01,Salaried,2018-09-23,6,1998,1,1,0,0,0,0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.235294,0,0,0
7,510278,0.27674,61900,0.872721,67,22807,45,1501,1989-10-04,Salaried,2018-09-16,6,1998,1,1,0,0,0,0,0.019101,0.0,0.113295,0.139278,0.0,0.608995,0.541351,0.541351,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.044579,0.041788,0.0,0,0.196078,0,0,0
8,490213,0.32359,61973,0.973039,67,22807,45,1497,1991-11-15,Self employed,2018-09-05,6,1998,1,1,0,0,0,0,0.806742,0.357143,0.113295,0.139278,0.0,0.203297,0.618083,0.618083,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.501635,0.470224,0.027778,0,0.156863,1,0,0
9,510980,0.318744,61300,0.959524,67,22807,45,1492,1968-06-01,Salaried,2018-09-16,6,1998,1,0,0,1,0,0,0.919101,0.142857,0.113295,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.461127,0.0,0.0,0.0,0.274468,0.257282,0.0,0,0.627451,0,0,0


In [93]:
cat_cols_en = ['branch_id', 'manufacturer_id', 'Current_pincode_ID', 'Employment.Type', 'State_ID']

In [94]:
ext_col = ['MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag', 'Driving_flag', 'Passport_flag', 'neg_p_o', 'neg_s_o', 'sec_acc']

In [95]:
total_features = scale_cols + cat_cols_en + ext_col

In [96]:
total_features

['disbursed_amount',
 'ltv',
 'PERFORM_CNS.SCORE',
 'PERFORM_CNS.SCORE.DESCRIPTION',
 'PRI.NO.OF.ACCTS',
 'PRI.ACTIVE.ACCTS',
 'PRI.OVERDUE.ACCTS',
 'PRI.CURRENT.BALANCE',
 'PRI.SANCTIONED.AMOUNT',
 'PRI.DISBURSED.AMOUNT',
 'PRIMARY.INSTAL.AMT',
 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
 'AVERAGE.ACCT.AGE',
 'CREDIT.HISTORY.LENGTH',
 'cus_age',
 'branch_id',
 'manufacturer_id',
 'Current_pincode_ID',
 'Employment.Type',
 'State_ID',
 'MobileNo_Avl_Flag',
 'Aadhar_flag',
 'PAN_flag',
 'VoterID_flag',
 'Driving_flag',
 'Passport_flag',
 'neg_p_o',
 'neg_s_o',
 'sec_acc']

In [97]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [100]:
for col in cat_cols_en:
    trn, sub = target_encode(train_df[col], 
                             test_df[col], 
                             target=train_df['loan_default'], 
                             min_samples_leaf=100,
                             smoothing=10,
                             noise_level=0.01)
    train_df[col] = trn
    test_df[col] = sub

In [101]:
show_all_pandas(train_df.head(10))

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,DisbursalDate,State_ID,Employee_code_ID,MobileNo_Avl_Flag,Aadhar_flag,PAN_flag,VoterID_flag,Driving_flag,Passport_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRI.DISBURSED.AMOUNT,SEC.NO.OF.ACCTS,SEC.ACTIVE.ACCTS,SEC.OVERDUE.ACCTS,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default,cus_age,neg_p_o,neg_s_o,sec_acc
0,420825,0.309634,58400,0.972988,0.195111,22807,0.227631,0.217467,1984-01-01,0.200523,2018-08-03,0.208994,1998,1,1,0,0,0,0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.313725,0,0,0
1,537409,0.293322,65550,0.88114,0.193174,22807,0.232773,0.216088,1985-07-31,0.226719,2018-09-26,0.206863,1998,1,1,0,0,0,0,0.67191,0.714286,0.113295,0.139278,0.212746,0.556184,0.522301,0.522301,0,0,0,0,0,0,0.44531,0.0,0.0,0.22767,0.309565,0.290181,0.0,1,0.294118,0,0,0
2,417566,0.321703,61360,0.973396,0.194894,22807,0.22882,0.215965,1985-08-24,0.22896,2018-08-01,0.204488,1998,1,1,0,0,0,0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.27451,0,0,0
3,624493,0.339454,66113,0.967495,0.195601,22807,0.231241,0.219112,1993-12-30,0.227171,2018-10-26,0.204273,1998,1,1,0,0,0,0,0.342697,0.928571,0.226589,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.203153,0.0,0.0,0.0,0.147727,0.219831,0.027778,1,0.117647,0,0,0
4,539055,0.317749,60300,0.967029,0.194398,22807,0.227148,0.218643,1977-12-09,0.229196,2018-09-26,0.205853,1998,1,1,0,0,0,0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,1,0.431373,0,0,0
5,518279,0.327021,61900,0.973549,0.193186,22807,0.229948,0.217825,1990-09-08,0.2288,2018-09-19,0.206477,1998,1,1,0,0,0,0,0.926966,0.142857,0.179568,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.422419,0.0,0.0,0.0,0.292548,0.297817,0.0,0,0.196078,0,0,0
6,529269,0.28937,61500,0.900587,0.196405,22807,0.226682,0.220156,1988-06-01,0.203745,2018-09-23,0.208517,1998,1,1,0,0,0,0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.235294,0,0,0
7,510278,0.27674,61900,0.872721,0.193837,22807,0.229414,0.215941,1989-10-04,0.203657,2018-09-16,0.202678,1998,1,1,0,0,0,0,0.019101,0.0,0.113295,0.139278,0.0,0.608995,0.541351,0.541351,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.044579,0.041788,0.0,0,0.196078,0,0,0
8,490213,0.32359,61973,0.973039,0.195868,22807,0.22908,0.213601,1991-11-15,0.228982,2018-09-05,0.209344,1998,1,1,0,0,0,0,0.806742,0.357143,0.113295,0.139278,0.0,0.203297,0.618083,0.618083,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.501635,0.470224,0.027778,0,0.156863,1,0,0
9,510980,0.318744,61300,0.959524,0.19104,22807,0.22931,0.220682,1968-06-01,0.201675,2018-09-16,0.206541,1998,1,0,0,1,0,0,0.919101,0.142857,0.113295,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.461127,0.0,0.0,0.0,0.274468,0.257282,0.0,0,0.627451,0,0,0


In [114]:
from sklearn.linear_model import LogisticRegression

In [119]:
train_df_ts = train_df.sort_values(['DisbursalDate']).reset_index(drop=True)

In [120]:
logreg = LogisticRegression(C=1, class_weight={0:1, 1:5}, random_state=2018, n_jobs=-1, max_iter=1000, penalty='l2')

In [121]:
logreg.fit(train_df[total_features], train_df['loan_default'])

  " = {}.".format(self.n_jobs))


LogisticRegression(C=1, class_weight={0: 1, 1: 5}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=2018,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [122]:
roc_auc_score(train_df['loan_default'], logreg.predict_proba(train_df[total_features])[:,1])

0.651201656515449

In [127]:
sub_df = pd.DataFrame({'UniqueID': test_df['UniqueID']})
sub_df['loan_default'] = results['test'].sum(axis=1)

In [130]:
lr_sub = logreg.predict_proba(test_df[total_features])[:,1]

In [131]:
np.corrcoef(lr_sub, sub_df['loan_default'])

array([[1.        , 0.83046781],
       [0.83046781, 1.        ]])

In [132]:
sub_df1 = pd.DataFrame({'UniqueID': test_df['UniqueID']})
sub_df1['loan_default'] = lr_sub

In [133]:
sub_df3 = pd.DataFrame({'UniqueID': test_df['UniqueID']})
sub_df3['loan_default'] = (lr_sub + sub_df['loan_default']) / 2

In [134]:
sub_df.to_csv('sub_cat.csv', index=False)
sub_df1.to_csv('sub_lr.csv', index=False)
sub_df3.to_csv('sub_avg.csv', index=False)

In [135]:
sub_df4 = pd.DataFrame({'UniqueID': test_df['UniqueID']})
sub_df4['loan_default'] = results2['test'].sum(axis=1)

In [136]:
sub_df4.to_csv('sub_cat2.csv', index=False)

In [137]:
sub_df5 = pd.DataFrame({'UniqueID': test_df['UniqueID']})
sub_df5['loan_default'] = (lr_sub + sub_df4['loan_default']) / 2

In [139]:
sub_df5.to_csv('sub_avg2.csv', index=False)

In [140]:
sub_df6 = pd.DataFrame({'UniqueID': test_df['UniqueID']})
sub_df6['loan_default'] = ( 1* lr_sub +  1.5 * sub_df['loan_default']) / 2.5

In [141]:
sub_df6.to_csv('sub_avg3.csv', index=False)