In [25]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import impute
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import ensemble
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [26]:
app_train_mod = pd.read_csv('./data/app_train_mod.csv')
app_test_mod = pd.read_csv('./data/app_test_mod.csv')
bureau = pd.read_csv('./data/bureau.csv')
bureau_balance = pd.read_csv('./data/bureau_balance.csv')

In [27]:
bureau_train = pd.merge(app_train_mod[['SK_ID_CURR']], bureau, on = 'SK_ID_CURR', how = 'inner')
bureau_test = pd.merge(app_test_mod[['SK_ID_CURR']], bureau, on = 'SK_ID_CURR', how = 'inner')

In [28]:
# Modifying AMT_CREDIT_SUM_DEBT for anomalies (high -ve values): These are mainly in cards where a few entries have 
# taken unused credit limit as -ve debt
bureau_train['AMT_CREDIT_SUM_DEBT'][-bureau_train['AMT_CREDIT_SUM_DEBT'] > .05*bureau_train['AMT_CREDIT_SUM']] = 0
bureau_test['AMT_CREDIT_SUM_DEBT'][-bureau_test['AMT_CREDIT_SUM_DEBT'] > .05*bureau_test['AMT_CREDIT_SUM']] = 0

### bureau.csv

#### Adding new modified features

In [29]:
# Total of debt, overdue amount, count of credits prolonged and annuity amount per SK_ID_CURR
bureau_train_new = bureau.groupby('SK_ID_CURR', as_index = False).agg({'AMT_CREDIT_SUM_DEBT': 'sum', 'AMT_CREDIT_SUM_OVERDUE': 'sum',
                              'CNT_CREDIT_PROLONG': 'sum', 'AMT_ANNUITY': 'sum',  
                                'DAYS_CREDIT': ['mean', 'max'], 'CREDIT_DAY_OVERDUE': ['mean', 'max']})
bureau_train_new.columns = ['SK_ID_CURR', 'AMT_CREDIT_SUM_DEBT_TOT', 'AMT_CREDIT_SUM_OVERDUE_TOT',
       'CNT_CREDIT_PROLONG_TOT', 'AMT_ANNUITY_TOT', 'DAYS_CREDIT_mean', 'DAYS_CREDIT_max',
                          'CREDIT_DAY_OVERDUE_mean', 'CREDIT_DAY_OVERDUE_max']

# Count of active bureau loans per SK_ID_CURR
bureau_train_new1 = bureau[bureau.CREDIT_ACTIVE == 'Active'].groupby('SK_ID_CURR', as_index = False).count()[['SK_ID_CURR','CREDIT_ACTIVE']]
bureau_train_new1.columns = ['SK_ID_CURR', 'CNT_ACTIVE_LOAN']

# Count of Bad debt loans per SK_ID_CURR
bureau_train_new2 = bureau[bureau.CREDIT_ACTIVE == 'Bad debt'].groupby('SK_ID_CURR', as_index = False).count()[['SK_ID_CURR','CREDIT_ACTIVE']]
bureau_train_new2.columns = ['SK_ID_CURR', 'CNT_BAD_DEBT']

# Count of bureau loans applied in last one year per SK_ID_CURR
bureau_train_new3 = bureau[bureau.DAYS_CREDIT >= -365].groupby('SK_ID_CURR', as_index = False).count()[['SK_ID_CURR','DAYS_CREDIT']]
bureau_train_new3.columns = ['SK_ID_CURR', 'CNT_APP_1YR']

# Merging all the new features
x = pd.merge(bureau_train_new1, bureau_train_new2, how='outer', on = 'SK_ID_CURR')
x = pd.merge(x, bureau_train_new3, how='outer', on = 'SK_ID_CURR')
bureau_train_new = pd.merge(bureau_train_new, x, how='outer', on = 'SK_ID_CURR')
bureau_train_new.iloc[:, 1:] = bureau_train_new.iloc[:, 1:].apply(lambda x: (x - np.mean(x)) / np.std(x), axis = 0)
bureau_train_new = bureau_train_new.fillna(0)

In [30]:
# Total of debt, overdue amount, count of credits prolonged and annuity amount per SK_ID_CURR
bureau_test_new = bureau.groupby('SK_ID_CURR', as_index = False).agg({'AMT_CREDIT_SUM_DEBT': 'sum', 'AMT_CREDIT_SUM_OVERDUE': 'sum',
                              'CNT_CREDIT_PROLONG': 'sum', 'AMT_ANNUITY': 'sum',  
                                'DAYS_CREDIT': ['mean', 'max'], 'CREDIT_DAY_OVERDUE': ['mean', 'max']})
bureau_test_new.columns = ['SK_ID_CURR', 'AMT_CREDIT_SUM_DEBT_TOT', 'AMT_CREDIT_SUM_OVERDUE_TOT',
       'CNT_CREDIT_PROLONG_TOT', 'AMT_ANNUITY_TOT', 'DAYS_CREDIT_mean', 'DAYS_CREDIT_max',
                          'CREDIT_DAY_OVERDUE_mean', 'CREDIT_DAY_OVERDUE_max']

# Count of active bureau loans per SK_ID_CURR
bureau_test_new1 = bureau[bureau.CREDIT_ACTIVE == 'Active'].groupby('SK_ID_CURR', as_index = False).count()[['SK_ID_CURR','CREDIT_ACTIVE']]
bureau_test_new1.columns = ['SK_ID_CURR', 'CNT_ACTIVE_LOAN']

# Count of Bad debt loans per SK_ID_CURR
bureau_test_new2 = bureau[bureau.CREDIT_ACTIVE == 'Bad debt'].groupby('SK_ID_CURR', as_index = False).count()[['SK_ID_CURR','CREDIT_ACTIVE']]
bureau_test_new2.columns = ['SK_ID_CURR', 'CNT_BAD_DEBT']

# Count of bureau loans applied in last one year per SK_ID_CURR
bureau_test_new3 = bureau[bureau.DAYS_CREDIT >= -365].groupby('SK_ID_CURR', as_index = False).count()[['SK_ID_CURR','DAYS_CREDIT']]
bureau_test_new3.columns = ['SK_ID_CURR', 'CNT_APP_1YR']

# Merging all the new features
x = pd.merge(bureau_test_new1, bureau_test_new2, how='outer', on = 'SK_ID_CURR')
x = pd.merge(x, bureau_test_new3, how='outer', on = 'SK_ID_CURR')
bureau_test_new = pd.merge(bureau_test_new, x, how='outer', on = 'SK_ID_CURR')
bureau_test_new.iloc[:, 1:] = bureau_test_new.iloc[:, 1:].apply(lambda x: (x - np.mean(x)) / np.std(x), axis = 0)
bureau_test_new = bureau_test_new.fillna(0)

### bureau_balance.csv

In [31]:
# No. of times a SK_ID_BUREAU had DPD 1, 2, 3, 4, 5 in last one year
balance12 = bureau_balance[(bureau_balance.MONTHS_BALANCE >= -12) & ~(bureau_balance.STATUS.isin(['0', 'C', 'X']))]

DPD1_1YR = balance12[bureau_balance.STATUS == '1'].groupby('SK_ID_BUREAU', as_index=False).count()[['SK_ID_BUREAU', 'STATUS']]
DPD1_1YR.columns = ('SK_ID_BUREAU', 'DPD1_1YR')

DPD2_1YR = balance12[bureau_balance.STATUS == '2'].groupby('SK_ID_BUREAU', as_index=False).count()[['SK_ID_BUREAU', 'STATUS']]
DPD2_1YR.columns = ('SK_ID_BUREAU', 'DPD2_1YR')

DPD3_1YR = balance12[bureau_balance.STATUS == '3'].groupby('SK_ID_BUREAU', as_index=False).count()[['SK_ID_BUREAU', 'STATUS']]
DPD3_1YR.columns = ('SK_ID_BUREAU', 'DPD3_1YR')

DPD4_1YR = balance12[bureau_balance.STATUS == '4'].groupby('SK_ID_BUREAU', as_index=False).count()[['SK_ID_BUREAU', 'STATUS']]
DPD4_1YR.columns = ('SK_ID_BUREAU', 'DPD4_1YR')

DPD5_1YR = balance12[bureau_balance.STATUS == '5'].groupby('SK_ID_BUREAU', as_index=False).count()[['SK_ID_BUREAU', 'STATUS']]
DPD5_1YR.columns = ('SK_ID_BUREAU', 'DPD5_1YR')

DPDany_1YR = balance12[bureau_balance.STATUS.isin(['1','2','3','4','5'])].groupby('SK_ID_BUREAU', as_index=False).count()[['SK_ID_BUREAU', 'STATUS']]
DPDany_1YR.columns = ('SK_ID_BUREAU', 'DPDany_1YR')

DPD_1YR = pd.merge(DPD1_1YR, DPD2_1YR, how='outer', on = 'SK_ID_BUREAU')
DPD_1YR = pd.merge(DPD_1YR, DPD3_1YR, how='outer', on = 'SK_ID_BUREAU')
DPD_1YR = pd.merge(DPD_1YR, DPD4_1YR, how='outer', on = 'SK_ID_BUREAU')
DPD_1YR = pd.merge(DPD_1YR, DPD5_1YR, how='outer', on = 'SK_ID_BUREAU')
DPD_1YR = pd.merge(DPD_1YR, DPDany_1YR, how='outer', on = 'SK_ID_BUREAU')

# No. of times a SK_ID_BUREAU had DPD 1, 2, 3, 4, 5 or any  overall
DPD1_overall = bureau_balance[bureau_balance.STATUS == '1'].groupby('SK_ID_BUREAU', as_index=False).count()[['SK_ID_BUREAU', 'STATUS']]
DPD1_overall.columns = ('SK_ID_BUREAU', 'DPD1_overall')

DPD2_overall = bureau_balance[bureau_balance.STATUS == '2'].groupby('SK_ID_BUREAU', as_index=False).count()[['SK_ID_BUREAU', 'STATUS']]
DPD2_overall.columns = ('SK_ID_BUREAU', 'DPD2_overall')

DPD3_overall = bureau_balance[bureau_balance.STATUS == '3'].groupby('SK_ID_BUREAU', as_index=False).count()[['SK_ID_BUREAU', 'STATUS']]
DPD3_overall.columns = ('SK_ID_BUREAU', 'DPD3_overall')

DPD4_overall = bureau_balance[bureau_balance.STATUS == '4'].groupby('SK_ID_BUREAU', as_index=False).count()[['SK_ID_BUREAU', 'STATUS']]
DPD4_overall.columns = ('SK_ID_BUREAU', 'DPD4_overall')

DPD5_overall = bureau_balance[bureau_balance.STATUS == '5'].groupby('SK_ID_BUREAU', as_index=False).count()[['SK_ID_BUREAU', 'STATUS']]
DPD5_overall.columns = ('SK_ID_BUREAU', 'DPD5_overall')

DPDany_overall = bureau_balance[bureau_balance.STATUS.isin(['1','2','3','4','5'])].groupby('SK_ID_BUREAU', as_index=False).count()[['SK_ID_BUREAU', 'STATUS']]
DPDany_overall.columns = ('SK_ID_BUREAU', 'DPDany_overall')

DPD_overall = pd.merge(DPD1_overall, DPD2_overall, how='outer', on = 'SK_ID_BUREAU')
DPD_overall = pd.merge(DPD_overall, DPD3_overall, how='outer', on = 'SK_ID_BUREAU')
DPD_overall = pd.merge(DPD_overall, DPD4_overall, how='outer', on = 'SK_ID_BUREAU')
DPD_overall = pd.merge(DPD_overall, DPD5_overall, how='outer', on = 'SK_ID_BUREAU')
DPD_overall = pd.merge(DPD_overall, DPDany_overall, how='outer', on = 'SK_ID_BUREAU')

In [32]:
bureau_bal_new = pd.merge(DPD_1YR, DPD_overall, how='outer', on = 'SK_ID_BUREAU')
bureau_bal_new = pd.merge(bureau[['SK_ID_CURR', 'SK_ID_BUREAU']], bureau_bal_new, how='left', on = 'SK_ID_BUREAU')
bureau_bal_new.iloc[:, 1:] = bureau_bal_new.iloc[:, 1:].apply(lambda x: (x - np.mean(x)) / np.std(x), axis = 0)
bureau_bal_new = bureau_bal_new.fillna(0)
bureau_bal_new = bureau_bal_new.drop('SK_ID_BUREAU', axis = 1).groupby('SK_ID_CURR', as_index = False).sum()

### Final DataFrames

Merging bureau_bal_new with bureau_new

In [33]:
bureau_train_new = pd.merge(bureau_train_new, bureau_bal_new, on = 'SK_ID_CURR', how = 'left')
bureau_test_new = pd.merge(bureau_test_new, bureau_bal_new, on = 'SK_ID_CURR', how = 'left')

Merging with main application dataset

In [34]:
app_bur_train = pd.merge(app_train_mod, bureau_train_new, on = 'SK_ID_CURR', how = 'left')
app_bur_test = pd.merge(app_test_mod, bureau_test_new, on = 'SK_ID_CURR', how = 'left')
app_bur_train = app_bur_train.fillna(0)
app_bur_test = app_bur_test.fillna(0)

In [39]:
# app_bur_train.to_csv('./data/app_bur_train.csv', index = False)
# app_bur_test.to_csv('./data/app_bur_test.csv', index = False)

### Model Development

#### Train-Test Split

In [36]:
# Train-Test Split
one_index = app_bur_train[app_bur_train.TARGET == 1].index
zero_index = app_bur_train[app_bur_train.TARGET == 0].index

trainindex1 = np.random.choice(one_index, size = int(0.7*one_index.shape[0]), replace = False)
trainindex0 = np.random.choice(zero_index, size = int(0.7*zero_index.shape[0]), replace = False)
trainindex = np.concatenate([trainindex1, trainindex0])
testindex = np.delete(app_bur_train.index, trainindex)

fit_df = app_bur_train.iloc[trainindex]
validation_df = app_bur_train.iloc[testindex]

X_fit = fit_df.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
y_fit = fit_df['TARGET']
X_validation = validation_df.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
y_validation = validation_df['TARGET']

#### Models

##### Random Forest

In [37]:
# Validation
rfmodel = ensemble.RandomForestClassifier(class_weight='balanced')
param_grid = {'n_estimators': [500], 'max_depth':[9], 'max_features':[12]}
gridmodel = model_selection.GridSearchCV(rfmodel, param_grid, scoring = 'roc_auc', cv = 10)
gridmodel.fit(X_fit, y_fit)

print('n_estimators:', gridmodel.best_estimator_.n_estimators, '  ',
      'max_depth:', gridmodel.best_estimator_.max_depth, '  ',
      'max_features:', gridmodel.best_estimator_.max_features)
print('train_roc:', round(gridmodel.score(X_fit, y_fit), 3), '  ',
      'validation_roc:', round(gridmodel.score(X_validation, y_validation), 3))

n_estimators: 500    max_depth: 9    max_features: 12
train_roc: 0.796    validation_roc: 0.746


In [38]:
print(gridmodel.cv_results_['split0_test_score'], gridmodel.cv_results_['split1_test_score'], 
      gridmodel.cv_results_['split2_test_score'], gridmodel.cv_results_['split3_test_score'],
      gridmodel.cv_results_['split4_test_score'], gridmodel.cv_results_['split5_test_score'],
     gridmodel.cv_results_['split6_test_score'], gridmodel.cv_results_['split7_test_score'],
     gridmodel.cv_results_['split8_test_score'], gridmodel.cv_results_['split9_test_score'])


[0.73320651] [0.745017] [0.7526683] [0.74630336] [0.74101244] [0.74925354] [0.74814274] [0.74788384] [0.74706643] [0.74092569]


In [14]:
# Kaggle
X_train = app_bur_train.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
y_train = app_bur_train.TARGET
X_test = app_bur_test.drop('SK_ID_CURR', axis = 1)

# Model fit and prediction
model = ensemble.RandomForestClassifier(n_estimators = 200, max_depth = 3, max_features = 4, class_weight = 'balanced')
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)

In [33]:
sol = pd.DataFrame({'SK_ID_CURR': app_bur_test.SK_ID_CURR, 'TARGET': y_pred[:,1]})
sol.to_csv('./soln/sol_bureaurf.csv', index = False)                   # kaggle: 0.71

##### Stochastic Gradient BOOSTING

In [50]:
# upsample 
xs = app_bur_train.drop('SK_ID_CURR', axis = 1)
ys = app_bur_train.TARGET

X_train, X_validation, y_train, y_validation = model_selection.train_test_split(xs, ys,
                                                                                test_size=0.30, random_state=0, 
                                                                                stratify=app_bur_train.TARGET)
X_train_0 = X_train[X_train.TARGET == 0]
X_train_1 = X_train[X_train.TARGET == 1]
X_train_1 = X_train_1.sample(X_train_0.shape[0], replace=True)
X_train = pd.concat([X_train_0, X_train_1], axis = 0)
y_train = X_train.TARGET
X_train = X_train.drop('TARGET', axis = 1)
X_validation = X_validation.drop('TARGET', axis = 1)

In [51]:
# Validation

sgbmodel = ensemble.GradientBoostingClassifier(subsample=0.7)
param_grid = {"n_estimators": [200], "learning_rate": [0.05], "max_depth": [6]}
gridmodel = model_selection.GridSearchCV(sgbmodel, param_grid, scoring = 'roc_auc')
gridmodel.fit(X_train, y_train)

print(gridmodel.best_estimator_)
print('n_estimators:', gridmodel.best_estimator_.n_estimators, '  ',
      'learning_rate:', gridmodel.best_estimator_.learning_rate, '  ',
      'max_depth:', gridmodel.best_estimator_.max_depth)
print('train_roc:', round(gridmodel.score(X_train, y_train), 3), '  ',
      'validation_roc:', round(gridmodel.score(X_validation, y_validation), 3))

sgb = pd.DataFrame(gridmodel.cv_results_)[['param_n_estimators', 'param_learning_rate', 'param_max_depth', 
                                      'mean_test_score', 'mean_train_score', 'std_test_score']]
try:
    sgb_results = pd.concat([sgb_results, sgb])
except:
    sgb_results = sgb.copy()
sgb_results

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=6,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=0.7, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
n_estimators: 200    learning_rate: 0.05    max_depth: 6
train_roc: 0.832    validation_roc: 0.766


Unnamed: 0,param_n_estimators,param_learning_rate,param_max_depth,mean_test_score,mean_train_score,std_test_score
0,200,0.05,6,0.821752,0.834573,0.000731


In [55]:
X_train.head(2)

Unnamed: 0,SK_ID_CURR,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,OWN_CAR_AGE,FLAG_WORK_PHONE,FLAG_PHONE,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,...,DPD3_1YR,DPD4_1YR,DPD5_1YR,DPDany_1YR,DPD1_overall,DPD2_overall,DPD3_overall,DPD4_overall,DPD5_overall,DPDany_overall
1,100003,1.180505,1.72545,0.592683,1.600873,-0.146313,-0.499013,1.599337,-2.051813,-0.291208,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,-1.13347,-1.152888,-1.404669,-1.092145,2.242932,2.003956,1.599337,-0.062699,-0.291208,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
X_test.head(2)

Unnamed: 0,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,OWN_CAR_AGE,FLAG_WORK_PHONE,FLAG_PHONE,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,EXT_SOURCE_1,...,DPD3_1YR,DPD4_1YR,DPD5_1YR,DPDany_1YR,DPD1_overall,DPD2_overall,DPD3_overall,DPD4_overall,DPD5_overall,DPDany_overall
0,-0.427809,0.142475,-0.55358,-0.037477,-0.138436,-0.507337,-0.597571,-0.024421,-0.289777,1.595754,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,-0.782413,-0.804537,-0.752831,-0.839362,-0.138436,-0.507337,-0.597571,-0.024421,-0.289777,0.393719,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# Kaggle STOCHASTIC GRADIENT BOOSTING
# upscale
X_train = app_bur_train.copy()
X_train_0 = X_train[X_train.TARGET == 0]
X_train_1 = X_train[X_train.TARGET == 1]
X_train_1 = X_train_1.sample(X_train_0.shape[0], replace=True)
X_train = pd.concat([X_train_0, X_train_1], axis = 0)
y_train = X_train.TARGET

X_train = X_train.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
X_test = app_bur_test.drop('SK_ID_CURR', axis = 1)


# Model fit and prediction
model = ensemble.GradientBoostingClassifier(n_estimators = 200, learning_rate = 0.05, max_depth = 6, subsample=0.7)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)

In [65]:
model.classes_

array([0, 1], dtype=int64)

In [67]:
sol = pd.DataFrame({'SK_ID_CURR': app_bur_test.SK_ID_CURR, 'TARGET': y_pred[:,1]})
sol.to_csv('./soln/sol_bureaugb.csv', index = False)                   # kaggle: 0.72664