In [1]:
import pandas as pd
import numpy as np
import pickle

from matplotlib import pyplot

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score 

from sklearn.ensemble import RandomForestClassifier
from bayes_opt import BayesianOptimization

  from numpy.core.umath_tests import inner1d


## Set random number seed, read in final dataset (1293 variables initially but reduced to 961)

In [2]:
# Single seed reused below for the train/test split and the forest's random_state.
seed = 123

# Feature matrix, stored as a pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
X = pd.read_pickle('loan data_961.pkl')

# Labels: TARGET from the application file, keyed by SK_ID_CURR, then aligned to
# X's rows with a left join on the index.
# Presumably X is indexed by SK_ID_CURR; verify against how the pickle was built.
target_by_id = (
    pd.read_csv("application_train.csv")[['SK_ID_CURR', 'TARGET']]
    .set_index('SK_ID_CURR')
    .TARGET
)
Y = X.join(target_by_id, how='left').TARGET

## Replace all NA-type values (inf, -inf, blanks) with the column's mean. This is done because random forest cannot handle NA values.

In [3]:
# Turn +/-infinity into NaN, then fill every NaN with the mean of its column.
# Both steps modify X in place.
# NOTE(review): the means are taken over all rows of X, and the train/test split
# happens in a later cell, so held-out rows contribute to the imputation values.
X.replace([float('inf'), float('-inf')], np.nan, inplace=True)
column_means = X.mean()
X.fillna(column_means, inplace=True)

## Create train & test datasets

In [4]:
# Hold out part of the data for evaluation. No test_size is given, so
# scikit-learn's default proportion applies; the seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=seed,
)

## Run 30 iterations of Bayesian optimization (10 initial points + 20 optimization steps) to find hyperparameters. Fewer iterations are used for this model because its solution space is smaller than LightGBM's.

In [7]:
def bayes_parameter_opt_lgb(X_train, Y_train, init_round, opt_round, n_folds, random_seed):
    """Search RandomForestClassifier hyperparameters with Bayesian optimization.

    The ``lgb`` in the name is historical; the model tuned here is a random
    forest. The name is kept so existing callers continue to work.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels passed to ``cross_val_score``.
    init_round : int
        Number of initial exploration points (``init_points``).
    opt_round : int
        Number of optimization steps after the initial points (``n_iter``).
    n_folds : int
        Value passed as ``cv`` to ``cross_val_score``.
    random_seed : int
        Seed used for both the optimizer and every forest it evaluates.

    Returns
    -------
    BayesianOptimization
        The optimizer after ``maximize`` has run; its ``max`` attribute is
        also printed before returning.
    """

    def lgb_eval(n_estimators, max_depth, min_samples_leaf):
        """Objective: mean cross-validated ROC AUC for one candidate forest."""
        # The optimizer proposes floats; these three settings must be integers.
        params = {
            # Use the seed handed to the outer function rather than a notebook
            # global, so the function works on its own and honours its argument.
            'random_state': random_seed,
            'n_jobs': -1,
            'n_estimators': int(round(n_estimators)),
            'max_depth': int(round(max_depth)),
            'min_samples_leaf': int(round(min_samples_leaf)),
        }

        model = RandomForestClassifier(**params)
        cv_result = cross_val_score(model, X_train, Y_train, scoring='roc_auc', cv=n_folds, verbose=100)

        return np.mean(cv_result)

    # Search bounds (low, high) for each tuned hyperparameter.
    rfBO = BayesianOptimization(lgb_eval, {'n_estimators': (500, 3000),
                                           'max_depth': (15, 60),
                                           'min_samples_leaf': (2, 6)}, random_state=random_seed)

    rfBO.maximize(init_points=init_round, n_iter=opt_round)

    print(rfBO.max)
    return rfBO

# Long-running: each evaluation cross-validates a forest with up to 3000 trees.
# 10 initial points + 20 optimization steps, 3-fold CV.
rfBO = bayes_parameter_opt_lgb(
    X_train,
    Y_train,
    init_round=10,
    opt_round=20,
    n_folds=3,
    random_seed=seed,
)

|   iter    |  target   | max_depth | min_sa... | n_esti... |
-------------------------------------------------------------
[CV]  ................................................................
[CV] ....................................... , score=0.752409 - 4.6min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.6min remaining:    0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.751113 - 4.7min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  9.3min remaining:    0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.752553 - 4.6min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 14.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 14.0min finished
| [0m 1       [0m | [0m 0.752   [0m | [0m 46.34   [0m | [0m 3.145   [0m | [0m 2.227e+0[0m |
[CV]  ................................

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 18.8min finished
| [0m 10      [0m | [0m 0.7521  [0m | [0m 59.78   [0m | [0m 2.672   [0m | [0m 2.997e+0[0m |
[CV]  ................................................................
[CV] ....................................... , score=0.752641 - 5.9min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.9min remaining:    0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.751257 - 6.0min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 12.0min remaining:    0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.752345 - 6.0min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 18.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 18.0min finished
| [0m 11      [0m | [0m 0.7521  [0m | [0m 59.86   [0m | [0m 5.081   [0m | [0m 2.999

KeyboardInterrupt: 

## Convert the necessary parameters to integer format (rounded the same way as during optimization)

In [44]:
# Best result found by the optimizer: a dict with the score and the float-valued
# hyperparameters that produced it.
rf_params = rfBO.max

# Cast the tuned values to integers with int(round(...)), the same conversion
# the optimization objective applied when it built each forest. Plain int()
# truncates, so e.g. min_samples_leaf=2.672 would become 2 even though the
# score was measured with 3.
rf_params['params'].update({
    key: int(round(rf_params['params'][key]))
    for key in ('n_estimators', 'max_depth', 'min_samples_leaf')
})

## Model performance for test data

In [5]:
# Hyperparameters are fixed by hand here; the optimizer's result is not applied.
# To use the tuned values instead: model.set_params(**rf_params['params'])
model = RandomForestClassifier(
    n_estimators=2000,
    max_depth=15,
    min_samples_leaf=5,
    n_jobs=-1,
    random_state=seed,
)
model = model.fit(X_train, Y_train)

# Hard class predictions for the report, and column 1 of predict_proba
# (the model's second class) as the score for ROC AUC.
predictions = model.predict(X_test)
test_scores = model.predict_proba(X_test)[:, 1]

print(roc_auc_score(Y_test, test_scores))
print(classification_report(Y_test, predictions))

0.7619858091882127
             precision    recall  f1-score   support

          0       0.69      0.69      0.69      6229
          1       0.69      0.69      0.69      6184

avg / total       0.69      0.69      0.69     12413



## Model performance for train data

In [7]:
# Same metrics on the data the model was fitted on, for comparison with the
# held-out scores.
train_scores = model.predict_proba(X_train)[:, 1]
predictions = model.predict(X_train)

print(roc_auc_score(Y_train, train_scores))
print(classification_report(Y_train, predictions))

0.9880436391683465
             precision    recall  f1-score   support

          0       0.95      0.94      0.94     18596
          1       0.94      0.95      0.94     18641

avg / total       0.94      0.94      0.94     37237



## Save results for later ROC Curve plotting

In [8]:
# Column 1 of predict_proba for the test rows, written with a positional
# (0..n-1) index rather than X_test's own index.
test_proba_frame = pd.DataFrame(model.predict_proba(X_test))
positive_class_proba = test_proba_frame[1]
positive_class_proba.to_csv('rf_results.csv')