In [1]:
import pandas as pd
import numpy as np
import pickle


from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from bayes_opt import BayesianOptimization

## Set random number seed, read in final dataset

In [2]:
# Single seed reused by the train/test split and by every model below.
seed = 123

# Engineered feature matrix.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
X = pd.read_pickle('loan data_961.pkl')

# Attach the label to every feature row. DataFrame.join aligns X's index with
# the SK_ID_CURR index of the target series (assumes X is indexed by
# SK_ID_CURR -- TODO confirm). Only the two columns that are actually used are
# parsed from the CSV instead of the whole file.
Y = X.join(
    pd.read_csv("application_train.csv", usecols=['SK_ID_CURR', 'TARGET'])
    .set_index('SK_ID_CURR')
    .TARGET,
    how='left',
).TARGET

# Fail early with a clear message instead of deep inside scikit-learn.
assert len(Y) == len(X), "join changed the row count (duplicate SK_ID_CURR in application_train.csv?)"
assert Y.notna().all(), "some feature rows have no TARGET in application_train.csv"

## Replace all NA-type values (inf, -inf, blanks) with the column's mean. This is done because logistic regression cannot handle NA values.

In [3]:
# Treat +inf / -inf as missing, then fill every missing value with the mean of
# its column.
# NOTE(review): the means are computed over the full dataset, before the
# train/test split in the next cell, so held-out rows influence the imputed
# values (mild data leakage). Consider imputing after the split using
# training-set means only.
X = (
    X.replace(np.inf, np.nan)
     .replace(-np.inf, np.nan)
)
X = X.fillna(X.mean())

## Create train & test datasets

In [4]:
# Hold out a test set using train_test_split's default split size; the shared
# seed keeps the partition reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=seed,
)

## Run a grid search to optimize the C hyperparameter for AUC, with 3-fold cross-validation

In [7]:
# Candidate values for C, LogisticRegression's inverse regularization strength.
# NOTE(review): the grid jumps from 0.01 straight to 1 -- 0.1 is never tried;
# confirm that is intentional.
param_grid = {'C': [.001, .01, 1, 10, 100, 1000]}

# 3-fold cross-validated search scored by ROC AUC; verbose=25 logs every fit.
lrCV = GridSearchCV(
    estimator=LogisticRegression(random_state=seed),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=25,
)

lrCV.fit(X_train, Y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.001 .........................................................
[CV] ................. C=0.001, score=0.623250279733543, total= 1.2min
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


[CV] ................ C=0.001, score=0.6076342919601483, total=  54.9s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.1min remaining:    0.0s


[CV] ................ C=0.001, score=0.6271042090959701, total= 1.1min
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.2min remaining:    0.0s


[CV] ................. C=0.01, score=0.6237210410038934, total= 1.3min
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  4.5min remaining:    0.0s


[CV] ................. C=0.01, score=0.6048656424904855, total=  48.4s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.3min remaining:    0.0s


[CV] ................. C=0.01, score=0.6320123618429686, total= 1.6min
[CV] C=1 .............................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  6.9min remaining:    0.0s


[CV] .................... C=1, score=0.6219533628070975, total=  55.0s
[CV] C=1 .............................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  7.8min remaining:    0.0s


[CV] .................... C=1, score=0.6073335436797354, total=  47.5s
[CV] C=1 .............................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  8.6min remaining:    0.0s


[CV] .................... C=1, score=0.6298626104681049, total=  58.3s
[CV] C=10 ............................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  9.6min remaining:    0.0s


[CV] ................... C=10, score=0.6158320644447102, total=  59.2s
[CV] C=10 ............................................................


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 10.6min remaining:    0.0s


[CV] ................... C=10, score=0.6080064046793057, total=  51.8s
[CV] C=10 ............................................................


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed: 11.4min remaining:    0.0s


[CV] ................... C=10, score=0.6322902249273101, total= 1.4min
[CV] C=100 ...........................................................


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 12.8min remaining:    0.0s


[CV] .................. C=100, score=0.6167052079633472, total=  49.3s
[CV] C=100 ...........................................................


[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed: 13.6min remaining:    0.0s


[CV] ................... C=100, score=0.607858743374257, total=  54.8s
[CV] C=100 ...........................................................


[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed: 14.5min remaining:    0.0s


[CV] .................. C=100, score=0.6326873873583307, total= 1.7min
[CV] C=1000 ..........................................................


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 16.2min remaining:    0.0s


[CV] ................. C=1000, score=0.6138774212832587, total=  48.7s
[CV] C=1000 ..........................................................


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed: 17.0min remaining:    0.0s


[CV] ................. C=1000, score=0.6087234498457527, total=  56.8s
[CV] C=1000 ..........................................................


[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed: 18.0min remaining:    0.0s


[CV] ................. C=1000, score=0.6323531726017443, total= 1.3min


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed: 19.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed: 19.2min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=123, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=25)

In [10]:
# Winning C and its mean cross-validated ROC AUC.
(lrCV.best_params_, lrCV.best_score_)

({'C': 0.01}, 0.620199047319838)

## Model Results

In [5]:
# Final model. C=0.01 is the best value reported by the grid search above; it
# is hard-coded so this cell does not need the fitted `lrCV` object
# (the alternative is `model.set_params(**lrCV.best_params_)`).
model = LogisticRegression(C=.01, random_state=seed).fit(X_train, Y_train)

# Hard class predictions on the held-out set, used by the classification report.
predictions = model.predict(X_test)

# ROC AUC is computed from the second predict_proba column (class 1).
print(roc_auc_score(Y_test, model.predict_proba(X_test)[:, 1]))
print(classification_report(Y_test, predictions))

0.6172319329298318
             precision    recall  f1-score   support

          0       0.57      0.71      0.63      6229
          1       0.61      0.46      0.53      6184

avg / total       0.59      0.59      0.58     12413



In [6]:
# Same metrics on the training set, to compare against the test-set numbers.
# Note: this overwrites `predictions` with the training-set predictions.
predictions = model.predict(X_train)

# ROC AUC from the second predict_proba column (class 1).
print(roc_auc_score(Y_train, model.predict_proba(X_train)[:, 1]))
print(classification_report(Y_train, predictions))

0.6230971491787134
             precision    recall  f1-score   support

          0       0.57      0.70      0.63     18596
          1       0.61      0.47      0.53     18641

avg / total       0.59      0.59      0.58     37237



## Save results for later ROC Curve plotting

In [7]:
# Persist the test-set class-1 probabilities (second predict_proba column).
# The rows are written with a fresh 0..n-1 index, not X_test's own index, so a
# consumer has to line them up with Y_test by position.
(
    pd.DataFrame(model.predict_proba(X_test))
    .loc[:, 1]
    .to_csv('lr_results.csv')
)