# Credit worthiness model

Import libraries

In [1]:
from __future__ import division
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from patsy import dmatrices
from sklearn import metrics

Load the cleaned model data from a pickle file (the intercept is added in the next step)

In [2]:
# Load the cleaned modelling data set.
# The path is a raw string so the Windows backslashes stay literal; in a normal
# string literal, sequences such as "\U" are rejected by Python 3.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
DATA_PATH = r'Z:\PMO\Stephen Bowie\Projects\Underwriting\Model\Cleaned Model Data'
df = pd.read_pickle(DATA_PATH)

Create dummy columns and intercept

In [3]:
# Right-hand-side terms of the model formula, each listed exactly once.
predictors = [
    'brand', 'poi', 'mo_pmt', 'amt_fin', 'call', 'auto_hist', 'emp_length',
    'annual_inc', 'ltv', 'down', 'def_down', 'acv', 'vdc', 'trade_roll',
    'pack', 'gross_profit', 'purch_type', 'credit_score',
]
formula = 'repo_in_12 ~ ' + ' + '.join(predictors)

# patsy returns the response (y) and the design matrix (X) as DataFrames;
# X gains an 'Intercept' column and dummy columns for the categorical terms.
y, X = dmatrices(formula, df, return_type='dataframe')
X.columns

Index([u'Intercept', u'brand[T.UP]', u'poi[T.y]', u'call[T.c2000]',
       u'call[T.c1500]', u'call[T.green]', u'call[T.blue]', u'call[T.silver]',
       u'call[T.high]', u'auto_hist[T.pay on time]', u'auto_hist[T.late pay]',
       u'auto_hist[T.repo]', u'purch_type[T.simulcast]', u'purch_type[T.repo]',
       u'purch_type[T.trade]', u'mo_pmt', u'amt_fin', u'emp_length',
       u'annual_inc', u'ltv', u'down', u'def_down', u'acv', u'vdc',
       u'trade_roll', u'pack', u'gross_profit', u'credit_score'],
      dtype='object')

In [4]:
# Map patsy's generated dummy-column labels to short, identifier-friendly names.
column_aliases = {
    # brand / proof of income
    'brand[T.UP]': 'brand_up',
    'poi[T.y]': 'has_poi',
    # call
    'call[T.c2000]': 'call_c2000',
    'call[T.c1500]': 'call_c1500',
    'call[T.green]': 'call_green',
    'call[T.blue]': 'call_blue',
    'call[T.silver]': 'call_silver',
    'call[T.high]': 'call_high',
    # auto history
    'auto_hist[T.pay on time]': 'auto_pay',
    'auto_hist[T.late pay]': 'auto_late',
    'auto_hist[T.repo]': 'auto_repo',
    # purchase type
    'purch_type[T.simulcast]': 'pt_simul',
    'purch_type[T.repo]': 'pt_repo',
    'purch_type[T.trade]': 'pt_trade',
}
X = X.rename(columns=column_aliases)

Inspect the design matrix and set variables for logistic regression

In [5]:
# Display summary statistics (count, mean, std, quartiles, min/max) for each column of X.
X.describe()

Unnamed: 0,Intercept,brand_up,has_poi,call_c2000,call_c1500,call_green,call_blue,call_silver,call_high,auto_pay,...,annual_inc,ltv,down,def_down,acv,vdc,trade_roll,pack,gross_profit,credit_score
count,4262.0,4262.0,4262.0,4262.0,4262.0,4262.0,4262.0,4262.0,4262.0,4262.0,...,4262.0,4262.0,4262.0,4262.0,4262.0,4262.0,4262.0,4262.0,4262.0,4262.0
mean,1.0,0.14946,0.448146,0.176678,0.201549,0.129517,0.133975,0.15298,0.130221,0.197794,...,27712.890906,1.534538,897.890819,303.936469,10702.545753,1559.208588,765.847717,3723.967621,2780.822841,570.018067
std,0.0,0.356583,0.497362,0.38144,0.401204,0.33581,0.340665,0.36001,0.336585,0.398383,...,14860.760635,0.219125,910.31145,467.485361,3117.865491,970.670188,1793.468033,802.500845,1325.258959,74.711655
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.135147,0.0,0.0,2195.0,0.0,-11500.0,590.0,-6326.57,59.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,19500.0,1.406731,400.0,0.0,8895.0,999.0,0.0,2950.0,1869.27,518.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,24960.0,1.551626,500.0,200.0,10495.0,2299.0,0.0,4200.0,3168.43,565.0
75%,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,33280.0,1.662876,1000.0,500.0,12495.0,2299.0,0.0,4200.0,3668.43,618.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,221000.0,2.728847,12900.0,13182.14,25595.0,2299.0,11168.47,4200.0,8469.27,831.0


In [6]:
# Flatten the single-column response into a 1-D array for scikit-learn.
y = np.asarray(y).ravel()

Still to incorporate:
- Explicit handling of missing values (in credit score): two separate models for credit score; ignore records with missing purchase type
- Explicit handling of outliers (identify outliers, model with and without them, compare performance)
- Feature scaling (sklearn preprocessing module)
- Feature evaluation/significance, collinearity test, and feature selection
- Conditional features (e.g. proof of income should emphasize or add weight to annual income)
- Train/test split of the data
- Pipeline?

Logistic regression model

In [7]:
# Fit a logistic regression with default settings on the full design matrix.
# NOTE(review): X already carries patsy's 'Intercept' column and
# LogisticRegression's fit_intercept defaults to True - confirm the duplicate
# constant term is intended.
model = LogisticRegression().fit(X, y)

In [8]:
# Error rate of the fitted model on the same data it was trained on.
error_rate = 1 - model.score(X, y)

# Null error rate: the error made by always predicting the majority class,
# i.e. the share of the minority class. Taking the min keeps this correct
# even when the positive class is the majority.
positive_rate = y.mean()
null_error_rate = min(positive_rate, 1 - positive_rate)

# Single-argument print calls behave the same as a print statement or a function.
print('error rate:  %s' % error_rate)
print('null error rate:  %s' % null_error_rate)
print('improvement:  %s' % (null_error_rate - error_rate))

error rate:  0.273815110277
null error rate:  0.281792585641
improvement:  0.00797747536368


In [14]:
# Examine the coefficients, largest first.
# coef_ is flattened so each table cell holds a scalar rather than a
# one-element array; list() materialises zip where it is lazy.
# Column 0 is the feature name, column 1 its coefficient.
pd.DataFrame(list(zip(X.columns, model.coef_.ravel()))).sort_values(1, ascending=False)

Unnamed: 0,0,1
15,mo_pmt,[0.0064762037201]
13,pt_repo,[0.003211462593]
4,call_c1500,[0.00245001869898]
3,call_c2000,[0.00233875281347]
11,auto_repo,[0.00157664849252]
19,ltv,[0.000916010883757]
0,Intercept,[0.000602992610607]
1,brand_up,[0.00049006030394]
26,gross_profit,[0.000332465155133]
23,vdc,[0.000306191106334]


In [10]:
# Predict class labels for the same matrix X the model was fitted on
# (no separate test set is held out here).
predicted = model.predict(X)
print(predicted)

[ 0.  0.  0. ...,  0.  0.  0.]


In [11]:
# Per-row class probabilities from the fitted model; one column per class.
probs = model.predict_proba(X)
print(probs)

[[ 0.83552313  0.16447687]
 [ 0.66665161  0.33334839]
 [ 0.69551661  0.30448339]
 ..., 
 [ 0.78154341  0.21845659]
 [ 0.85727757  0.14272243]
 [ 0.88041441  0.11958559]]


In [12]:
# Confusion matrix and per-class precision/recall for the predictions on X.
confusion = metrics.confusion_matrix(y, predicted)
report = metrics.classification_report(y, predicted)
print(confusion)
print(report)

[[2979   82]
 [1085  116]]
             precision    recall  f1-score   support

        0.0       0.73      0.97      0.84      3061
        1.0       0.59      0.10      0.17      1201

avg / total       0.69      0.73      0.65      4262

