In [49]:
import pandas as pd
from patsy import dmatrices
import datetime
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

In [50]:
# Load the pre-pickled loans table.
# The path is a raw string so the Windows backslashes are taken literally:
# '\l' is not a valid escape sequence and triggers a warning on newer Pythons.
# NOTE(review): absolute machine-specific path, and unpickling runs arbitrary
# code if the file is untrusted -- consider a configurable data directory.
loans = pd.read_pickle(r'C:\lc data\lc.pickle')

In [51]:
# Store the sub-grade and home-ownership columns as pandas categoricals.
loans['sub_grade'] = loans['sub_grade'].astype('category')
loans['home_ownership'] = loans['home_ownership'].astype('category')

# Derived feature: installment divided by annual income.
loans['installment_over_inc'] = loans['installment'].div(loans['annual_inc'])

In [52]:
# Keep only loans issued strictly inside the (oldest, newest) window:
# after 1 March 2010 and more than 4*14 = 56 weeks before this cell is run.
oldest = datetime.datetime(2010, 3, 1)
newest = datetime.datetime.now() - datetime.timedelta(weeks=4 * 14)
loans = loans.loc[loans['issue_d'].gt(oldest) & loans['issue_d'].lt(newest)]

In [53]:
# Show which columns are available on the loans frame.
loans.columns

Index(['member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term',
       'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'desc', 'purpose', 'title',
       'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'fico_range_low', 'fico_range_high', 'inq_last_6mths',
       'mths_since_last_delinq', 'mths_since_last_record', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'last_fico_range_high', 'last_fico_range_low',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'annual_inc_joint', 'dt

In [54]:
# Build the patsy design matrices as DataFrames: y holds the response `bad`,
# X holds the predictors named on the right-hand side of the formula.
# Terms wrapped in C(...) are expanded into one indicator column per level
# (e.g. 'C(sub_grade)[T.A2]').
# NOTE(review): `bad` has to exist on `loans` already; it is not created in
# any of the visible cells -- confirm where it is derived.
y, X = dmatrices('bad ~ + loan_amnt + int_rate + installment + emp_length +\
                 C(home_ownership)+ C(sub_grade) + C(purpose) + zip_code +\
                 revol_bal + revol_util+ total_acc + open_acc + dti + delinq_2yrs + fico_range_low + annual_inc + installment_over_inc',
                 loans, return_type='dataframe')

In [55]:
# Inspect the design-matrix column names (intercept plus the expanded terms).
X.columns

Index(['Intercept', 'C(home_ownership)[T.MORTGAGE]',
       'C(home_ownership)[T.NONE]', 'C(home_ownership)[T.OTHER]',
       'C(home_ownership)[T.OWN]', 'C(home_ownership)[T.RENT]',
       'C(sub_grade)[T.A2]', 'C(sub_grade)[T.A3]', 'C(sub_grade)[T.A4]',
       'C(sub_grade)[T.A5]', 'C(sub_grade)[T.B1]', 'C(sub_grade)[T.B2]',
       'C(sub_grade)[T.B3]', 'C(sub_grade)[T.B4]', 'C(sub_grade)[T.B5]',
       'C(sub_grade)[T.C1]', 'C(sub_grade)[T.C2]', 'C(sub_grade)[T.C3]',
       'C(sub_grade)[T.C4]', 'C(sub_grade)[T.C5]', 'C(sub_grade)[T.D1]',
       'C(sub_grade)[T.D2]', 'C(sub_grade)[T.D3]', 'C(sub_grade)[T.D4]',
       'C(sub_grade)[T.D5]', 'C(sub_grade)[T.E1]', 'C(sub_grade)[T.E2]',
       'C(sub_grade)[T.E3]', 'C(sub_grade)[T.E4]', 'C(sub_grade)[T.E5]',
       'C(sub_grade)[T.F1]', 'C(sub_grade)[T.F2]', 'C(sub_grade)[T.F3]',
       'C(sub_grade)[T.F4]', 'C(sub_grade)[T.F5]', 'C(sub_grade)[T.G1]',
       'C(sub_grade)[T.G2]', 'C(sub_grade)[T.G3]', 'C(sub_grade)[T.G4]',
       'C(sub_

In [56]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=51,
)

# Random forest

In [57]:
from sklearn.ensemble import RandomForestClassifier

# The design-matrix response is a single-column frame (return_type='dataframe');
# flatten it to the 1-D label vector the classifier expects.
y_train = np.ravel(y_train)

# random_state pins the forest's bootstrap/feature sampling so the scores and
# feature importances are reproducible after a kernel restart (same seed as
# the train/test split).
model = RandomForestClassifier(
    n_jobs=-1,
    verbose=1,
    n_estimators=25,
    random_state=51,
)
fit = model.fit(X_train, y_train)

# Accuracy on the training rows -- optimistic; compare with the test-set score.
model.score(X_train, y_train)

[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   16.2s finished
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    1.2s finished


0.99711914088381681

In [58]:
# Mean accuracy of the currently fitted `model` on the held-out test rows.
model.score(X_test, y_test)

[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    0.5s finished


0.88938085932668764

In [59]:
# Per-row class probabilities for the test rows, one column per class.
# NOTE(review): column 1 is presumably P(bad == 1) -- confirm via model.classes_.
y_predict = model.predict_proba(X_test)
y_predict

[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    0.5s finished


array([[ 0.76,  0.24],
       [ 0.92,  0.08],
       [ 1.  ,  0.  ],
       ..., 
       [ 0.84,  0.16],
       [ 0.92,  0.08],
       [ 0.88,  0.12]])

In [60]:
# Pair each test-row label with the predicted probability from column 1.
# The frame is built directly from the two arrays rather than by passing the
# y_test frame as a Series index and resetting it in place; this avoids the
# 2-D index construction and keeps the cell safe to re-run.
r = pd.DataFrame(
    {'bad': np.ravel(y_test), 'prob': y_predict[:, 1]},
    columns=['bad', 'prob'],
)

# Calibration check: for each distinct predicted probability, the number of
# rows (len), the number of bad loans (sum) and the observed bad rate (mean).
# 'sum'/'mean' are passed by name, which is the form pandas recommends over
# the builtin/NumPy callables; the resulting column labels are unchanged.
r.groupby('prob').agg([len, 'sum', 'mean'])

Unnamed: 0_level_0,bad,bad,bad
Unnamed: 0_level_1,len,sum,mean
prob,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0.0,22491.0,1039.0,0.046196
0.04,30046.0,2126.0,0.070758
0.08,29072.0,2650.0,0.091153
0.12,24608.0,2796.0,0.113622
0.16,18915.0,2532.0,0.133862
0.2,13588.0,2124.0,0.156314
0.24,9235.0,1567.0,0.169681
0.28,6107.0,1211.0,0.198297
0.32,3570.0,748.0,0.209524
0.36,2042.0,484.0,0.237023


In [61]:
# Summary statistics for the rows whose predicted probability is at most 0.1.
r.loc[r['prob'] <= 0.1].describe()

Unnamed: 0,bad,prob
count,81609.0,81609.0
mean,0.071254,0.043226
std,0.257251,0.031631
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.04
75%,0.0,0.08
max,1.0,0.08


In [62]:
# Rank (column name, importance) pairs from most to least important and show
# the first 100.
features = list(zip(X.columns, model.feature_importances_))
features.sort(key=lambda pair: pair[1], reverse=True)
features[:100]

[('installment_over_inc', 0.07949931815778663),
 ('revol_bal', 0.079206222400730969),
 ('dti', 0.078719596928310459),
 ('revol_util', 0.077974464535387117),
 ('zip_code', 0.075303940630520497),
 ('installment', 0.072854009577129147),
 ('int_rate', 0.072194392145975464),
 ('annual_inc', 0.067625349800669671),
 ('total_acc', 0.063821868606092791),
 ('loan_amnt', 0.060110958507397394),
 ('open_acc', 0.053384613566271222),
 ('fico_range_low', 0.047557348738037561),
 ('emp_length', 0.039863374226105551),
 ('delinq_2yrs', 0.015551206065798472),
 ('C(purpose)[T.debt_consolidation]', 0.009383723913835882),
 ('C(purpose)[T.credit_card]', 0.0062504135961279151),
 ('C(home_ownership)[T.MORTGAGE]', 0.0053837217965769235),
 ('C(home_ownership)[T.RENT]', 0.005228112240369732),
 ('C(home_ownership)[T.OWN]', 0.0046024905060849941),
 ('C(purpose)[T.other]', 0.0043886457454209967),
 ('C(purpose)[T.home_improvement]', 0.0039667493153114006),
 ('C(sub_grade)[T.C5]', 0.0035194216441997639),
 ('C(sub_grade)

# Logistic Regression

In [63]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression fitted with the stochastic 'sag' solver.
# random_state pins the solver's data shuffling so the fit is reproducible.
# (The redundant re-import of numpy was dropped; it is imported in the first cell.)
# NOTE(review): the recorded run printed "max_iter reached", i.e. the solver
# stopped after 20 iterations without converging. The sklearn docs say sag only
# converges quickly on features of similar scale -- consider standardising X
# and/or raising max_iter before trusting this score.
# NOTE(review): this rebinds `model`; re-running the feature-importance cell
# afterwards would see this estimator instead of the random forest.
model = LogisticRegression(
    penalty='l2',
    verbose=1,
    max_iter=20,
    solver='sag',
    n_jobs=-1,
    random_state=51,
)
fit = model.fit(X_train, y_train)

# Accuracy on the training rows.
model.score(X_train, y_train)

max_iter reached after 6 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.2s finished


0.89261047710419328

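About the same result as the random forest (roughly 0.89 accuracy), although this score is on the training set while the random forest's comparable figure was measured on the test set. Next step: look at the hits, misses, and false hits for each model (i.e. a confusion matrix) rather than accuracy alone.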