In [92]:
import pandas as pd
from patsy import dmatrices
import datetime
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

In [93]:
loans = pd.read_pickle( 'C:\lc data\lc.pickle' )

In [94]:
loans.sub_grade = loans.sub_grade.astype('category')
loans.home_ownership = loans.home_ownership.astype('category')

In [95]:
newest = datetime.datetime.now() - datetime.timedelta( weeks=4*14 )
oldest = datetime.datetime(2010,3,1)
loans = loans[ (loans.issue_d > oldest) & (loans.issue_d < newest) ]

In [96]:
feature_cols = ['bad', 'loan_amnt','int_rate', 'installment', 'emp_length',
            'home_ownership', 'sub_grade', 'purpose', 'zip_code',
            'pub_rec', 'revol_bal', 'open_acc', 'delinq_2yrs',
            'fico_range_low', 'annual_inc']
loans=loans[feature_cols]

In [97]:
y, X = dmatrices('bad ~ + loan_amnt + int_rate + installment + emp_length +\
                 C(home_ownership)+ C(sub_grade) + C(purpose) + zip_code +\
                 pub_rec + revol_bal + open_acc + delinq_2yrs + fico_range_low + annual_inc',
                 loans, return_type='dataframe')

In [105]:
X.columns

Index(['Intercept', 'C(home_ownership)[T.MORTGAGE]',
       'C(home_ownership)[T.NONE]', 'C(home_ownership)[T.OTHER]',
       'C(home_ownership)[T.OWN]', 'C(home_ownership)[T.RENT]',
       'C(sub_grade)[T.A2]', 'C(sub_grade)[T.A3]', 'C(sub_grade)[T.A4]',
       'C(sub_grade)[T.A5]', 'C(sub_grade)[T.B1]', 'C(sub_grade)[T.B2]',
       'C(sub_grade)[T.B3]', 'C(sub_grade)[T.B4]', 'C(sub_grade)[T.B5]',
       'C(sub_grade)[T.C1]', 'C(sub_grade)[T.C2]', 'C(sub_grade)[T.C3]',
       'C(sub_grade)[T.C4]', 'C(sub_grade)[T.C5]', 'C(sub_grade)[T.D1]',
       'C(sub_grade)[T.D2]', 'C(sub_grade)[T.D3]', 'C(sub_grade)[T.D4]',
       'C(sub_grade)[T.D5]', 'C(sub_grade)[T.E1]', 'C(sub_grade)[T.E2]',
       'C(sub_grade)[T.E3]', 'C(sub_grade)[T.E4]', 'C(sub_grade)[T.E5]',
       'C(sub_grade)[T.F1]', 'C(sub_grade)[T.F2]', 'C(sub_grade)[T.F3]',
       'C(sub_grade)[T.F4]', 'C(sub_grade)[T.F5]', 'C(sub_grade)[T.G1]',
       'C(sub_grade)[T.G2]', 'C(sub_grade)[T.G3]', 'C(sub_grade)[T.G4]',
       'C(sub_

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=51)

# Random forest

In [99]:
from sklearn.ensemble import RandomForestClassifier
y_train = np.ravel(y_train)
model = RandomForestClassifier( n_jobs=-1, verbose=1, n_estimators=30)
fit = model.fit(X_train, y_train)
model.score(X_train, y_train)

[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   13.6s finished
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    1.1s finished


0.99693138730342179

In [100]:
model.score(X_test, y_test)

[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    0.7s finished


0.89170459918232636

In [101]:
y_predict = model.predict(X_train)
y_predict

[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    1.1s finished


array([ 0.,  1.,  0., ...,  0.,  0.,  0.])

In [102]:
features = sorted(list(zip(X.columns, model.feature_importances_)), key=lambda x: x[1],
      reverse=True)
features[:100]

[('revol_bal', 0.1196686870819821),
 ('zip_code', 0.11263956254936204),
 ('installment', 0.10810593046763932),
 ('annual_inc', 0.10366158885520955),
 ('loan_amnt', 0.090810798009025639),
 ('int_rate', 0.090014821567025297),
 ('open_acc', 0.082268941064315718),
 ('fico_range_low', 0.069214837912805124),
 ('emp_length', 0.057736593998183539),
 ('delinq_2yrs', 0.022915991453476236),
 ('pub_rec', 0.012968595799235826),
 ('C(purpose)[T.debt_consolidation]', 0.010789505856084264),
 ('C(home_ownership)[T.MORTGAGE]', 0.0066786897655230363),
 ('C(home_ownership)[T.RENT]', 0.0064402433749478072),
 ('C(purpose)[T.credit_card]', 0.006250520975099357),
 ('C(home_ownership)[T.OWN]', 0.0056178721457846997),
 ('C(purpose)[T.other]', 0.0053398511472332695),
 ('C(purpose)[T.home_improvement]', 0.0050770696977306866),
 ('C(sub_grade)[T.C5]', 0.0039338957063984456),
 ('C(sub_grade)[T.C3]', 0.0038312084622630954),
 ('C(sub_grade)[T.C4]', 0.0038181013425464322),
 ('C(sub_grade)[T.D1]', 0.0035726606295117575

In [104]:
false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(y_test, y_predict)
roc_auc = auc(false_positive_rate, true_positive_rate)

plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b',
label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

ValueError: Found arrays with inconsistent numbers of samples: [215734 323599]

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np
model = LogisticRegression(penalty='l2', verbose=1, max_iter=50, solver='sag', n_jobs=-1)
fit = model.fit(X_train, y_train)
model.score(X_train, y_train)

About the same result, need to see what the hits/misses/false hits were

In [None]:
false_positive_rate

In [None]:
true_positive_rate

In [None]:
thresholds