In [34]:
import pandas as pd
from patsy import dmatrices
import datetime
import numpy as np
from sklearn.cross_validation import train_test_split
import sklearn.metrics
import matplotlib.pyplot as plt

In [35]:
loans = pd.read_pickle( 'C:\lc data\lc.pickle' )

In [36]:
loans.sub_grade = loans.sub_grade.astype('category')
loans.home_ownership = loans.home_ownership.astype('category')
loans['installment_over_inc'] = loans.installment / loans.annual_inc

In [37]:
newest = datetime.datetime.now() - datetime.timedelta( weeks=4*14 )
oldest = datetime.datetime(2010,3,1)
loans = loans[ (loans.issue_d > oldest) & (loans.issue_d < newest) ]

In [38]:
loans.columns

Index(['member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term',
       'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'desc', 'purpose', 'title',
       'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'fico_range_low', 'fico_range_high', 'inq_last_6mths',
       'mths_since_last_delinq', 'mths_since_last_record', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'last_fico_range_high', 'last_fico_range_low',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'annual_inc_joint', 'dt

In [39]:
y, X = dmatrices('bad ~ + loan_amnt + int_rate + installment + emp_length +\
                 C(home_ownership)+ C(grade) + C(purpose) + zip_code +\
                 revol_bal + revol_util+ total_acc + open_acc + dti + delinq_2yrs + fico_range_low + annual_inc + installment_over_inc',
                 loans, return_type='dataframe')

In [40]:
X.columns

Index(['Intercept', 'C(home_ownership)[T.MORTGAGE]',
       'C(home_ownership)[T.NONE]', 'C(home_ownership)[T.OTHER]',
       'C(home_ownership)[T.OWN]', 'C(home_ownership)[T.RENT]',
       'C(grade)[T.B]', 'C(grade)[T.C]', 'C(grade)[T.D]', 'C(grade)[T.E]',
       'C(grade)[T.F]', 'C(grade)[T.G]', 'C(purpose)[T.credit_card]',
       'C(purpose)[T.debt_consolidation]', 'C(purpose)[T.educational]',
       'C(purpose)[T.home_improvement]', 'C(purpose)[T.house]',
       'C(purpose)[T.major_purchase]', 'C(purpose)[T.medical]',
       'C(purpose)[T.moving]', 'C(purpose)[T.other]',
       'C(purpose)[T.renewable_energy]', 'C(purpose)[T.small_business]',
       'C(purpose)[T.vacation]', 'C(purpose)[T.wedding]', 'loan_amnt',
       'int_rate', 'installment', 'emp_length', 'zip_code', 'revol_bal',
       'revol_util', 'total_acc', 'open_acc', 'dti', 'delinq_2yrs',
       'fico_range_low', 'annual_inc', 'installment_over_inc'],
      dtype='object')

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=51)

# Random forest

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [53]:
y_train = np.ravel(y_train)

In [178]:
clf_grid = RandomForestClassifier( n_jobs=-1 )
clf = RandomForestClassifier( n_jobs=-1, class_weight={1.0: .99, 0.0:.01})

### Default model

In [179]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight={0.0: 0.01, 1.0: 0.99},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [180]:
y_predict = clf.predict(X_test)
print(sklearn.metrics.classification_report(y_test, y_predict, digits=3))

             precision    recall  f1-score   support

        0.0      0.913     0.999     0.954    204074
        1.0      0.212     0.004     0.008     19503

avg / total      0.852     0.912     0.871    223577



### Grid Search

In [106]:
param_grid = {"max_depth": [3,9, None],
              "max_features": [10, 15, None],
              "bootstrap": [True, False],
}

# run grid search
grid_search = sklearn.grid_search.GridSearchCV(clf_grid, param_grid=param_grid)

In [107]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'bootstrap': [True, False], 'max_features': [10, 15, None], 'max_depth': [3, 9, None]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [108]:
grid_search.best_params_

{'bootstrap': True, 'max_depth': 3, 'max_features': 10}

In [109]:
grid_search.grid_scores_

[mean: 0.91367, std: 0.00000, params: {'bootstrap': True, 'max_features': 10, 'max_depth': 3},
 mean: 0.91367, std: 0.00000, params: {'bootstrap': True, 'max_features': 15, 'max_depth': 3},
 mean: 0.91367, std: 0.00000, params: {'bootstrap': True, 'max_features': None, 'max_depth': 3},
 mean: 0.91366, std: 0.00000, params: {'bootstrap': True, 'max_features': 10, 'max_depth': 9},
 mean: 0.91366, std: 0.00001, params: {'bootstrap': True, 'max_features': 15, 'max_depth': 9},
 mean: 0.91360, std: 0.00003, params: {'bootstrap': True, 'max_features': None, 'max_depth': 9},
 mean: 0.91186, std: 0.00004, params: {'bootstrap': True, 'max_features': 10, 'max_depth': None},
 mean: 0.91130, std: 0.00007, params: {'bootstrap': True, 'max_features': 15, 'max_depth': None},
 mean: 0.91027, std: 0.00013, params: {'bootstrap': True, 'max_features': None, 'max_depth': None},
 mean: 0.91367, std: 0.00000, params: {'bootstrap': False, 'max_features': 10, 'max_depth': 3},
 mean: 0.91367, std: 0.00000, para

In [110]:
y_predict = grid_search.predict(X_test)
print(sklearn.metrics.classification_report(y_test, y_predict))

             precision    recall  f1-score   support

        0.0       0.91      1.00      0.95    204074
        1.0       0.00      0.00      0.00     19503

avg / total       0.83      0.91      0.87    223577



  'precision', 'predicted', average, warn_for)


Random forest by itself didn't do well at all for finding bad loans

Then after using grid search, the model just predicts all 0's.  Predicting 0's is apparently safer than trying to mark bad loans. Maybe I need a way to not penalize false positives as much. It's better to have a loan that's more likely to be a safe one

Feature weights from the Random Forest Classifier

In [111]:
features = sorted(list(zip(X.columns, model.feature_importances_)), key=lambda x: x[1],
      reverse=True)
features[:100]

[('revol_bal', 0.085271510614202933),
 ('installment_over_inc', 0.084734552790032169),
 ('dti', 0.084438152246624557),
 ('revol_util', 0.081596574659161134),
 ('zip_code', 0.080482910476034467),
 ('int_rate', 0.079466822412024546),
 ('installment', 0.076943626964798645),
 ('annual_inc', 0.070355958700014237),
 ('total_acc', 0.065253107583653089),
 ('loan_amnt', 0.06249826916714768),
 ('open_acc', 0.055051927028193705),
 ('fico_range_low', 0.04959524328557275),
 ('emp_length', 0.039176557339598808),
 ('delinq_2yrs', 0.016949855398056091),
 ('C(purpose)[T.debt_consolidation]', 0.0086238885458722291),
 ('C(home_ownership)[T.RENT]', 0.0056941410301401965),
 ('C(purpose)[T.credit_card]', 0.0054413257929738206),
 ('C(home_ownership)[T.MORTGAGE]', 0.0053887980340395136),
 ('C(home_ownership)[T.OWN]', 0.0047349336431846915),
 ('C(purpose)[T.other]', 0.0042223437790892843),
 ('C(purpose)[T.home_improvement]', 0.0041733147176340069),
 ('C(grade)[T.E]', 0.003977486797407075),
 ('C(grade)[T.D]', 0

# Logistic Regression

In [63]:
from sklearn.linear_model import LogisticRegression
import numpy as np
model = LogisticRegression(penalty='l2', verbose=1, max_iter=20, solver='sag', n_jobs=-1)
fit = model.fit(X_train, y_train)
model.score(X_train, y_train)

max_iter reached after 6 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.2s finished


0.89261047710419328

About the same result, need to see what the hits/misses/false hits were