In [1]:
%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_pickle('cleaned_df.pkl')

## Preparation

## Categorical variables

In [5]:
from patsy import dmatrices

In [6]:
df.columns

Index(['total_pymnt', 'zip_code', 'member_id', 'id', 'loan_amnt', 'int_rate',
       'installment', 'emp_length', 'home_ownership', 'grade', 'sub_grade',
       'emp_title', 'issue_d', 'loan_status', 'annual_inc',
       'verification_status', 'purpose', 'addr_state', 'inq_last_6mths', 'dti',
       'revol_util', 'mths_since_last_delinq', 'pub_rec', 'revol_bal',
       'open_acc', 'collections_12_mths_ex_med', 'delinq_2yrs',
       'earliest_cr_line', 'fico_range_low', 'last_credit_pull_d',
       'ratio_inc_debt', 'ratio_inc_installment', 'ratio_mth_inc_all_payments',
       'year_issued', 'month_issued', 'delinq'],
      dtype='object')

In [7]:
y, X = dmatrices('delinq ~  + loan_amnt + int_rate + installment + emp_length +'
                 'C(home_ownership) + C(grade) + C(month_issued) + C(year_issued)'
                 '+ C(purpose) + C(addr_state) + inq_last_6mths + pub_rec + revol_bal +open_acc+'
                 'collections_12_mths_ex_med + delinq_2yrs + earliest_cr_line  + fico_range_low'
                 '+ ratio_mth_inc_all_payments + annual_inc',
                 df, return_type='dataframe')

In [8]:
X.columns

Index(['Intercept', 'C(home_ownership)[T.OTHER]', 'C(home_ownership)[T.OWN]',
       'C(home_ownership)[T.RENT]', 'C(grade)[T.B]', 'C(grade)[T.C]',
       'C(grade)[T.D]', 'C(grade)[T.E]', 'C(grade)[T.F]', 'C(grade)[T.G]',
       ...
       'inq_last_6mths', 'pub_rec', 'revol_bal', 'open_acc',
       'collections_12_mths_ex_med', 'delinq_2yrs', 'earliest_cr_line',
       'fico_range_low', 'ratio_mth_inc_all_payments', 'annual_inc'],
      dtype='object', length=104)

Earliest cr line to the year. Proabbly going to be quite good.

In [6]:
for i in X.columns:
    print(i,)

Intercept
C(home_ownership)[T.OTHER]
C(home_ownership)[T.OWN]
C(home_ownership)[T.RENT]
C(grade)[T.B]
C(grade)[T.C]
C(grade)[T.D]
C(grade)[T.E]
C(grade)[T.F]
C(grade)[T.G]
C(month_issued)[T.2]
C(month_issued)[T.3]
C(month_issued)[T.4]
C(month_issued)[T.5]
C(month_issued)[T.6]
C(month_issued)[T.7]
C(month_issued)[T.8]
C(month_issued)[T.9]
C(month_issued)[T.10]
C(month_issued)[T.11]
C(month_issued)[T.12]
C(year_issued)[T.2008]
C(year_issued)[T.2009]
C(year_issued)[T.2010]
C(year_issued)[T.2011]
C(year_issued)[T.2012]
C(year_issued)[T.2013]
C(year_issued)[T.2014]
C(purpose)[T.credit_card]
C(purpose)[T.debt_consolidation]
C(purpose)[T.educational]
C(purpose)[T.home_improvement]
C(purpose)[T.house]
C(purpose)[T.major_purchase]
C(purpose)[T.medical]
C(purpose)[T.moving]
C(purpose)[T.other]
C(purpose)[T.renewable_energy]
C(purpose)[T.small_business]
C(purpose)[T.vacation]
C(purpose)[T.wedding]
C(addr_state)[T.AL]
C(addr_state)[T.AR]
C(addr_state)[T.AZ]
C(addr_state)[T.CA]
C(addr_state)[T.CO]


In [154]:
y = np.ravel(y)

Create a small testset for testing purposes

In [None]:
y_s, X_s = y[-10000:], X[-10000:]

## A first model: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegressionCV

In [None]:
model_log = LogisticRegressionCV(cv=5, penalty='l2', verbose=1, max_iter=1000)

In [None]:
fit = model_log.fit(X_s, y_s)

In [None]:
predictions = model_log.predict(X_s)

In [None]:
predictions.mean()

In [None]:
model_log.score(X_s, y_s)

In [None]:
y_s.mean()

## Train_test_split

In [26]:
from sklearn.cross_validation import train_test_split

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,
                                                   random_state=42)

## A Second Model: 

In [152]:
from sklearn.ensemble import RandomForestClassifier

In [155]:
model_rf = RandomForestClassifier(n_estimators=50, oob_score=True, verbose=1, random_state=2143,
                                 min_samples_split=50)

In [156]:
rf_fit = model_rf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    1.6s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.3min finished


In [157]:
prediction = rf_fit.predict(X_test)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.9s finished


In [158]:
prob_pred = rf_fit.predict_proba(X_test)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.9s finished


In [159]:
default_probabilities = [item[1] for item in prob_pred]

In [160]:
default_probabilities

[0.21334090619574103,
 0.31338246733506114,
 0.16143843462913246,
 0.0696714603666133,
 0.15040547653121877,
 0.095839268644744902,
 0.065725037568162031,
 0.056240467103940005,
 0.066372089155982458,
 0.017576535541752935,
 0.12727276582261329,
 0.14005938194634787,
 0.13224133258633275,
 0.2438762039378847,
 0.16517943883280961,
 0.19408451248914468,
 0.070524107997422605,
 0.25862181798172834,
 0.109820218364894,
 0.034593708233587803,
 0.15910222632625193,
 0.19727084390684635,
 0.18200658208430806,
 0.037104097705004217,
 0.034422017963399096,
 0.17984749151618268,
 0.097086162581718316,
 0.061888964146416786,
 0.0207003798746151,
 0.089631639093707791,
 0.018746288613030418,
 0.12389849692985139,
 0.12042606713331613,
 0.076856857881564211,
 0.044833118971692482,
 0.042550621153838002,
 0.081105162314306117,
 0.16534131904903557,
 0.019595748291998793,
 0.10580131602719003,
 0.019088381934792707,
 0.080541378899786076,
 0.15989573114943703,
 0.024107139883736028,
 0.0867474207002

## Steps

1. Standarize the values
2. multiply

In [16]:
features = sorted(list(zip(X.columns, model.feature_importances_)), key=lambda x: x[1],
      reverse=True)

In [17]:
features

[('int_rate', 0.088512753792724025),
 ('ratio_mth_inc_all_payments', 0.07949531665712653),
 ('revol_bal', 0.07208255166629067),
 ('annual_inc', 0.06393896633165698),
 ('installment', 0.063232541091892358),
 ('loan_amnt', 0.051850760726925883),
 ('earliest_cr_line', 0.044686208514743891),
 ('fico_range_low', 0.04025584707748097),
 ('open_acc', 0.04023243688030604),
 ('emp_length', 0.02378139493003692),
 ('inq_last_6mths', 0.021711821237300553),
 ('C(year_issued)[T.2014]', 0.020606316079308757),
 ('delinq_2yrs', 0.014397907621284227),
 ('C(year_issued)[T.2012]', 0.011002944707140101),
 ('C(grade)[T.E]', 0.0096943707114501496),
 ('pub_rec', 0.0092265747870874798),
 ('C(grade)[T.F]', 0.0077977713180124334),
 ('C(purpose)[T.small_business]', 0.007553897350545279),
 ('C(grade)[T.D]', 0.0072375660472647181),
 ('C(year_issued)[T.2013]', 0.0066411741853656843),
 ('C(grade)[T.B]', 0.00653594083270453),
 ('C(addr_state)[T.FL]', 0.0065131744696247437),
 ('C(home_ownership)[T.RENT]', 0.006444352766

In [161]:
features = sorted(list(zip(X.columns, rf_fit.feature_importances_)), key=lambda x: x[1],
      reverse=True)

In [162]:
values = [feature[1] for feature in features]

In [163]:
standardized_values =(values - np.mean(values)) / np.std(values)

In [164]:
base_hundred = (100/  standardized_values[0]) * standardized_values

In [166]:
hundred = list(zip([feature[0] for feature in features], base_hundred))

In [167]:
importand_featuers = hundred[:10]

In [168]:
importand_featuers

[('int_rate', 100.0),
 ('ratio_mth_inc_all_payments', 88.570674498247485),
 ('revol_bal', 79.175221813159808),
 ('annual_inc', 68.853476716274301),
 ('installment', 67.958104351985725),
 ('loan_amnt', 53.532046191055947),
 ('earliest_cr_line', 44.451195603911444),
 ('fico_range_low', 38.83584811709639),
 ('open_acc', 38.806176408877185),
 ('emp_length', 17.954984383333546)]

In [17]:
prob_pred.mean()

0.5

In [54]:
rf_fit.score(X_test, y_test)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    1.2s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.2s finished


0.89873045078196867

In [None]:
rf_full = model_rf.fit(X, y)

In [None]:
prediction = rf_full.predict(X)

In [9]:
tuples = list(zip(y, rf_fit.predict_proba(X)))

NameError: name 'rf_fit' is not defined

In [97]:
tuples

[(0.0, array([ 0.89656486,  0.10343514])),
 (1.0, array([ 0.66304684,  0.33695316])),
 (0.0, array([ 0.73805755,  0.26194245])),
 (0.0, array([ 0.83802716,  0.16197284])),
 (0.0, array([ 0.9145468,  0.0854532])),
 (0.0, array([ 0.87662657,  0.12337343])),
 (0.0, array([ 0.818842,  0.181158])),
 (0.0, array([ 0.78879857,  0.21120143])),
 (1.0, array([ 0.62657776,  0.37342224])),
 (1.0, array([ 0.66830516,  0.33169484])),
 (0.0, array([ 0.84829552,  0.15170448])),
 (0.0, array([ 0.90303472,  0.09696528])),
 (1.0, array([ 0.69287064,  0.30712936])),
 (0.0, array([ 0.80015427,  0.19984573])),
 (1.0, array([ 0.84500983,  0.15499017])),
 (0.0, array([ 0.69640697,  0.30359303])),
 (0.0, array([ 0.751328,  0.248672])),
 (0.0, array([ 0.95842638,  0.04157362])),
 (0.0, array([ 0.92821433,  0.07178567])),
 (0.0, array([ 0.96056859,  0.03943141])),
 (0.0, array([ 0.77836524,  0.22163476])),
 (1.0, array([ 0.83414748,  0.16585252])),
 (0.0, array([ 0.92152484,  0.07847516])),
 (0.0, array([ 0.8857

In [None]:
rf_full.score(X, y)

## Support Vector Machines

In [55]:
from sklearn import svm

In [56]:
model = svm.SVC(random_state=4134)

In [None]:
svm_fit = model.fit(X_test, y_test)

# Simple Random Forests

In [13]:
y, X = dmatrices('delinq ~ fico_range_low + pub_rec + revol_util +'
                'C(home_ownership)')

PatsyError: Error evaluating factor: NameError: name 'delinq' is not defined
    delinq ~ fico_range_low + pub_rec + revol_util +C(home_ownership)
    ^^^^^^


# Finding the average pay back in case of default

In [80]:
df.loan_status.value_counts()

Current               191303
Fully Paid            144478
Charged Off            33166
Late (31-120 days)      5748
In Grace Period         2872
Late (16-30 days)       1087
Default                  134
dtype: int64

In [84]:
df.columns

Index(['total_pymnt', 'zip_code', 'member_id', 'id', 'loan_amnt', 'int_rate',
       'installment', 'emp_length', 'home_ownership', 'grade', 'sub_grade',
       'emp_title', 'issue_d', 'loan_status', 'annual_inc',
       'verification_status', 'purpose', 'addr_state', 'inq_last_6mths', 'dti',
       'revol_util', 'mths_since_last_delinq', 'pub_rec', 'revol_bal',
       'open_acc', 'collections_12_mths_ex_med', 'delinq_2yrs',
       'earliest_cr_line', 'fico_range_low', 'last_credit_pull_d',
       'ratio_inc_debt', 'ratio_inc_installment', 'ratio_mth_inc_all_payments',
       'year_issued', 'month_issued', 'delinq'],
      dtype='object')

In [90]:
default_df = df.assign(percentage_paid= df.total_pymnt/df.loan_amnt
                      )[df.loan_status=='Charged Off']

In [92]:
default_df.percentage_paid.mean()

0.45161542363848606

# Dumping the model to pickle

In [21]:
df.home_ownership.value_counts()

MORTGAGE    192666
RENT        153173
OWN          32760
OTHER          189
dtype: int64

In [18]:
df.int_rate

0         0.1065
1         0.1527
2         0.1596
3         0.1349
4         0.1269
5         0.0790
6         0.1596
7         0.1864
8         0.2128
9         0.1269
10        0.1465
11        0.1269
12        0.1349
13        0.0991
14        0.1065
15        0.1629
16        0.1527
17        0.0603
18        0.1171
19        0.0603
20        0.1527
21        0.1242
22        0.1171
23        0.1171
24        0.1171
25        0.0991
26        0.1427
27        0.1677
28        0.1171
29        0.1171
           ...  
235599    0.1825
235600    0.1353
235601    0.1699
235602    0.0967
235603    0.2148
235604    0.1561
235605    0.1353
235606    0.1624
235607    0.1398
235608    0.1757
235609    0.0790
235610    0.0890
235611    0.1285
235612    0.0790
235613    0.2240
235614    0.1624
235615    0.2290
235616    0.1099
235617    0.2148
235618    0.1199
235619    0.1825
235620    0.1561
235621    0.0790
235622    0.1825
235623    0.1353
235624    0.1447
235625    0.1997
235626    0.07

In [173]:
rf.oob_score_

0.8977196349078016

In [76]:
from sklearn.externals import joblib
joblib.dump(rf_fit, 'model/rf_model.pkl');

In [14]:
rf = joblib.load('model/rf_model.pkl')

In [77]:
model = joblib.load('model/rf_model.pkl')

In [None]:
zip()

# Investing only in C, D, E ,G loans with lower default probabilities

In [178]:
X.columns

Index(['Intercept', 'C(home_ownership)[T.OTHER]', 'C(home_ownership)[T.OWN]',
       'C(home_ownership)[T.RENT]', 'C(grade)[T.B]', 'C(grade)[T.C]',
       'C(grade)[T.D]', 'C(grade)[T.E]', 'C(grade)[T.F]', 'C(grade)[T.G]',
       ...
       'inq_last_6mths', 'pub_rec', 'revol_bal', 'open_acc',
       'collections_12_mths_ex_med', 'delinq_2yrs', 'earliest_cr_line',
       'fico_range_low', 'ratio_mth_inc_all_payments', 'annual_inc'],
      dtype='object', length=104)

In [25]:
from collections import namedtuple
int_return = namedtuple('interest_rate', 'avg_return')

In [None]:
C = [(0.15.22, 0.0884),(0.1817, 0.09), (0.2196, 0.0961), (23.47, 8.97)]

In [None]:
df['roi']

# Predicting the test set


In [29]:
pred = model.predict_proba(X_test)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.7s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.7s finished


In [31]:
default_rates = [i[1] for i in pred]

In [34]:
zip(y_test, default_rates)

[('delinq', 0.21334090619574103)]

In [46]:
y_values = [int(i) for i in y_test.values]

In [48]:
pred_prob = list(zip(y_values, default_rates))

In [50]:
df_risky = df[df.grade.isin(['C', 'D', 'E'])]

In [55]:
df_risky

Unnamed: 0,total_pymnt,zip_code,member_id,id,loan_amnt,int_rate,installment,emp_length,home_ownership,grade,...,delinq_2yrs,earliest_cr_line,fico_range_low,last_credit_pull_d,ratio_inc_debt,ratio_inc_installment,ratio_mth_inc_all_payments,year_issued,month_issued,delinq
1,1008.710000,309,1314167,1077430,2500,0.1527,59.83,0,RENT,C,...,0,55,740,2013-09-28,0.139567,0.023932,0.037428,2011,12,1
2,3003.653644,606,1313524,1077175,2400,0.1596,84.33,20,RENT,C,...,0,57,735,2015-09-28,0.437153,0.082595,0.140500,2011,12,0
3,12226.302212,917,1277178,1076863,10000,0.1349,339.31,20,RENT,C,...,0,52,690,2015-01-28,0.317033,0.082759,0.110066,2011,12,0
6,7456.520000,280,1304742,1069639,7000,0.1596,170.08,8,RENT,C,...,0,61,690,2015-09-28,0.526040,0.043421,0.133929,2011,12,0
7,3938.144334,900,1288686,1072053,3000,0.1864,109.43,9,RENT,E,...,0,63,660,2014-12-28,0.233771,0.027358,0.068463,2011,12,0
10,7677.520000,853,1305201,1070078,6500,0.1465,153.45,5,OWN,C,...,0,54,695,2013-07-28,0.146278,0.025575,0.039015,2011,12,0
12,2270.700000,245,1298717,1064687,9000,0.1349,305.38,0,RENT,C,...,0,60,710,2012-11-28,0.648400,0.122152,0.205768,2011,12,1
15,1270.171106,641,1304871,1069759,1000,0.1629,35.31,0,RENT,D,...,0,63,665,2014-12-28,0.268714,0.015133,0.071053,2011,12,0
16,12519.260450,921,1299699,1065775,10000,0.1527,347.98,4,RENT,C,...,0,54,670,2015-04-28,0.810548,0.099423,0.236811,2011,12,0
20,27663.042671,770,1284848,1069740,20250,0.1527,484.63,3,RENT,C,...,0,53,725,2015-09-28,0.877634,0.134092,0.232665,2011,12,0


In [99]:
from scripts.model import create_matrix
y, X = create_matrix(df_risky)

In [112]:
X = X.drop('default_chance', axis=1)

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [117]:
pred = model.predict_proba(X_test)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.6s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.6s finished


In [118]:
default_prob = [i[1] for i in pred]

In [125]:
X_test.default_chance = default_prob

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [124]:
X_test.target = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [127]:
sorted_X = X_test.sort_values('default_chance')

In [128]:
len(sorted_X)

36466

In [136]:
total_default_rate = X_test.target.mean()

In [143]:
top_20_percent = sorted_X[:len(sorted_X) // 5]

In [152]:
top_20_percent.target.value_counts()

0    7249
1      44
Name: target, dtype: int64

In [144]:
top_20_mean = top_20_percent.target.mean()

Eat up half the returns so halving the default rate corresponds to an increase from 9% ROI to 13.5% according to the test set.

In [141]:
total_default_rate

0.12688531783030768

In [142]:
default_rates

[0.21334090619574103,
 0.31338246733506114,
 0.16143843462913246,
 0.0696714603666133,
 0.15040547653121877,
 0.095839268644744902,
 0.065725037568162031,
 0.056240467103940005,
 0.066372089155982458,
 0.017576535541752935,
 0.12727276582261329,
 0.14005938194634787,
 0.13224133258633275,
 0.2438762039378847,
 0.16517943883280961,
 0.19408451248914468,
 0.070524107997422605,
 0.25862181798172834,
 0.109820218364894,
 0.034593708233587803,
 0.15910222632625193,
 0.19727084390684635,
 0.18200658208430806,
 0.037104097705004217,
 0.034422017963399096,
 0.17984749151618268,
 0.097086162581718316,
 0.061888964146416786,
 0.0207003798746151,
 0.089631639093707791,
 0.018746288613030418,
 0.12389849692985139,
 0.12042606713331613,
 0.076856857881564211,
 0.044833118971692482,
 0.042550621153838002,
 0.081105162314306117,
 0.16534131904903557,
 0.019595748291998793,
 0.10580131602719003,
 0.019088381934792707,
 0.080541378899786076,
 0.15989573114943703,
 0.024107139883736028,
 0.0867474207002

In [149]:
type(top_20_percent)

pandas.core.frame.DataFrame

In [150]:
top_20_percent

Unnamed: 0,Intercept,C(home_ownership)[T.OTHER],C(home_ownership)[T.OWN],C(home_ownership)[T.RENT],C(grade)[T.B],C(grade)[T.C],C(grade)[T.D],C(grade)[T.E],C(grade)[T.F],C(grade)[T.G],...,revol_bal,open_acc,collections_12_mths_ex_med,delinq_2yrs,earliest_cr_line,fico_range_low,ratio_mth_inc_all_payments,annual_inc,default_chance,target
148235,1,0,0,0,0,1,0,0,0,0,...,27452,13,0,0,52,690,0.100667,142000.0,0.005793,0
110071,1,0,0,0,0,1,0,0,0,0,...,38305,14,0,0,38,710,0.106553,179000.0,0.009566,0
83905,1,0,0,0,0,1,0,0,0,0,...,33268,15,0,0,46,715,0.211185,74000.0,0.010413,0
207401,1,0,0,0,0,1,0,0,0,0,...,17687,15,0,0,46,715,0.072727,154693.0,0.012828,0
97848,1,0,0,0,0,1,0,0,0,0,...,26247,6,0,0,50,675,0.113598,130000.0,0.013003,0
150275,1,0,0,0,0,1,0,0,0,0,...,5368,9,0,1,48,710,0.045789,104250.0,0.013247,0
202457,1,0,0,0,0,1,0,0,0,0,...,36982,12,0,0,49,720,0.128702,140000.0,0.013653,0
116990,1,0,0,0,0,1,0,0,0,0,...,23477,8,0,0,47,660,0.090354,125000.0,0.015293,0
168871,1,0,0,0,0,1,0,0,0,0,...,79207,15,0,0,46,720,0.248128,135000.0,0.015567,0
180527,1,0,0,1,0,1,0,0,0,0,...,2193,5,0,0,52,665,0.030679,84000.0,0.016602,0
