In [1]:
#import libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline, Pipeline
from mlxtend.evaluate import plot_learning_curves
from dateutil.relativedelta import relativedelta


%matplotlib inline


In [2]:
'''Helper functions'''
def evaluate_model(model, train_data, target_train):
    #Have to look at new docs for StratifiedKFold and decide if worth adapting
    mean_recall = cross_val_score(model, train_data, target_train, cv = 5, scoring = 'recall').mean()
    mean_roc_auc = cross_val_score(model, train_data, target_train, cv = 5, scoring = 'roc_auc').mean()
    print "Mean recall, cross-val, is:", mean_recall
    print "Mean roc_auc, cross-val, is:", mean_roc_auc
    return mean_recall, mean_roc_auc

def eval_test_data(model, train_data, target_train, test_data, target_test):
    model.fit(train_data, target_train)
    y_pred = model.predict(test_data)
    cm = confusion_matrix(target_test, y_pred)
    cr = classification_report(target_test, y_pred)
    test_roc_auc = roc_auc_score(target_test, model.predict_proba(test_data)[:,1])
    test_recall = recall_score(target_test, y_pred)
    pred_default_rate = sum(cm[:,1])/float(sum(sum(cm)))
    act_default_rate = sum(cm[1])/float(sum(sum(cm)))
    observed_default_rate = float(cm[1, 0])/sum(cm[:, 0])
    print 'Confusion Matrix'
    print cm
    print cr
    print "Predicted Default Rate:", pred_default_rate
    print "Actual Default Rate for the Baseline:", act_default_rate
    print "Observed Default Rate for only lending to those you think are good credits:", observed_default_rate
    print "ROC_AUC from test data is:", test_roc_auc
    print "Recall from test data is:", test_recall
    return test_roc_auc, test_recall, pred_default_rate, act_default_rate, observed_default_rate

'''End helper functions'''

'End helper functions'

**For this analysis, I aim to:**

1) Separate on-time loans from late and defaulted loans (both repaid and pending). In reg filings, there appear to be a number of Pending loans that are current despite past their maturity dates.

2) Do the same as #1, but with the loans with ARV details too.

**First, I want to visualize the loans by tranche.**

1) To keep things simple, we'll visualize 2015 loans and 2016 loans. Needless to say, 2016 loans should have fewer Pendings.

2) For analysis above, might have to bunch the tranches together for meaning, but would prefer to analyze 2015 and 2016 separately.

In [3]:
#Reading in data
groundfloor_v2 = pd.read_csv('groundfloor_data_mar1_processed_ml.csv', index_col = "Unnamed: 0")

In [4]:
groundfloor_v2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193 entries, 0 to 192
Data columns (total 43 columns):
address                             193 non-null object
grade                               193 non-null object
int_rate                            193 non-null float64
link_loan_details                   193 non-null object
ltv                                 193 non-null float64
term_mo                             193 non-null float64
full_address                        193 non-null object
zipcode                             193 non-null object
borrower_company                    193 non-null object
borrower_principal                  193 non-null object
loan_position                       193 non-null object
loan_amount                         193 non-null float64
loan_status                         193 non-null object
funded_date                         190 non-null object
inception_date                      190 non-null object
repaid_date                         190 non-null ob

In [None]:
#Getting Inception Date into datetime
get_year = lambda x: x.year
groundfloor_v2['inception_date'] = pd.to_datetime(groundfloor_v2.inception_date, infer_datetime_format = True)
groundfloor_v2['maturity_date'] = pd.to_datetime(groundfloor_v2.maturity_date, infer_datetime_format = True)


In [None]:
for i in range(len(groundfloor_v2['repaid_date'])):
    if groundfloor_v2.repaid_date[i] != 'Pending':
        groundfloor_v2.set_value(i, 'repaid_date', pd.to_datetime(groundfloor_v2.repaid_date[i], infer_datetime_format = True))


In [None]:
#month also
get_month = lambda x: x.month

In [None]:
groundfloor_v2 = groundfloor_v2.assign(loan_inception_year = groundfloor_v2.inception_date.apply(get_year))
groundfloor_v2 = groundfloor_v2.assign(loan_inception_month = groundfloor_v2.inception_date.apply(get_month))

In [None]:
#LOans in 2015 and 2016
print pd.pivot_table(groundfloor_v2, index = 'loan_inception_year', values = ['loan_amount', 'int_rate'], aggfunc = [np.mean, np.sum, len])

'''Note: Given the small number of loans in 2015, could be less meaningful for analysis. Group the 2014 and 2015 together?'''

In [None]:
#Create indicator var for Repaid (on time), Late (but repaid), and Pending (and not paid, which is worst).
#Will apply to loans with maturity date <= Mar 3, 2017

print "Loans with maturity before March 3, 2017:", len(groundfloor_v2[groundfloor_v2.maturity_date < pd.to_datetime('2017-03-03')].index)
print "Loans with maturity after March 3, 2017:",len(groundfloor_v2[groundfloor_v2.maturity_date >= pd.to_datetime('2017-03-03')].index)

for i in groundfloor_v2[groundfloor_v2.maturity_date < pd.to_datetime('2017-03-03')].index:
    if groundfloor_v2.repaid_date[i] == 'Pending':
        groundfloor_v2.set_value(i, 'repaid_status', '2 - Pending')
    elif groundfloor_v2.repaid_date[i] <= groundfloor_v2.maturity_date[i]:
        groundfloor_v2.set_value(i, 'repaid_status', '0 - Repaid_On_Time')
    elif groundfloor_v2.repaid_date[i] > groundfloor_v2.maturity_date[i]:
        groundfloor_v2.set_value(i, 'repaid_status', '1 - Late')
    else:
        groundfloor_v2.set_value(i, 'repaid_status', None)

for i in groundfloor_v2[groundfloor_v2.maturity_date >= pd.to_datetime('2017-03-03')].index:
    if groundfloor_v2.repaid_date[i] == 'Pending':
        groundfloor_v2.set_value(i, 'repaid_status', 'Current')
    else:
        groundfloor_v2.set_value(i, 'repaid_status', '0 - Repaid_On_Time')

In [None]:
#The one loan that had a principal writedown.
print groundfloor_v2.loan_inception_year[groundfloor_v2.address == '174 Timothy Drive']

#Looking for the one that paid less than full interest but still paid principal. Index 167 is my best guess based on this: http://blog.groundfloor.us/groundfloorblog/post-mortem-of-a-problem-loan
groundfloor_v2[['grade', 'address', 'loan_amount', 'inception_date', 'funded_date', 'maturity_date']][groundfloor_v2.loan_amount == 40000]

In [None]:
print "Loan Tranches and Repaid Status"
print groundfloor_v2.groupby(['loan_inception_year']).repaid_status.value_counts()
print""
print "Loan Tranches and Repaid Status, Percentages"
print groundfloor_v2.groupby(['loan_inception_year']).repaid_status.value_counts(True)
print ""
print "Amount Loaned out Broken down by Loan Tranche and Repaid Status"
print groundfloor_v2.groupby(['loan_inception_year', 'repaid_status']).loan_amount.sum()

'''Worth noting how in 2014, 41% of loans were paid but paid late. 

And worth noting that 2015 includes one loan that paid less than full interest, and one loan with a principal default. 
Both are in the Late bucket.

Kind of funny, and annoying, how Groundfloor touts success on the loans that are repaid.

And weird that my stats do not match theirs for repaid loans in terms of timeliness. The stats below do not show 82% of 
repaid loans having paid before or on maturity.'''

In [None]:
#Creating repaid status ind for loans not current. And then for repaid vs late/default for binary classification.
for i in groundfloor_v2.repaid_status.index:
    if groundfloor_v2.repaid_status[i] == 'Current':
        groundfloor_v2.set_value(i, 'repaid_multiclass', None)
    elif groundfloor_v2.repaid_status[i] == '0 - Repaid_On_Time':
        groundfloor_v2.set_value(i, 'repaid_multiclass', 0)
    elif groundfloor_v2.repaid_status[i] == '1 - Late':
        groundfloor_v2.set_value(i, 'repaid_multiclass', 1)
    elif groundfloor_v2.repaid_status[i] == '2 - Pending':
        groundfloor_v2.set_value(i, 'repaid_multiclass', 2)
    else:
        groundfloor_v2.set_value(i, 'repaid_multiclass', None)




In [None]:
for i in groundfloor_v2.repaid_multiclass.index:
    if np.isnan(groundfloor_v2.repaid_multiclass[i]) == True:
        groundfloor_v2.set_value(i, 'repaid_binary', None)
    elif groundfloor_v2.repaid_multiclass[i] == 0:
        groundfloor_v2.set_value(i, 'repaid_binary', 0)
    else:
        groundfloor_v2.set_value(i, 'repaid_binary', 1)

<h2>Binary Classification of 2015 Tranche </h2>

I risk having models that are not useful with only the 2015 tranche, which has 38 loans. Nevertheless, I will:

1) Classify the 38;

2) Combine with 2014 loans and classify;

If after I feel that this was not useful, I will combine 2014 through 16 and attempt binary classification.

In [None]:
#There are not enough F or G loans for machine learning. There may not be enough E loans for ML unless using 
#2014 and 2015 together. Seems that we will want to focus on A through D loans for ML.
print groundfloor_v2.groupby('loan_inception_year').grade.value_counts()
print groundfloor_v2.groupby('loan_inception_year').encoded_grade.value_counts()

In [None]:
X = groundfloor_v2[['int_rate', 'ltv',
       'term_mo', 'encoded_loan_position', 'loan_amount',
        'investors',
        'encoded_grade', 'purpose_Acquisition & Renovation',
        'purpose_New Construction', 'purpose_Refinance',
        'purpose_Renovation']][groundfloor_v2['repaid_binary'].notnull() & groundfloor_v2['encoded_grade']<4][groundfloor_v2.loan_inception_year == 2015]

In [None]:
#Creating my train test split


target = groundfloor_v2['repaid_binary'][groundfloor_v2['repaid_binary'].notnull() & groundfloor_v2['encoded_grade']<4][groundfloor_v2.loan_inception_year == 2015]


X_train, X_test, y_train, y_test = train_test_split(X, target, test_size = .20, stratify = target, random_state=31)


In [None]:
X.info()

In [None]:
scores_prelim = {}

#Preliminary models - will go with what has best recall or AUC for optimization of hyperparameters
pipe_logreg_l2 = make_pipeline(MinMaxScaler(), LogisticRegression(class_weight = 'balanced', random_state = 31))
pipe_svc = make_pipeline(MinMaxScaler(), SVC(probability = True, class_weight = 'balanced', random_state = 31))
dt = DecisionTreeClassifier(class_weight = 'balanced', random_state = 31)
dtbag = BaggingClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31), random_state = 31, n_estimators = 50)
rf = RandomForestClassifier(random_state = 31, class_weight = 'balanced', n_estimators = 20)
et = ExtraTreesClassifier(random_state = 31, class_weight = 'balanced')
adaboost = AdaBoostClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31), random_state = 31)
adaboost_point1 = AdaBoostClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31), random_state = 31, learning_rate = .1)
adaboost_lr_point1 = AdaBoostClassifier(LogisticRegression(class_weight = 'balanced', random_state = 31), random_state = 31, learning_rate = .1)
adaboost_lr = AdaBoostClassifier(LogisticRegression(class_weight = 'balanced', random_state = 31), random_state = 31, learning_rate = 1)
gradient_dev = GradientBoostingClassifier(loss = 'deviance', learning_rate = .1, verbose = False)
gradient_exp = GradientBoostingClassifier(loss = 'exponential', learning_rate = .1, verbose = False)
gradient_exp_1 = GradientBoostingClassifier(loss = 'exponential', learning_rate = 1, verbose = False)



In [None]:

models = [pipe_logreg_l2, pipe_svc, dt, dtbag, rf, et, adaboost, adaboost_point1, adaboost_lr, adaboost_lr_point1, gradient_dev, gradient_exp, gradient_exp_1]
model_names = ['Logistic Regression', 'SVC', 'DT', 'Bagged DT', 'Random Forest', 'Extra Trees', 'AdaBoost', 'AdaBoost 0.1', 'AdaBoost LR', 'AdaBoost LR 0.1','Gradient Deviance', 'Gradient Exp', 'Gradient_Exp_1']

#Fix or just do a damn simple model....
for model, model_name in zip(models, model_names):
    print ""
    print model_name + " Output for Groundfloor Loans from 2015 Tranche"
    print ""
    scores_prelim[model_name] = evaluate_model(model, X_train, y_train)


**The ones worth exploring are  1)  DT and RF and ET; 2) Gradient Dev Exp.; 3) Adaboost with tree base; and 4) LR And SVC**

Recall and ROC seem high


In [None]:
#DT


dt_params = {'max_depth': [1,2,3,4,5,6,7,8,9,10], 'min_samples_leaf': [2,4,6], \
'min_samples_split': [2,4,6,8,10], 'criterion': ['gini', 'entropy'], 'splitter': ['random', 'best']}

gs_dtopt = GridSearchCV(dt, dt_params, scoring = 'recall', n_jobs = -1, cv=5, verbose = True)
gs_dtopt.fit(X_train, y_train)
print "Best recall score:", gs_dtopt.best_score_

print "Best params:", gs_dtopt.best_params_
scores_prelim['gs_dtopt'] = evaluate_model(gs_dtopt.best_estimator_, X_train, y_train)
#Interesting - the term of the loan, the ltv, and # of investors, but not the grade...!
pd.DataFrame(X_train.columns.values, gs_dtopt.best_estimator_.feature_importances_)


In [None]:
#Bagged DT with opt dt params
dtbag_optdt = BaggingClassifier(DecisionTreeClassifier(class_weight = 'balanced', criterion = 'entropy', splitter = 'best', random_state = 31, max_depth = 2, min_samples_split = 2, min_samples_leaf = 4), random_state = 31)
bagging_params = {'n_estimators': [10, 20, 50, 100, 150, 200], 'bootstrap_features': [True, False]}
gs_dtbag = GridSearchCV(dtbag_optdt, bagging_params, cv = 5, verbose = True, scoring = 'recall', n_jobs = -1)
gs_dtbag.fit(X_train, y_train)
print "Best recall score:", gs_dtbag.best_score_
print "Best params:", gs_dtbag.best_params_

scores_prelim['gs_dtbag'] = evaluate_model(gs_dtbag.best_estimator_, X_train, y_train)
#Good recall and similar ROC AUC to Lending Club

In [None]:
#Bagged DT on own
dtbag_optdt_unopt = BaggingClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31), random_state = 31)
bagging_params = {'n_estimators': [10, 20, 50, 100, 150, 200], 'bootstrap_features': [True, False]}
gs_dtbag_unopt = GridSearchCV(dtbag_optdt_unopt, bagging_params, cv = 5, verbose = True, scoring = 'recall', n_jobs = -1)
gs_dtbag_unopt.fit(X_train, y_train)
print "Best recall score:", gs_dtbag_unopt.best_score_
print "Best params:", gs_dtbag_unopt.best_params_

scores_prelim['gs_dtbag_unopt'] = evaluate_model(gs_dtbag_unopt.best_estimator_, X_train, y_train)
#Better with optimized DT?

In [None]:
#Random Forest-optdt
rf_optdt = RandomForestClassifier(random_state = 31, class_weight = 'balanced', n_estimators = 20, min_samples_leaf = 4, min_samples_split = 2, max_depth = 2)
random_params = {'criterion': ['gini', 'entropy'], 'max_features': ['auto', 'sqrt', 'log2', None], 'n_estimators': [100, 200, 300]}
gs_rf_optdt = GridSearchCV(rf_optdt, random_params, cv = 5, verbose = True, scoring = 'recall', n_jobs = -1)
gs_rf_optdt.fit(X_train, y_train)
print 'Best Recall Score:', gs_rf_optdt.best_score_
print "Best Params:", gs_rf_optdt.best_params_

#Not bad compared to before...
scores_prelim['gs_rf_optdt'] = evaluate_model(gs_rf_optdt.best_estimator_, X_train, y_train)


In [None]:
#Random Forest as is.
gs_rf = GridSearchCV(rf, random_params, cv = 5, verbose = True, n_jobs = -1)
gs_rf.fit(X_train, y_train)

print 'Best Recall Score:', gs_rf.best_score_
print "Best Params:", gs_rf.best_params_
np.mean(cross_val_score(gs_rf.best_estimator_, X_train, y_train, cv=5, scoring = 'recall'))

#RF as is is better than DT Bag ...
scores_prelim['gs_rf'] = evaluate_model(gs_rf.best_estimator_, X_train, y_train)

In [None]:
#Extra Trees with opt dt
et_opt = ExtraTreesClassifier(random_state = 31, class_weight = 'balanced', max_depth = 2, min_samples_split = 2, min_samples_leaf = 4)
et_params = {'criterion': ['gini', 'entropy'], 'max_features': ['auto', 'sqrt', 'log2', None], 'n_estimators': [100, 200, 300]}

gs_et_opt = GridSearchCV(et_opt, et_params, cv = 5, verbose = True, n_jobs = -1)
gs_et_opt.fit(X_train, y_train)

print 'Best Recall Score:', gs_et_opt.best_score_
print "Best Params:", gs_et_opt.best_params_




In [None]:
scores_prelim['gs_et_opt'] = evaluate_model(gs_et_opt.best_estimator_, X_train, y_train)

In [None]:
#Extra Trees without opt dt
et = ExtraTreesClassifier(random_state = 31, class_weight = 'balanced')
et_params = {'criterion': ['gini', 'entropy'], 'max_features': ['auto', 'sqrt', 'log2', None], 'n_estimators': [100, 200, 300]}

gs_et = GridSearchCV(et, et_params, cv = 5, verbose = True, n_jobs = -1)
gs_et.fit(X_train, y_train)

print 'Best Recall Score:', gs_et.best_score_
print "Best Params:", gs_et.best_params_


#RF as is is better than DT Bag ...
scores_prelim['gs_et'] = evaluate_model(gs_et.best_estimator_, X_train, y_train)

In [None]:
#Adaboost
adaboost = AdaBoostClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31), random_state = 31)
adaboost_params = {'n_estimators': [10,20,30,50,100], 'learning_rate': [.1,.3,.5,1]}
gs_adaboost = GridSearchCV(adaboost, adaboost_params, cv = 5, verbose = True, n_jobs = -1)
gs_adaboost.fit(X_train, y_train)

print "Best recall score:", gs_adaboost.best_score_
print "Best params:", gs_adaboost.best_params_
scores_prelim['gs_adaboost'] = evaluate_model(gs_adaboost.best_estimator_, X_train, y_train)

In [None]:
##Gradient Exp - test with opt dt and without.

gradient_exp_optdt = GradientBoostingClassifier(loss = 'exponential', learning_rate = .1, n_estimators = 200, max_depth = 2, min_samples_leaf = 2, min_samples_split = 4)
gradient_params = {'loss': ['deviance', 'exponential'], 'learning_rate': [.1, .3, .5, 1], 'n_estimators': [10, 20, 50, 100, 200, 300, 400, 500]}
gs_gradient_optdt= GridSearchCV(gradient_exp_optdt, gradient_params, cv = 5, verbose = True, scoring = 'recall', n_jobs = -1)
gs_gradient_optdt.fit(X_train, y_train)
#not great at all...
print "Best recall score:", gs_gradient_optdt.best_score_
print "Best params:", gs_gradient_optdt.best_params_

scores_prelim['gs_Gradient_optdt'] = evaluate_model(gs_gradient_optdt.best_estimator_, X_train, y_train)


In [None]:
#Gradient Exp - test without opt dt.

gradient_exp = GradientBoostingClassifier(loss = 'exponential', learning_rate = .1, n_estimators = 200)
gradient_params = {'loss': ['deviance', 'exponential'], 'learning_rate': [.1, .3, .5, 1], 'n_estimators': [10, 20, 50, 100, 200, 300, 400, 500]}
gs_gradient= GridSearchCV(gradient_exp, gradient_params, cv = 5, verbose = True, scoring = 'recall', n_jobs = -1)
gs_gradient.fit(X_train, y_train)

print "Best recall score:", gs_gradient.best_score_
print "Best params:", gs_gradient.best_params_

scores_prelim['gs_Gradient'] = evaluate_model(gs_gradient.best_estimator_, X_train, y_train)


In [None]:
#SVC
svc_params = {'svc__C': [.1, 1, 10, 100, 1000], 'svc__max_iter': [30000]}
gs_svc = GridSearchCV(pipe_svc, svc_params, cv = 5, verbose = True, n_jobs = -1)
gs_svc.fit(X_train, y_train)

print "Best recall score:", gs_svc.best_score_
print "Best params:", gs_svc.best_params_

scores_prelim['gs_svc'] = evaluate_model(gs_svc.best_estimator_, X_train, y_train)


In [None]:
pipe_

In [None]:
C = [.001, .01, .1, 1, 10, 100, 1000]
for c in C:
    pipe_logreg_l2 = make_pipeline(MinMaxScaler(), LogisticRegression(C = c, class_weight = 'balanced', random_state = 31))
    print "Model output for C of ", c
    evaluate_model(pipe_logreg_l2, X_train, y_train)
    print ""

In [None]:
#C of 10 for Logreg
pipe_logreg_l2 = make_pipeline(MinMaxScaler(), LogisticRegression(C = 10, class_weight = 'balanced', random_state = 31))
scores_prelim['gs_logreg'] = evaluate_model(pipe_logreg_l2, X_train, y_train)

In [None]:
#Looking for best models based on recall
pd.DataFrame(scores_prelim, index = ['recall', 'roc_auc']).T.sort_values(by = 'recall', ascending = False)



In [None]:
scores_test_data = {}

test_models = [gradient_dev, gs_dtopt.best_estimator_, gs_rf.best_estimator_, gs_et.best_estimator_, gs_svc.best_estimator_, gs_dtbag_unopt.best_estimator_, gs_adaboost.best_estimator_]
test_names = ['Gradient Dev as is', 'Decision Tree, Optimized', 'Random Forest sans DT Opt', 'Extra Trees sans DT Opt', 'SVC', 'DT Bagged sans Opt DT', 'Adaboost Opt.']

for model, name in zip(test_models, test_names):
    print ""
    print "Output for %s on Groundfloor Loan Data for 2015 Tranche" % name
    scores_test_data[name] = eval_test_data(model, X_train, y_train, X_test, y_test)
    print ""

**I expected there to be issues because the REcall and ROC seemed too good on the training data. Looks like I was correct.**

**3 Options at this juncture**
1) Blend in 2014 tranche and see what happens.
2) Blend in 2016 tranche also.
3) Brainstorm other economic data to obtain for these loans. (Longer term effort).

<h2> Machine Learning on 2014-15 Blended Tranches </h2>

In [198]:
X = groundfloor_v2[['int_rate', 'ltv',
       'term_mo', 'encoded_loan_position', 'loan_amount',
        'investors',
        'encoded_grade', 'purpose_Acquisition & Renovation',
        'purpose_New Construction', 'purpose_Refinance',
        'purpose_Renovation']][groundfloor_v2['repaid_binary'].notnull()][groundfloor_v2['encoded_grade']<4][groundfloor_v2.loan_inception_year<=2015]



In [199]:
target = groundfloor_v2['repaid_binary'][groundfloor_v2['repaid_binary'].notnull()][groundfloor_v2['encoded_grade']<4][groundfloor_v2.loan_inception_year<=2015]

In [200]:
X.head()

Unnamed: 0,int_rate,ltv,term_mo,encoded_loan_position,loan_amount,investors,encoded_grade,purpose_Acquisition & Renovation,purpose_New Construction,purpose_Refinance,purpose_Renovation
107,0.096,0.374,12.0,0,59500.0,44.0,1,1.0,0.0,0.0,0.0
108,0.13,0.597,6.0,0,185000.0,248.0,2,1.0,0.0,0.0,0.0
109,0.156,0.615,6.0,0,160000.0,264.0,3,1.0,0.0,0.0,0.0
111,0.096,0.446,12.0,0,50000.0,32.0,1,0.0,0.0,0.0,1.0
112,0.134,0.516,12.0,0,284000.0,175.0,2,0.0,1.0,0.0,0.0


In [201]:
target.head()

107    1.0
108    1.0
109    1.0
111    1.0
112    1.0
Name: repaid_binary, dtype: float64

In [202]:
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size = .2, stratify = target, random_state = 31)

In [203]:
scores_prelim = {}

#Preliminary models - will go with what has best recall or AUC for optimization of hyperparameters
pipe_logreg_l2 = make_pipeline(MinMaxScaler(), LogisticRegression(class_weight = 'balanced', random_state = 31))
pipe_svc = make_pipeline(MinMaxScaler(), SVC(probability = True, class_weight = 'balanced', random_state = 31))
dt = DecisionTreeClassifier(class_weight = 'balanced', random_state = 31)
dtbag = BaggingClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31), random_state = 31, n_estimators = 50)
rf = RandomForestClassifier(random_state = 31, class_weight = 'balanced', n_estimators = 20)
et = ExtraTreesClassifier(random_state = 31, class_weight = 'balanced')
adaboost = AdaBoostClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31), random_state = 31)
adaboost_point1 = AdaBoostClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31), random_state = 31, learning_rate = .1)
adaboost_lr_point1 = AdaBoostClassifier(LogisticRegression(class_weight = 'balanced', random_state = 31), random_state = 31, learning_rate = .1)
adaboost_lr = AdaBoostClassifier(LogisticRegression(class_weight = 'balanced', random_state = 31), random_state = 31, learning_rate = 1)
gradient_dev = GradientBoostingClassifier(loss = 'deviance', learning_rate = .1, verbose = False)
gradient_exp = GradientBoostingClassifier(loss = 'exponential', learning_rate = .1, verbose = False)
gradient_exp_1 = GradientBoostingClassifier(loss = 'exponential', learning_rate = 1, verbose = False)



In [204]:

models = [pipe_logreg_l2, pipe_svc, dt, dtbag, rf, et, adaboost, adaboost_point1, adaboost_lr, adaboost_lr_point1, gradient_dev, gradient_exp, gradient_exp_1]
model_names = ['Logistic Regression', 'SVC', 'DT', 'Bagged DT', 'Random Forest', 'Extra Trees', 'AdaBoost', 'AdaBoost 0.1', 'AdaBoost LR', 'AdaBoost LR 0.1','Gradient Deviance', 'Gradient Exp', 'Gradient_Exp_1']

#Fix or just do a damn simple model....
for model, model_name in zip(models, model_names):
    print ""
    print model_name + " Output for Groundfloor Loans from 2014-2015 Tranche"
    print ""
    scores_prelim[model_name] = evaluate_model(model, X_train, y_train)



Logistic Regression Output for Groundfloor Loans from 2014-2015 Tranche

Mean recall, cross-val, is: 0.35
Mean roc_auc, cross-val, is: 0.525

SVC Output for Groundfloor Loans from 2014-2015 Tranche

Mean recall, cross-val, is: 0.4
Mean roc_auc, cross-val, is: 0.4875

DT Output for Groundfloor Loans from 2014-2015 Tranche

Mean recall, cross-val, is: 0.6
Mean roc_auc, cross-val, is: 0.541666666667

Bagged DT Output for Groundfloor Loans from 2014-2015 Tranche

Mean recall, cross-val, is: 0.6
Mean roc_auc, cross-val, is: 0.504166666667

Random Forest Output for Groundfloor Loans from 2014-2015 Tranche

Mean recall, cross-val, is: 0.6
Mean roc_auc, cross-val, is: 0.604166666667

Extra Trees Output for Groundfloor Loans from 2014-2015 Tranche

Mean recall, cross-val, is: 0.5
Mean roc_auc, cross-val, is: 0.602083333333

AdaBoost Output for Groundfloor Loans from 2014-2015 Tranche

Mean recall, cross-val, is: 0.55
Mean roc_auc, cross-val, is: 0.541666666667

AdaBoost 0.1 Output for Groundfl

**Worth looking at SVC, DT, RF, ET, Adaboost and Gradient.**

In [205]:
#SVC
gs_svc = GridSearchCV(pipe_svc, svc_params, cv = 5, scoring = 'recall', verbose = True, n_jobs = -1)
gs_svc.fit(X_train, y_train)

print "Best Recall Score:", gs_svc.best_score_
print "Params:", gs_svc.best_params_

scores_prelim['gs_svc'] = evaluate_model(gs_svc.best_estimator_, X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  18 out of  25 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    0.4s finished


Best Recall Score: 0.655405405405
Params: {'svc__C': 1000, 'svc__max_iter': 30000}
Mean recall, cross-val, is: 0.65
Mean roc_auc, cross-val, is: 0.516666666667


In [206]:
#DT
gs_dtopt = GridSearchCV(dt, dt_params, scoring = 'recall', n_jobs = -1, cv=5, verbose = True)
gs_dtopt.fit(X_train, y_train)
print "Best recall score:", gs_dtopt.best_score_

print "Best params:", gs_dtopt.best_params_
scores_prelim['gs_dtopt'] = evaluate_model(gs_dtopt.best_estimator_, X_train, y_train)
#Interesting - int rate, ltv, term_mo, loan_amount, and purpose_New Construction
pd.DataFrame(X_train.columns.values, gs_dtopt.best_estimator_.feature_importances_)


Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Done 368 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 2168 tasks      | elapsed:   10.0s


Best recall score: 0.655405405405
Best params: {'min_samples_split': 2, 'splitter': 'best', 'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 2}
Mean recall, cross-val, is: 0.65
Mean roc_auc, cross-val, is: 0.558333333333


[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:   13.2s finished


Unnamed: 0,0
0.111438,int_rate
0.417912,ltv
0.157078,term_mo
0.0,encoded_loan_position
0.178273,loan_amount
0.0,investors
0.0,encoded_grade
0.0,purpose_Acquisition & Renovation
0.1353,purpose_New Construction
0.0,purpose_Refinance


In [207]:
#Random Forest
rf_params = {'criterion': ['gini', 'entropy'], 'max_features': ['auto', 'sqrt', 'log2', None], 'bootstrap': [True, False], 'n_estimators': [10, 20, 30, 50, 100, 200]}
gs_rf = GridSearchCV(rf, rf_params, cv = 5, scoring = 'recall', verbose = True, n_jobs = -1)
gs_rf.fit(X_train, y_train)

print "Best recall score:", gs_rf.best_score_
print "Best params:", gs_rf.best_params_

scores_prelim['gs_rf'] = evaluate_model(gs_rf.best_estimator_, X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 207 tasks      | elapsed:   42.6s
[Parallel(n_jobs=-1)]: Done 457 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:  1.7min finished


Best recall score: 0.702702702703
Best params: {'max_features': None, 'n_estimators': 10, 'bootstrap': True, 'criterion': 'entropy'}
Mean recall, cross-val, is: 0.7
Mean roc_auc, cross-val, is: 0.572916666667


In [208]:
#Feature Importance for RF
pd.DataFrame(gs_rf.best_estimator_.feature_importances_, index = X_train.columns.values)

Unnamed: 0,0
int_rate,0.130087
ltv,0.415222
term_mo,0.067108
encoded_loan_position,0.0
loan_amount,0.121512
investors,0.170076
encoded_grade,0.044714
purpose_Acquisition & Renovation,0.007855
purpose_New Construction,0.021817
purpose_Refinance,0.0


In [209]:
#Random Forest with opt dt
rf_optdt = RandomForestClassifier(class_weight = 'balanced', random_state = 31, max_depth = 6, min_samples_split = 2, min_samples_leaf = 2)
gs_rf_opt = GridSearchCV(rf_optdt, rf_params, cv = 5, verbose = True, scoring = 'recall', n_jobs = -1)
gs_rf_opt.fit(X_train, y_train)

print "Best recall score:", gs_rf_opt.best_score_
print "Best params:", gs_rf_opt.best_params_

scores_prelim['gs_rf_opt'] = evaluate_model(gs_rf_opt.best_estimator_, X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 207 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 457 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:  1.9min finished


Best recall score: 0.75
Best params: {'max_features': None, 'n_estimators': 30, 'bootstrap': True, 'criterion': 'gini'}
Mean recall, cross-val, is: 0.75
Mean roc_auc, cross-val, is: 0.554166666667


In [210]:
#Extra Trees
gs_et = GridSearchCV(et, rf_params, cv =5, verbose = True, n_jobs = -1, scoring = 'recall')
gs_et.fit(X_train, y_train)



Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 206 tasks      | elapsed:   53.5s
[Parallel(n_jobs=-1)]: Done 456 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:  2.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight='balanced',
           criterion='gini', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=31, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_features': ['auto', 'sqrt', 'log2', None], 'n_estimators': [10, 20, 30, 50, 100, 200], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='recall', verbose=True)

In [211]:
print "Best recall score:", gs_et.best_score_
print "Best params:", gs_et.best_params_

scores_prelim['gs_et'] = evaluate_model(gs_et.best_estimator_, X_train, y_train)


Best recall score: 0.702702702703
Best params: {'max_features': None, 'n_estimators': 200, 'bootstrap': False, 'criterion': 'entropy'}
Mean recall, cross-val, is: 0.7
Mean roc_auc, cross-val, is: 0.591666666667


In [212]:
#Extra Trees with Opt Dt
et_opt = ExtraTreesClassifier(class_weight = 'balanced', random_state = 31, max_depth = 6, min_samples_split = 2, min_samples_leaf = 2)
gs_et_opt = GridSearchCV(et_opt, rf_params, cv = 5, verbose = True, n_jobs = -1, scoring = 'recall')

gs_et_opt.fit(X_train, y_train)

print "Best recall score:", gs_et_opt.best_score_
print "Best params:", gs_et_opt.best_params_

scores_prelim['gs_et_opt'] = evaluate_model(gs_et_opt.best_estimator_, X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done 458 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:  1.7min finished


Best recall score: 0.608108108108
Best params: {'max_features': 'auto', 'n_estimators': 20, 'bootstrap': False, 'criterion': 'gini'}
Mean recall, cross-val, is: 0.6
Mean roc_auc, cross-val, is: 0.541666666667


In [213]:

#Adaboost with Opt DT params
adaboost = AdaBoostClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31, max_depth = 6, min_samples_split = 2, min_samples_leaf = 2), random_state = 31)
adaboost_params = {'n_estimators': [10,20,30,50,100], 'learning_rate': [.1,.3,.5,1,2,5]}
gs_adaboost = GridSearchCV(adaboost, adaboost_params, cv = 5, verbose = True, n_jobs = -1)
gs_adaboost.fit(X_train, y_train)

print "Best recall score:", gs_adaboost.best_score_
print "Best params:", gs_adaboost.best_params_
scores_prelim['gs_adaboost'] = evaluate_model(gs_adaboost.best_estimator_, X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   11.2s finished


Best recall score: 0.621621621622
Best params: {'n_estimators': 30, 'learning_rate': 1}
Mean recall, cross-val, is: 0.7
Mean roc_auc, cross-val, is: 0.529166666667


In [214]:
#Adaboost on own
adaboost_own = AdaBoostClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31), random_state = 31)
gs_adaboost_own = GridSearchCV(adaboost_own, adaboost_params, cv = 5, scoring = 'recall', verbose = True, n_jobs = -1)
gs_adaboost_own.fit(X_train, y_train)


print "Best recall score:", gs_adaboost_own.best_score_
print "Best params:", gs_adaboost_own.best_params_
scores_prelim['gs_adaboost_own'] = evaluate_model(gs_adaboost_own.best_estimator_, X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    1.2s finished


Best recall score: 0.560810810811
Best params: {'n_estimators': 10, 'learning_rate': 0.1}
Mean recall, cross-val, is: 0.55
Mean roc_auc, cross-val, is: 0.541666666667


In [215]:
##Gradient Exp - test with opt dt 

gradient_exp_optdt = GradientBoostingClassifier(loss = 'exponential', learning_rate = .1, n_estimators = 200, max_depth = 6, min_samples_leaf = 2, min_samples_split = 2)
gradient_params = {'loss': ['deviance', 'exponential'], 'learning_rate': [.1, .3, .5, 1, 2, 5, 10], 'n_estimators': [10, 20, 50, 100, 200, 300, 400, 500]}
gs_gradient_optdt= GridSearchCV(gradient_exp_optdt, gradient_params, cv = 5, verbose = True, scoring = 'recall', n_jobs = -1)
gs_gradient_optdt.fit(X_train, y_train)
#not great at all...
print "Best recall score:", gs_gradient_optdt.best_score_
print "Best params:", gs_gradient_optdt.best_params_

scores_prelim['gs_Gradient_optdt'] = evaluate_model(gs_gradient_optdt.best_estimator_, X_train, y_train)


Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   21.1s finished


Best recall score: 0.756756756757
Best params: {'n_estimators': 50, 'loss': 'exponential', 'learning_rate': 2}
Mean recall, cross-val, is: 0.65
Mean roc_auc, cross-val, is: 0.570833333333


In [216]:
#Gradient Exp - test without opt dt.

gradient_exp = GradientBoostingClassifier(loss = 'exponential', learning_rate = .1, n_estimators = 200)
gradient_params = {'loss': ['deviance', 'exponential'], 'learning_rate': [.1, .3, .5, 1, 2, 5, 10], 'n_estimators': [10, 20, 50, 100, 200, 300, 400, 500]}
gs_gradient= GridSearchCV(gradient_exp, gradient_params, cv = 5, verbose = True, scoring = 'recall', n_jobs = -1)
gs_gradient.fit(X_train, y_train)

print "Best recall score:", gs_gradient.best_score_
print "Best params:", gs_gradient.best_params_

scores_prelim['gs_Gradient'] = evaluate_model(gs_gradient.best_estimator_, X_train, y_train)


Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   19.8s finished


Best recall score: 0.756756756757
Best params: {'n_estimators': 100, 'loss': 'deviance', 'learning_rate': 5}
Mean recall, cross-val, is: 0.7
Mean roc_auc, cross-val, is: 0.49375


In [217]:
#Best models are... gs_rf_opt, gradient_exp_1, gs_et, gs_adaboost, gs_rf, gs_svc, gs_dtopt
pd.DataFrame(scores_prelim, columns = scores_prelim.keys(), index = ['recall', 'roc_auc']).T.sort_values('recall', ascending = False)

Unnamed: 0,recall,roc_auc
gs_rf_opt,0.75,0.554167
Gradient_Exp_1,0.7,0.6
gs_et,0.7,0.591667
gs_adaboost,0.7,0.529167
gs_Gradient,0.7,0.49375
gs_rf,0.7,0.572917
gs_svc,0.65,0.516667
gs_dtopt,0.65,0.558333
gs_Gradient_optdt,0.65,0.570833
Gradient Exp,0.6,0.579167


In [218]:
test_scores = {}

#Based above above, I would probably use RF. What about test scores?
models = [gs_rf_opt.best_estimator_, gradient_exp_1, gs_et.best_estimator_, gs_adaboost.best_estimator_, gs_rf.best_estimator_, gs_svc.best_estimator_, gs_dtopt.best_estimator_]
model_names = ['RF - Opt DT Base', 'Gradient Boost - Exp, LR 1', 'ET sans Opt DT Base', 'Adaboost with Opt DT Base', 'Random Forest, Own', 'SVC', 'DT Optimized']

for model, model_name in zip(models, model_names):
    print "Output on Test Data for Groundfloor 2014-15 Tranches for Model %s" % model_name
    test_scores[model] = eval_test_data(model, X_train, y_train, X_test, y_test)
    print ""

Output on Test Data for Groundfloor 2014-15 Tranches for Model RF - Opt DT Base
Confusion Matrix
[[4 1]
 [2 3]]
             precision    recall  f1-score   support

        0.0       0.67      0.80      0.73         5
        1.0       0.75      0.60      0.67         5

avg / total       0.71      0.70      0.70        10

Predicted Default Rate: 0.4
Actual Default Rate for the Baseline: 0.5
Observed Default Rate for only lending to those you think are good credits: 0.333333333333
ROC_AUC from test data is: 0.56
Recall from test data is: 0.6

Output on Test Data for Groundfloor 2014-15 Tranches for Model Gradient Boost - Exp, LR 1
Confusion Matrix
[[4 1]
 [2 3]]
             precision    recall  f1-score   support

        0.0       0.67      0.80      0.73         5
        1.0       0.75      0.60      0.67         5

avg / total       0.71      0.70      0.70        10

Predicted Default Rate: 0.4
Actual Default Rate for the Baseline: 0.5
Observed Default Rate for only lending to 

**RF and Gradient Boost are best from Test. **

In [222]:
loans_test = pd.DataFrame(gs_rf_opt.best_estimator_.predict(X_test), index = X_test.index, columns = ['default'])

In [245]:
loaned_test_int_rates = X_test.int_rate[loans_test['default']==0]
late_loans_int_rate = X_test.int_rate[y_test==1][loans_test['default']==0]
print groundfloor_v2['repaid_status'].ix[[154,164]]

154    1 - Late
164    1 - Late
Name: repaid_status, dtype: object


In [241]:
print y_test[y_test==1][loans_test['default']==0]
groundfloor_v2[['repaid_status', 'int_rate', 'term_mo', 'grade', 'loan_inception_year', 'maturity_date']].ix[[154, 164]]

164    1.0
154    1.0
Name: repaid_binary, dtype: float64


Unnamed: 0,repaid_status,int_rate,term_mo,grade,loan_inception_year,maturity_date
154,1 - Late,0.096,9.0,B,2015.0,2016-07-05
164,1 - Late,0.132,6.0,C,2015.0,2016-01-27


**SVC, RF and ET have reasonable outcomes...**


In [246]:
X_2016 = groundfloor_v2[['int_rate', 'ltv', 'term_mo', 'encoded_loan_position',
       'loan_amount', 'investors', 'encoded_grade',
       'purpose_Acquisition & Renovation', 'purpose_New Construction',
       'purpose_Refinance', 'purpose_Renovation']][groundfloor_v2['repaid_binary'].notnull()][groundfloor_v2.encoded_grade <= 3][groundfloor_v2.loan_inception_year == 2016]



In [247]:
X_2016.head()

Unnamed: 0,int_rate,ltv,term_mo,encoded_loan_position,loan_amount,investors,encoded_grade,purpose_Acquisition & Renovation,purpose_New Construction,purpose_Refinance,purpose_Renovation
0,0.11,0.667,12.0,0,90000.0,197.0,2,1.0,0.0,0.0,0.0
38,0.134,0.589,9.0,0,245000.0,377.0,2,0.0,0.0,0.0,1.0
55,0.096,0.609,12.0,0,91380.0,205.0,1,1.0,0.0,0.0,0.0
73,0.13,0.608,6.0,0,425000.0,302.0,2,1.0,0.0,0.0,0.0
78,0.13,0.383,6.0,0,72750.0,90.0,2,0.0,0.0,0.0,1.0


In [248]:
#Curious about estimators vs 2016.
data_2016 = {}

for model, model_name in zip(models, model_names):
    print "Output on Test Data for Groundfloor 2014-15 Tranches for Model %s" % model_name
    data_2016[model] = eval_test_data(model, X, target, X_2016, y_2016)
    print ""

Output on Test Data for Groundfloor 2014-15 Tranches for Model RF - Opt DT Base
Confusion Matrix
[[17  8]
 [ 5  6]]
             precision    recall  f1-score   support

        0.0       0.77      0.68      0.72        25
        1.0       0.43      0.55      0.48        11

avg / total       0.67      0.64      0.65        36

Predicted Default Rate: 0.388888888889
Actual Default Rate for the Baseline: 0.305555555556
Observed Default Rate for only lending to those you think are good credits: 0.227272727273
ROC_AUC from test data is: 0.612727272727
Recall from test data is: 0.545454545455

Output on Test Data for Groundfloor 2014-15 Tranches for Model Gradient Boost - Exp, LR 1
Confusion Matrix
[[12 13]
 [ 6  5]]
             precision    recall  f1-score   support

        0.0       0.67      0.48      0.56        25
        1.0       0.28      0.45      0.34        11

avg / total       0.55      0.47      0.49        36

Predicted Default Rate: 0.5
Actual Default Rate for the Basel

**Conclusions from the above:**

1) Need more explanatory variables to better separate 2014 and 2015;
2) More data is good - helped improve performance from 2015 alone.
3) Try over-sampling...

In [169]:
loans = pd.DataFrame(gs_rf_opt.best_estimator_.predict(X_2016), X_2016.index, columns = ['default'])


In [170]:
#Still a risky portfolio...
X_2016.ix[loans[loans.default==0].index].encoded_grade.value_counts()


3    10
2    10
1     2
0     1
Name: encoded_grade, dtype: int64

In [171]:
((X_2016.ix[loans[loans.default==0].index].int_rate+1)*1000).sum()/23

1137.1739130434783

In [185]:
prin_int_good_loans = ((X_2016.ix[loans[loans.default==0].index].int_rate[y_2016.ix[loans[loans.default==0].index]==0]+1)*1000).sum()


In [186]:
prin_int_bad_loans = ((X_2016.ix[loans[loans.default==0].index].int_rate[y_2016.ix[loans[loans.default==0].index]==1]+1)*1000).sum()

In [188]:
prin_int_good_loans - (23*1000)

-3715.0

In [192]:
print "B/c the mean interest rate on these loans is %.2f, defaults higher than this put the portfolio at risk of principal loss." % X_2016.ix[loans[loans.default==0].index].int_rate.mean()
print ""
print "Even though algorithm does better at finding defaults, needs to be improved before it can be deployed. Observed late rate is 26%."
print "People on Groundfloor are complaining about late loans. Actual late rate for these is 30%."
print "So want to avoid the risk of late loans."
print "Overall, if I can not add more explanatory variables or engineer more, then I have no confidence in lending on the platform."

B/c the mean interest rate on these loans is 0.14, defaults higher than this put the portfolio at risk of principal loss.

Even though algorithm does better at finding defaults, needs to be improved before it can be deployed. Observed late rate is 26%.
People on Groundfloor are complaining about late loans. Actual late rate for these is 30%.
So want to avoid the risk of late loans.
Overall, if I can not add more explanatory variables or engineer more, then I have no confidence in lending on the platform.


**Next Steps for Loans sans ARV Details**

Blending in 2016 data may not be useful. In fact, keeping 2016 separate as another validation set is valuable.

1) Attempting over-sampling the late loans. If a loan defaults, principal may be recovered but not interest. Or principal may be impaired like one loan. Assume that a default costs 3x a classification of a good loan?

2) So I need to find economic data that is worthwhile.

3) Barring that, I may want to look at the other platforms, even though only for accredited investors.

<h2> Machine Learning on Loans with ARV Details </h2>

We will look at ARV loans and attempt binary classification of on time vs late/default.

In [253]:
groundfloor_v2 = groundfloor_v2.assign(encoded_val_report_source = groundfloor_v2.val_report_source.map({'[u"Broker\'s Price Opinion"]': .75,
       "[u'Certified Independent Appraisal']": 1,
       "[u'Borrower Provided Comps']": .25, "[u'Borrower Provided Appraisal']": .5}))

In [278]:
X = groundfloor_v2[['int_rate', 'ltv', 'term_mo', 'encoded_loan_position',
       'loan_amount', 'investors', 'encoded_grade',
       'purpose_Acquisition & Renovation', 'purpose_New Construction',
       'purpose_Refinance', 'purpose_Renovation', 'loan_tpc_decimal', 'loan_arv_strength',
       'skin_in_game_strength', 'location_strength',
       'borrower_exp_strength', 'borrower_commitment_strength',
     'borrower_margin', 'ipp_to_tpc_decimal','encoded_val_report_source']][groundfloor_v2['repaid_binary'].notnull()][groundfloor_v2['ipp_to_tpc_decimal'].notnull()][groundfloor_v2['encoded_grade']<4][groundfloor_v2.loan_inception_year<=2015]



In [279]:
target = groundfloor_v2['repaid_binary'][groundfloor_v2['repaid_binary'].notnull()][groundfloor_v2['ipp_to_tpc_decimal'].notnull()][groundfloor_v2['encoded_grade']<4][groundfloor_v2.loan_inception_year<=2015]

In [280]:
#Sadly little correlation with margin and repaid status... But still, will nix any loan with too low a margin for borrower.
groundfloor_v2[['borrower_margin', 'ipp_to_tpc_decimal', 'loan_tpc_decimal', 'ltv' ,'repaid_binary', 'skin_in_game_strength']].corr()

Unnamed: 0,borrower_margin,ipp_to_tpc_decimal,loan_tpc_decimal,ltv,repaid_binary,skin_in_game_strength
borrower_margin,1.0,0.322412,0.624648,-0.325583,0.021732,-0.164601
ipp_to_tpc_decimal,0.322412,1.0,0.390317,0.090625,-0.061184,-0.063293
loan_tpc_decimal,0.624648,0.390317,1.0,0.012043,0.031424,-0.294921
ltv,-0.325583,0.090625,0.012043,1.0,-0.199106,-0.655678
repaid_binary,0.021732,-0.061184,0.031424,-0.199106,1.0,0.142913
skin_in_game_strength,-0.164601,-0.063293,-0.294921,-0.655678,0.142913,1.0


In [281]:
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size = .2, stratify = target, random_state = 31)

In [277]:
X[X.ipp_to_tpc_decimal.isnull()][X.loan_tpc_decimal.notnull()]
groundfloor_v2.set_value(109, 'ipp_to_tpc_decimal', 0)

  if __name__ == '__main__':


Unnamed: 0,address,grade,int_rate,link_loan_details,ltv,term_mo,full_address,zipcode,borrower_company,borrower_principal,...,purpose_Refinance,purpose_Renovation,borrower_margin,ipp_to_tpc_decimal,loan_inception_year,loan_inception_month,repaid_status,repaid_multiclass,repaid_binary,encoded_val_report_source
0,17155 Wisconsin Street,C,0.110,/investments/17155-wisconsin-street,0.667,12.0,"17155 Wisconsin Street, Detroit, MI 48221",48221,MPR 2000 Corp,Michael Rivait - principal,...,0.0,0.0,0.185185,0.363636,2016.0,12.0,0 - Repaid_On_Time,0.0,0.0,0.75
1,5914 South Ada Street,D,0.160,/investments/5914-south-ada-street,0.524,12.0,"5914 South Ada Street, Chicago, IL 60636",60636,MBJ Real Estate Consultants Inc.,Michael Christian - principal,...,0.0,1.0,0.409091,0.076923,2017.0,1.0,Current,,,0.75
2,7540 Liberty Avenue,C,0.110,/investments/7540-liberty-avenue,0.402,12.0,"7540 Liberty Avenue, University City, MO 63130",63130,"Goose Creek Homes, LLC",William Delo - principal,...,0.0,1.0,0.515823,0.392157,2017.0,1.0,Current,,,0.75
3,927 Grove Avenue,D,0.140,/investments/927-grove-avenue,0.674,9.0,"927 Grove Avenue, Cocoa, FL 32922",32922,TDH & Family Properties LLC,Timothy Harris - principal,...,0.0,0.0,0.200000,0.683824,2017.0,1.0,Current,,,0.75
4,4315 Treeline Way,C,0.105,/investments/4315-treeline-way-30135,0.606,9.0,"4315 Treeline Way, Douglasville, GA 30135",30135,Investor's Network LLC,Brandon Thompson - principal,...,0.0,0.0,0.393750,0.721649,2017.0,1.0,Current,,,0.75
5,2149 Newnan Street,B,0.096,/investments/2149-newnan-street-30344,0.631,12.0,"2149 Newnan Street, East Point, GA 30344",30344,Sunshine Construction Group,Joseph Michael - principal,...,0.0,0.0,0.327401,0.238663,2017.0,2.0,Current,,,0.75
6,2825 Osage Street,C,0.110,/investments/2825-osage-street,0.550,12.0,"2825 Osage Street, St. Louis, MO 63118",63118,Dutchtown Capital LLC,Kurt Mc Dowell - principal,...,0.0,0.0,0.239437,0.629630,2017.0,1.0,Current,,,0.75
7,7505 East Columbia Drive,A,0.060,/investments/7505-east-columbia-drive,0.397,9.0,"7505 East Columbia Drive, Spokane, WA 99212",99212,Atlantic Property LLC,Matt Chapman - principal,...,0.0,0.0,0.162851,0.839530,2016.0,12.0,Current,,,0.75
8,224 York Street,C,0.110,/investments/2249-onslow-drive-28540,0.440,12.0,"224 - 226 York Street, Jacksonville, NC 28540",28540,Stewardship Home Solutions LLC,George Goddard - principal,...,0.0,0.0,0.363636,0.428571,2016.0,12.0,Current,,,0.75
9,101 Meadow Trail,B,0.085,/investments/101-meadow-trail,0.594,12.0,"101 Meadow Trail, Jacksonville, NC 28546",28546,Blue Skye Properties LLC,Eileen O'neill - principal,...,0.0,0.0,0.317545,0.784601,2016.0,12.0,Current,,,0.75


In [282]:
scores_prelim = {}

#Preliminary models - will go with what has best recall or AUC for optimization of hyperparameters
pipe_logreg_l2 = make_pipeline(MinMaxScaler(), LogisticRegression(class_weight = 'balanced', random_state = 31))
pipe_svc = make_pipeline(MinMaxScaler(), SVC(probability = True, class_weight = 'balanced', random_state = 31))
dt = DecisionTreeClassifier(class_weight = 'balanced', random_state = 31)
dtbag = BaggingClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31), random_state = 31, n_estimators = 50)
rf = RandomForestClassifier(random_state = 31, class_weight = 'balanced', n_estimators = 20)
et = ExtraTreesClassifier(random_state = 31, class_weight = 'balanced')
adaboost = AdaBoostClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31), random_state = 31)
adaboost_point1 = AdaBoostClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31), random_state = 31, learning_rate = .1)
adaboost_lr_point1 = AdaBoostClassifier(LogisticRegression(class_weight = 'balanced', random_state = 31), random_state = 31, learning_rate = .1)
adaboost_lr = AdaBoostClassifier(LogisticRegression(class_weight = 'balanced', random_state = 31), random_state = 31, learning_rate = 1)
gradient_dev = GradientBoostingClassifier(loss = 'deviance', learning_rate = .1, verbose = False)
gradient_exp = GradientBoostingClassifier(loss = 'exponential', learning_rate = .1, verbose = False)
gradient_exp_1 = GradientBoostingClassifier(loss = 'exponential', learning_rate = 1, verbose = False)



In [283]:

models = [pipe_logreg_l2, pipe_svc, dt, dtbag, rf, et, adaboost, adaboost_point1, adaboost_lr, adaboost_lr_point1, gradient_dev, gradient_exp, gradient_exp_1]
model_names = ['Logistic Regression', 'SVC', 'DT', 'Bagged DT', 'Random Forest', 'Extra Trees', 'AdaBoost', 'AdaBoost 0.1', 'AdaBoost LR', 'AdaBoost LR 0.1','Gradient Deviance', 'Gradient Exp', 'Gradient_Exp_1']

#Fix or just do a damn simple model....
for model, model_name in zip(models, model_names):
    print ""
    print model_name + " Output for Groundfloor Loans from 2014-2015 Tranche"
    print ""
    scores_prelim[model_name] = evaluate_model(model, X_train, y_train)



Logistic Regression Output for Groundfloor Loans from 2014-2015 Tranche

Mean recall, cross-val, is: 0.533333333333
Mean roc_auc, cross-val, is: 0.5

SVC Output for Groundfloor Loans from 2014-2015 Tranche

Mean recall, cross-val, is: 0.7
Mean roc_auc, cross-val, is: 0.65

DT Output for Groundfloor Loans from 2014-2015 Tranche

Mean recall, cross-val, is: 0.8
Mean roc_auc, cross-val, is: 0.8

Bagged DT Output for Groundfloor Loans from 2014-2015 Tranche

Mean recall, cross-val, is: 0.666666666667
Mean roc_auc, cross-val, is: 0.775

Random Forest Output for Groundfloor Loans from 2014-2015 Tranche

Mean recall, cross-val, is: 0.9
Mean roc_auc, cross-val, is: 0.8

Extra Trees Output for Groundfloor Loans from 2014-2015 Tranche

Mean recall, cross-val, is: 0.6
Mean roc_auc, cross-val, is: 0.658333333333

AdaBoost Output for Groundfloor Loans from 2014-2015 Tranche

Mean recall, cross-val, is: 0.8
Mean roc_auc, cross-val, is: 0.85

AdaBoost 0.1 Output for Groundfloor Loans from 2014-2015 

**Gradient, Adaboost LR, Adaboost, RF, DT, SVC **

In [285]:
#SVC
gs_svc = GridSearchCV(pipe_svc, svc_params, cv = 5, scoring = 'recall', verbose = True, n_jobs = -1)
gs_svc.fit(X_train, y_train)

print "Best Recall Score:", gs_svc.best_score_
print "Params:", gs_svc.best_params_

scores_prelim['gs_svc'] = evaluate_model(gs_svc.best_estimator_, X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    0.3s finished


Best Recall Score: 1.0
Params: {'svc__C': 0.1, 'svc__max_iter': 30000}
Mean recall, cross-val, is: 1.0
Mean roc_auc, cross-val, is: 0.65


In [286]:
#DT
gs_dtopt = GridSearchCV(dt, dt_params, scoring = 'recall', n_jobs = -1, cv=5, verbose = True)
gs_dtopt.fit(X_train, y_train)
print "Best recall score:", gs_dtopt.best_score_

print "Best params:", gs_dtopt.best_params_
scores_prelim['gs_dtopt'] = evaluate_model(gs_dtopt.best_estimator_, X_train, y_train)
#Interesting - int rate, ltv, term_mo, loan_amount, and purpose_New Construction
pd.DataFrame(X_train.columns.values, gs_dtopt.best_estimator_.feature_importances_)


Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Done 240 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 1440 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:   16.5s finished


Best recall score: 0.833333333333
Best params: {'min_samples_split': 2, 'splitter': 'best', 'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 2}
Mean recall, cross-val, is: 0.8
Mean roc_auc, cross-val, is: 0.8


Unnamed: 0,0
0.0,int_rate
0.0,ltv
0.409091,term_mo
0.0,encoded_loan_position
0.0,loan_amount
0.590909,investors
0.0,encoded_grade
0.0,purpose_Acquisition & Renovation
0.0,purpose_New Construction
0.0,purpose_Refinance


In [288]:
#Random Forest
rf_params = {'criterion': ['gini', 'entropy'], 'max_features': ['auto', 'sqrt', 'log2', None], 'bootstrap': [True, False], 'n_estimators': [10, 20, 30, 50, 100, 200]}
gs_rf = GridSearchCV(rf, rf_params, cv = 5, scoring = 'recall', verbose = True, n_jobs = -1)
gs_rf.fit(X_train, y_train)

print "Best recall score:", gs_rf.best_score_
print "Best params:", gs_rf.best_params_

scores_prelim['gs_rf'] = evaluate_model(gs_rf.best_estimator_, X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 207 tasks      | elapsed:   50.6s
[Parallel(n_jobs=-1)]: Done 457 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:  2.0min finished


Best recall score: 1.0
Best params: {'max_features': 'auto', 'n_estimators': 30, 'bootstrap': True, 'criterion': 'gini'}
Mean recall, cross-val, is: 1.0
Mean roc_auc, cross-val, is: 0.7


In [289]:
pd.DataFrame(gs_rf.best_estimator_.feature_importances_, index = X_train.columns.values)

Unnamed: 0,0
int_rate,0.11047
ltv,0.040839
term_mo,0.080483
encoded_loan_position,0.0
loan_amount,0.098882
investors,0.218089
encoded_grade,0.028763
purpose_Acquisition & Renovation,0.018657
purpose_New Construction,0.0
purpose_Refinance,0.0


In [290]:
#Random Forest with opt dt
rf_optdt = RandomForestClassifier(class_weight = 'balanced', random_state = 31, max_depth = 2, min_samples_split = 2, min_samples_leaf = 2)
gs_rf_opt = GridSearchCV(rf_optdt, rf_params, cv = 5, verbose = True, scoring = 'recall', n_jobs = -1)
gs_rf_opt.fit(X_train, y_train)

print "Best recall score:", gs_rf_opt.best_score_
print "Best params:", gs_rf_opt.best_params_

scores_prelim['gs_rf_opt'] = evaluate_model(gs_rf_opt.best_estimator_, X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   49.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:  2.1min finished


Best recall score: 0.904761904762
Best params: {'max_features': 'auto', 'n_estimators': 10, 'bootstrap': True, 'criterion': 'gini'}
Mean recall, cross-val, is: 0.9
Mean roc_auc, cross-val, is: 0.816666666667


In [295]:

#Adaboost with Opt DT params
adaboost = AdaBoostClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31, max_depth = 2, min_samples_split = 2, min_samples_leaf = 2), random_state = 31)
adaboost_params = {'n_estimators': [10,20,30,50,100], 'learning_rate': [.1,.3,.5,1,2,5]}
gs_adaboost = GridSearchCV(adaboost, adaboost_params, cv = 5, verbose = True, n_jobs = -1)
gs_adaboost.fit(X_train, y_train)

print "Best recall score:", gs_adaboost.best_score_
print "Best params:", gs_adaboost.best_params_
scores_prelim['gs_adaboost'] = evaluate_model(gs_adaboost.best_estimator_, X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   10.8s finished


Best recall score: 0.904761904762
Best params: {'n_estimators': 10, 'learning_rate': 0.5}
Mean recall, cross-val, is: 0.9
Mean roc_auc, cross-val, is: 0.9


In [296]:
#Adaboost on own
adaboost_own = AdaBoostClassifier(DecisionTreeClassifier(class_weight = 'balanced', random_state = 31), random_state = 31)
gs_adaboost_own = GridSearchCV(adaboost_own, adaboost_params, cv = 5, scoring = 'recall', verbose = True, n_jobs = -1)
gs_adaboost_own.fit(X_train, y_train)


print "Best recall score:", gs_adaboost_own.best_score_
print "Best params:", gs_adaboost_own.best_params_
scores_prelim['gs_adaboost_own'] = evaluate_model(gs_adaboost_own.best_estimator_, X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best recall score: 0.833333333333
Best params: {'n_estimators': 10, 'learning_rate': 0.1}
Mean recall, cross-val, is: 0.8
Mean roc_auc, cross-val, is: 0.85


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    1.4s finished


In [294]:
#Adaboost LR
adaboost_lr = make_pipeline(MinMaxScaler(), AdaBoostClassifier(LogisticRegression(class_weight = 'balanced', random_state = 31), random_state = 31))
adaboost_lr_params = {'adaboostclassifier__n_estimators': [10,20,30,50,100], 'adaboostclassifier__learning_rate': [.1,.3,.5,1,2,5]}
gs_adaboost_lr = GridSearchCV(adaboost_lr, adaboost_lr_params, cv = 5, scoring = 'recall', verbose = True, n_jobs = -1)
gs_adaboost_lr.fit(X_train, y_train)


print "Best recall score:", gs_adaboost_lr.best_score_
print "Best params:", gs_adaboost_lr.best_params_
scores_prelim['gs_adaboost_lr'] = evaluate_model(gs_adaboost_lr.best_estimator_, X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  62 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   13.6s finished


Best recall score: 0.666666666667
Best params: {'adaboostclassifier__n_estimators': 10, 'adaboostclassifier__learning_rate': 5}
Mean recall, cross-val, is: 0.7
Mean roc_auc, cross-val, is: 0.416666666667


In [297]:
##Gradient Exp - test with opt dt 

gradient_exp_optdt = GradientBoostingClassifier(loss = 'exponential', learning_rate = .1, n_estimators = 200, max_depth = 2, min_samples_leaf = 2, min_samples_split = 2)
gradient_params = {'loss': ['deviance', 'exponential'], 'learning_rate': [.1, .3, .5, 1, 2, 5, 10], 'n_estimators': [10, 20, 50, 100, 200, 300, 400, 500]}
gs_gradient_optdt= GridSearchCV(gradient_exp_optdt, gradient_params, cv = 5, verbose = True, scoring = 'recall', n_jobs = -1)
gs_gradient_optdt.fit(X_train, y_train)
#not great at all...
print "Best recall score:", gs_gradient_optdt.best_score_
print "Best params:", gs_gradient_optdt.best_params_

scores_prelim['gs_Gradient_optdt'] = evaluate_model(gs_gradient_optdt.best_estimator_, X_train, y_train)


Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   17.4s finished


Best recall score: 0.833333333333
Best params: {'n_estimators': 500, 'loss': 'deviance', 'learning_rate': 5}
Mean recall, cross-val, is: 0.733333333333
Mean roc_auc, cross-val, is: 0.766666666667


In [298]:
#Gradient Exp - test without opt dt.

gradient_exp = GradientBoostingClassifier(loss = 'exponential', learning_rate = .1, n_estimators = 200)
gradient_params = {'loss': ['deviance', 'exponential'], 'learning_rate': [.1, .3, .5, 1, 2, 5, 10], 'n_estimators': [10, 20, 50, 100, 200, 300, 400, 500]}
gs_gradient= GridSearchCV(gradient_exp, gradient_params, cv = 5, verbose = True, scoring = 'recall', n_jobs = -1)
gs_gradient.fit(X_train, y_train)

print "Best recall score:", gs_gradient.best_score_
print "Best params:", gs_gradient.best_params_

scores_prelim['gs_Gradient'] = evaluate_model(gs_gradient.best_estimator_, X_train, y_train)


Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    4.1s


Best recall score: 0.833333333333
Best params: {'n_estimators': 10, 'loss': 'exponential', 'learning_rate': 0.1}
Mean recall, cross-val, is: 0.733333333333
Mean roc_auc, cross-val, is: 0.766666666667


[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   15.0s finished


In [299]:
#Best models are... gs_svc, gs_rf, rf, gs_adaboost, gs_rf_opt, gs_dtopt
pd.DataFrame(scores_prelim, columns = scores_prelim.keys(), index = ['recall', 'roc_auc']).T.sort_values('recall', ascending = False)

Unnamed: 0,recall,roc_auc
gs_svc,1.0,0.65
gs_rf,1.0,0.7
Random Forest,0.9,0.8
gs_adaboost,0.9,0.9
gs_rf_opt,0.9,0.816667
gs_dtopt,0.8,0.8
DT,0.8,0.8
gs_adaboost_own,0.8,0.85
AdaBoost 0.1,0.8,0.85
AdaBoost,0.8,0.85


In [301]:
test_scores = {}

#Based above above, I would probably use RF. What about test scores?
models = [gs_svc.best_estimator_, gs_rf.best_estimator_, rf, gs_adaboost.best_estimator_, gs_rf_opt.best_estimator_, gs_dtopt.best_estimator_]
model_names = ['SVC', 'RF Opt sans Opt DT Base', 'RF Unopt', 'Adaboost with Opt DT Base', 'Random Forest, Opt DT Base', 'DT Optimized']

for model, model_name in zip(models, model_names):
    print "Output on Test Data for Groundfloor 2014-15 ARV Tranches for Model %s" % model_name
    test_scores[model] = eval_test_data(model, X_train, y_train, X_test, y_test)
    print ""

Output on Test Data for Groundfloor 2014-15 ARV Tranches for Model SVC
Confusion Matrix
[[3 0]
 [3 0]]
             precision    recall  f1-score   support

        0.0       0.50      1.00      0.67         3
        1.0       0.00      0.00      0.00         3

avg / total       0.25      0.50      0.33         6

Predicted Default Rate: 0.0
Actual Default Rate for the Baseline: 0.5
Observed Default Rate for only lending to those you think are good credits: 0.5
ROC_AUC from test data is: 0.444444444444
Recall from test data is: 0.0

Output on Test Data for Groundfloor 2014-15 ARV Tranches for Model RF Opt sans Opt DT Base


  'precision', 'predicted', average, warn_for)


Confusion Matrix
[[1 2]
 [2 1]]
             precision    recall  f1-score   support

        0.0       0.33      0.33      0.33         3
        1.0       0.33      0.33      0.33         3

avg / total       0.33      0.33      0.33         6

Predicted Default Rate: 0.5
Actual Default Rate for the Baseline: 0.5
Observed Default Rate for only lending to those you think are good credits: 0.666666666667
ROC_AUC from test data is: 0.5
Recall from test data is: 0.333333333333

Output on Test Data for Groundfloor 2014-15 ARV Tranches for Model RF Unopt
Confusion Matrix
[[1 2]
 [2 1]]
             precision    recall  f1-score   support

        0.0       0.33      0.33      0.33         3
        1.0       0.33      0.33      0.33         3

avg / total       0.33      0.33      0.33         6

Predicted Default Rate: 0.5
Actual Default Rate for the Baseline: 0.5
Observed Default Rate for only lending to those you think are good credits: 0.666666666667
ROC_AUC from test data is: 0.555555

**Does not work like this either. In short, too little data despite ARV being interesting to have.**