## Read in preprocessed data

In [18]:
import pandas as pd

X_train = pd.read_pickle('../data/X_train_v3.pkl')
y_train = pd.read_pickle('../data/y_train_v3.pkl')

_______________________________________________________________________________________________________________________________

## Build Models

In [19]:
#Import cross validation and optimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#Import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

#Import models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from multiprocessing import cpu_count

In [20]:
#Create a dictionary to hold models
models = {}

In [21]:
#Create function to add model and metrics to dictionary
def model_eval(model, name, X_train, y_train, cv):
    #Fit model
    model.fit(X_train, y_train)
    
    #Create predictions
    y_pred = model.predict(X_train)
    
    #Create cross validation scores
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv)
    
    #Create cross validation predictions
    cv_pred = cross_val_predict(model, X_train, y_train, cv=cv)
    
    #Create metrics
    accuracy = accuracy_score(y_train, y_pred)
    precision = precision_score(y_train, y_pred)
    recall = recall_score(y_train, y_pred)
    f1 = f1_score(y_train, y_pred)
    roc_auc = roc_auc_score(y_train, y_pred)
    
    #Add metrics to dictionary
    models[name] = {'model': model,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'roc_auc': roc_auc,
                    'cv_score': cv_scores.mean(),
                    }
    
    #Print metrics
    print(name)
    print('Accuracy: ', accuracy)
    print('Precision: ', precision)
    print('Recall: ', recall)
    print

In [22]:
#Instantiate models
rf_clf = RandomForestClassifier()
log_reg = LogisticRegression()
gbc = GradientBoostingClassifier()
svc_clf = SVC()
sgd_clf = SGDClassifier()
neigh_clf = KNeighborsClassifier()
dct_clf = DecisionTreeClassifier()
xgb_clf = XGBClassifier()

In [23]:
#Create a list of models
model_list = [rf_clf, log_reg, gbc, svc_clf, sgd_clf, neigh_clf, dct_clf, xgb_clf]

In [24]:
#iterate through list of models, evaluate, and add to dictionary
for model in model_list:
    model_eval(model, model.__class__.__name__, X_train, y_train, 5)

RandomForestClassifier
Accuracy:  0.9797979797979798
Precision:  0.9764705882352941
Recall:  0.9707602339181286
LogisticRegression
Accuracy:  0.8058361391694725
Precision:  0.7717041800643086
Recall:  0.7017543859649122
GradientBoostingClassifier
Accuracy:  0.8911335578002245
Precision:  0.9016393442622951
Recall:  0.804093567251462
SVC
Accuracy:  0.8372615039281706
Precision:  0.8272425249169435
Recall:  0.7280701754385965
SGDClassifier
Accuracy:  0.7755331088664422
Precision:  0.6972222222222222
Recall:  0.7339181286549707
KNeighborsClassifier
Accuracy:  0.8574635241301908
Precision:  0.8369905956112853
Recall:  0.7807017543859649
DecisionTreeClassifier
Accuracy:  0.9797979797979798
Precision:  0.9939024390243902
Recall:  0.9532163742690059
XGBClassifier
Accuracy:  0.9640852974186308
Precision:  0.9696969696969697
Recall:  0.935672514619883


In [25]:
models

{'RandomForestClassifier': {'model': RandomForestClassifier(),
  'accuracy': 0.9797979797979798,
  'precision': 0.9764705882352941,
  'recall': 0.9707602339181286,
  'f1': 0.9736070381231672,
  'roc_auc': 0.9780941424599752,
  'cv_scores': array([0.77653631, 0.80337079, 0.83146067, 0.78089888, 0.82022472]),
  'cv_pred': array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
         1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
         1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
         1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
         1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
         0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
         0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
     