## Read in preprocessed data

In [1]:
import pandas as pd

# Load the preprocessed (v3) training features and target in one step.
# NOTE(review): unpickling runs arbitrary code, so only load files produced by this project.
X_train, y_train = (
    pd.read_pickle('../data/X_train_v3.pkl'),
    pd.read_pickle('../data/y_train_v3.pkl'),
)

_______________________________________________________________________________________________________________________________

## Build Models

In [2]:
#Import cross validation and optimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#Import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

#Import models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from multiprocessing import cpu_count

In [12]:
#Registry of evaluated models, keyed by model name (filled in by model_eval)
models = dict()

In [None]:
#Create function to add model and metrics to dictionary
def model_eval(model, name, X_train, y_train, cv):
    """Fit a classifier, score it, and record the results in ``models``.

    The estimator is fitted on the full training set and the headline metrics
    (accuracy, precision, recall, F1, ROC AUC) are computed on those same rows,
    so they are in-sample figures. Cross-validation results are stored
    separately under ``'cv_scores'`` and ``'cv_pred'``.

    Precision, recall and F1 are called with scikit-learn's defaults, which
    expect a binary target.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. If it also exposes
        ``predict_proba`` or ``decision_function``, that continuous output is
        used for ROC AUC.
    name : hashable
        Key under which the results are stored in the module-level ``models``
        dict (an existing entry with the same key is overwritten). It is also
        printed as the heading of the summary.
    X_train, y_train
        Training features and target.
    cv
        Forwarded unchanged as ``cv`` to ``cross_val_score`` and
        ``cross_val_predict``.

    Returns
    -------
    None
        Results are written to ``models[name]`` and a summary is printed.
    """
    #Fit model
    model.fit(X_train, y_train)

    #Create predictions (on the same data the model was just fitted on)
    y_pred = model.predict(X_train)

    #Create cross validation scores
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv)

    #Create cross validation predictions
    cv_pred = cross_val_predict(model, X_train, y_train, cv=cv)

    #ROC AUC needs a continuous score to rank samples; hard 0/1 labels reduce
    #the curve to a single threshold. Prefer the positive-class probability,
    #then the decision function, and only fall back to the predicted labels.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_train)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_train)
    else:
        y_score = y_pred

    #Create metrics
    accuracy = accuracy_score(y_train, y_pred)
    precision = precision_score(y_train, y_pred)
    recall = recall_score(y_train, y_pred)
    f1 = f1_score(y_train, y_pred)
    roc_auc = roc_auc_score(y_train, y_score)

    #Add metrics to dictionary
    models[name] = {'model': model,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'roc_auc': roc_auc,
                    'cv_scores': cv_scores,
                    'cv_pred': cv_pred}

    #Print metrics
    print(name)
    print('Accuracy: ', accuracy)
    print('Precision: ', precision)
    print('Recall: ', recall)
    #Blank separator line; a bare `print` without parentheses does nothing in Python 3
    print()

### Random Forest Classifier

In [8]:
#Create model with a fixed seed so the fitted forest is reproducible across runs
rf_clf = RandomForestClassifier(random_state=42)

#Fit the model
rf_clf.fit(X_train, y_train)


In [11]:
#Per-class precision/recall/F1 for the random forest, scored on the training rows
rf_clf_report = classification_report(
    y_true=y_train,
    y_pred=rf_clf.predict(X_train),
)
print(rf_clf_report)

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       549
           1       0.99      0.96      0.97       342

    accuracy                           0.98       891
   macro avg       0.98      0.98      0.98       891
weighted avg       0.98      0.98      0.98       891

