## Read in preprocessed data

In [90]:
import pandas as pd
import numpy as np
import time

np.random.seed(42)

X_train = pd.read_pickle('../data/X_train_v3.pkl')
y_train = pd.read_pickle('../data/y_train_v3.pkl')

_______________________________________________________________________________________________________________________________

## Build Models

In [91]:
#Import cross validation and optimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

#Import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

#Import models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from multiprocessing import cpu_count


In [92]:
#Create a dictionary to hold models
models = {}

In [93]:
#Create function to add model and metrics to dictionary
def model_eval(model, name, X_test, y_test, cv):
    #Fit model
    model.fit(X_train, y_train)
    
    #Create predictions
    y_pred = model.predict(X_train)
    
    #Create cross validation scores
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, n_jobs=cpu_count())
    
    #Create cross validation predictions
    cv_pred = cross_val_predict(model, X_train, y_train, cv=cv, n_jobs=cpu_count())
    
    #Create metrics
    accuracy = accuracy_score(y_train, y_pred)
    precision = precision_score(y_train, y_pred)
    recall = recall_score(y_train, y_pred)
    f1 = f1_score(y_train, y_pred)
    roc_auc = roc_auc_score(y_train, y_pred)
    
    #Add metrics to dictionary
    models[name] = {'model': model,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'roc_auc': roc_auc,
                    'cv_score': cv_scores.mean(),
                    }
    


In [94]:
#Instantiate models
rf_clf = RandomForestClassifier()
log_reg = LogisticRegression()
gbc = GradientBoostingClassifier()
svc_clf = SVC()
sgd_clf = SGDClassifier()
neigh_clf = KNeighborsClassifier()
dct_clf = DecisionTreeClassifier()
xgb_clf = XGBClassifier()

In [95]:
#Create a list of models
model_list = [rf_clf, log_reg, gbc, svc_clf, sgd_clf, neigh_clf, dct_clf, xgb_clf]

In [96]:
#iterate through list of models, evaluate, and add to dictionary
for model in model_list:
    model_eval(model, model.__class__.__name__, X_train, y_train, 10)

In [97]:
#For each metric, select the best model
for metric in ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'cv_score']:
    best_score = 0
    for model in models:
        if models[model][metric] > best_score:
            best_score = models[model][metric]
            best_model = model
    print('Best {}: {}'.format(metric, best_model))

Best accuracy: RandomForestClassifier
Best precision: DecisionTreeClassifier
Best recall: RandomForestClassifier
Best f1: RandomForestClassifier
Best roc_auc: RandomForestClassifier
Best cv_score: GradientBoostingClassifier


In [98]:
print(models['RandomForestClassifier'])

{'model': RandomForestClassifier(), 'accuracy': 0.9797979797979798, 'precision': 0.990909090909091, 'recall': 0.956140350877193, 'f1': 0.9732142857142858, 'roc_auc': 0.9753379350014381, 'cv_score': 0.8081772784019975}


In [99]:
#Get confusion matrix for RandomForestClassifier
y_pred = models['RandomForestClassifier']['model'].predict(X_train)
confusion_matrix(y_train, y_pred)

array([[546,   3],
       [ 15, 327]])

In [100]:
#Get models from dict with roc_auc > 0.95
best_models = {}
for model in models:
    if models[model]['roc_auc'] > 0.95:
        best_models[model] = models[model]


In [101]:
#Print model and roc_auc
for model in best_models:
    print(model)
    print('roc_auc: ', best_models[model]['roc_auc'])
    print

RandomForestClassifier
roc_auc:  0.9753379350014381
DecisionTreeClassifier
roc_auc:  0.9747866935097307
XGBClassifier
roc_auc:  0.95872878918608


In [102]:
#Print accuracy score for models
for model in models:
    print(model)
    print('accuracy: ', models[model]['accuracy'])

RandomForestClassifier
accuracy:  0.9797979797979798
LogisticRegression
accuracy:  0.8058361391694725
GradientBoostingClassifier
accuracy:  0.8911335578002245
SVC
accuracy:  0.8372615039281706
SGDClassifier
accuracy:  0.6464646464646465
KNeighborsClassifier
accuracy:  0.8574635241301908
DecisionTreeClassifier
accuracy:  0.9797979797979798
XGBClassifier
accuracy:  0.9640852974186308


In [103]:
#Print f1 score for models
for model in models:
    print(model)
    print('f1: ', models[model]['f1'])

RandomForestClassifier
f1:  0.9732142857142858
LogisticRegression
f1:  0.7350689127105667
GradientBoostingClassifier
f1:  0.8500772797527048
SVC
f1:  0.7744945567651633
SGDClassifier
f1:  0.6181818181818182
KNeighborsClassifier
f1:  0.8078668683812406
DecisionTreeClassifier
f1:  0.973134328358209
XGBClassifier
f1:  0.9523809523809523


In [104]:
#Print cv_score for each model
for model in models:
    print(model)
    print('cv_score: ', models[model]['cv_score'])
    print

RandomForestClassifier
cv_score:  0.8081772784019975
LogisticRegression
cv_score:  0.8002372034956304
GradientBoostingClassifier
cv_score:  0.8294506866416977
SVC
cv_score:  0.8226966292134831
SGDClassifier
cv_score:  0.7665043695380774
KNeighborsClassifier
cv_score:  0.8069912609238452
DecisionTreeClassifier
cv_score:  0.7901498127340825
XGBClassifier
cv_score:  0.8125967540574284


In [105]:
#Make predictions on test set
X_test = pd.read_pickle('../data/X_test_v3.pkl')

y_pred = rf_clf.predict(X_test)

In [106]:
test = pd.read_csv('../data/test.csv')

#Create submission file
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': y_pred})

submission.to_csv('../data/submission_3.csv', index=False)