In [96]:
import pandas as pd
import numpy as np
import time

np.random.seed(42)

X_train = pd.read_pickle('../data/X_train_v2.pkl')
y_train = pd.read_pickle('../data/y_train_v3.pkl')

_______________________________________________________________________________________________________________________________

## Build Models

In [97]:
#Import cross validation and optimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

#Import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

#Import models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from multiprocessing import cpu_count


In [98]:
#Create a dictionary to hold models
models = {}

In [99]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [100]:
#Create function to add model and metrics to dictionary using split data
def train_model(model, name):
    start = time.time()
    model.fit(X_train, y_train)
    end = time.time()
    models[name] = {'model': model,
                    'train_time': end-start,
                    'train_accuracy': model.score(X_train, y_train),
                    'test_accuracy': model.score(X_test, y_test),
                    'test_precision': precision_score(y_test, model.predict(X_test)),
                    'test_recall': recall_score(y_test, model.predict(X_test)),
                    'test_f1': f1_score(y_test, model.predict(X_test)),
                    'test_roc_auc': roc_auc_score(y_test, model.predict(X_test)),
                   }


In [101]:
#Instantiate models
rf_clf = RandomForestClassifier()
log_reg = LogisticRegression()
gbc = GradientBoostingClassifier()
svc_clf = SVC()
sgd_clf = SGDClassifier()
neigh_clf = KNeighborsClassifier()
dct_clf = DecisionTreeClassifier()
xgb_clf = XGBClassifier()

In [102]:
#Create a list of models
model_list = [rf_clf, log_reg, gbc, svc_clf, sgd_clf, neigh_clf, dct_clf, xgb_clf]

In [103]:
#iterate through list of models, evaluate, and add to dictionary
for model in model_list:
    train_model(model, model.__class__.__name__)

In [104]:
#For each metric, select the best model
for metric in ['train_accuracy', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1', 'test_roc_auc']:
    best_score = 0
    best_model = ''
    for model in models:
        if models[model][metric] > best_score:
            best_score = models[model][metric]
            best_model = model
    print(f'Best model for {metric} is {best_model} with a score of {best_score}')

Best model for train_accuracy is RandomForestClassifier with a score of 0.9789325842696629
Best model for test_accuracy is GradientBoostingClassifier with a score of 0.8156424581005587
Best model for test_precision is GradientBoostingClassifier with a score of 0.8153846153846154
Best model for test_recall is RandomForestClassifier with a score of 0.7702702702702703
Best model for test_f1 is RandomForestClassifier with a score of 0.7702702702702703
Best model for test_roc_auc is RandomForestClassifier with a score of 0.8041827541827542


In [105]:
models['RandomForestClassifier']

{'model': RandomForestClassifier(),
 'train_time': 0.19459104537963867,
 'train_accuracy': 0.9789325842696629,
 'test_accuracy': 0.8100558659217877,
 'test_precision': 0.7702702702702703,
 'test_recall': 0.7702702702702703,
 'test_f1': 0.7702702702702703,
 'test_roc_auc': 0.8041827541827542}

In [106]:
#Get confusion matrix for RandomForestClassifier
confusion_matrix(y_test, models['RandomForestClassifier']['model'].predict(X_test))

array([[88, 17],
       [17, 57]])

In [107]:
test = pd.read_csv('../data/test.csv')

#Create dataframe of test data
X_test = pd.read_pickle('../data/X_test_v2.pkl')

#Get shape of test data
X_test.shape

(418, 13)

In [108]:
#Make predictions
predictions = models['RandomForestClassifier']['model'].predict(X_test)


In [109]:
#Create submission dataframe
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})

#Save submission dataframe to csv
submission.to_csv('../data/submission.csv', index=False)