In [31]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import matplotlib.pyplot as plt

In [32]:
# Read the two CSV artefacts this notebook works from: the preprocessed
# feature table and the combined data set that still carries the labels.
all_dummies_scaled, all_data = (
    pd.read_csv(csv_name)
    for csv_name in ("preprocessed_data.csv", "all_data.csv")
)

In [33]:
# Split the preprocessed frame into its train (train_test == 1) and
# test (train_test == 0) parts; the flag column itself is not a feature.
X_train_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 1].drop(columns=['train_test'])
X_test_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 0].drop(columns=['train_test'])

# The labels live in the separately loaded all_data frame.
# NOTE(review): this presumes both CSVs list the same rows in the same order,
# because scikit-learn pairs X and y by position, not by index - confirm
# against the notebook that wrote the files.
y_train = all_data[all_data.train_test == 1].Survived

# Fail early, with a clear message, if the two files do not describe the same
# number of training rows.
assert len(X_train_scaled) == len(y_train), (
    f"feature/label row mismatch: {len(X_train_scaled)} feature rows "
    f"vs {len(y_train)} labels"
)

# Model Building (Baseline Validation Performance)

Before going further, I like to see how various models perform with default parameters. I tried the following models using 5-fold cross-validation to get a baseline. With a validation-set baseline, we can see how much tuning improves each of the models. Just because a model has a high baseline on this validation set doesn't mean that it will actually do better on the eventual test set.

## Naive Bayes (75.3%)
## Logistic Regression (82.0%)
## Decision Tree (76.9%)
## K Nearest Neighbor (50.2%)
## Random Forest (80.9%)
## Support Vector Classifier (61.7%)
## Soft Voting Classifier - All Models except Decision Tree (77.6%)

In [34]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [35]:
# Gaussian Naive Bayes is my usual reference point for classification tasks;
# the other models are judged against its score.
gnb = GaussianNB()
cv = cross_val_score(estimator=gnb, X=X_train_scaled, y=y_train, cv=5)
# First line: the five per-fold scores; second line: their mean.
print(cv, cv.mean(), sep="\n")

[0.70224719 0.73595506 0.78089888 0.76404494 0.77966102]
0.7525614168729766


In [36]:
# Logistic regression baseline. max_iter is set to 1000 to give the solver
# plenty of iterations to converge.
lr = LogisticRegression(max_iter=1000)
cv = cross_val_score(estimator=lr, X=X_train_scaled, y=y_train, cv=5)
# First line: the five per-fold scores; second line: their mean.
print(cv, cv.mean(), sep="\n")

[0.81460674 0.80898876 0.80337079 0.82022472 0.85310734]
0.8200596711737447


In [37]:
# Decision tree baseline, scored with 5-fold cross-validation.
# random_state is pinned because the tree permutes the candidate features at
# every split, so equally good splits can be chosen differently from run to
# run. Without a seed the reported score drifts between executions; scores
# from earlier unseeded runs will differ slightly from the seeded one.
dt = DecisionTreeClassifier(random_state=1)
cv = cross_val_score(dt, X_train_scaled, y_train, cv=5)
print(cv)          # per-fold scores
print(cv.mean())   # mean score across the folds

[0.62359551 0.79775281 0.80337079 0.78651685 0.83615819]
0.7694788294293151


In [38]:
# k-nearest-neighbours baseline with default hyper-parameters.
# NOTE(review): the recorded mean score for this cell is about 0.50, far below
# the other baselines. KNN is distance-based, so presumably at least one column
# of X_train_scaled is on a much larger scale than the rest (e.g. an index or
# id column that survived the CSV round-trip) - TODO confirm.
knn = KNeighborsClassifier()
cv = cross_val_score(estimator=knn, X=X_train_scaled, y=y_train, cv=5)
# First line: the five per-fold scores; second line: their mean.
print(cv, cv.mean(), sep="\n")

[0.61797753 0.38202247 0.46067416 0.43258427 0.61581921]
0.5018155272011681


In [39]:
# Random forest baseline, scored with 5-fold cross-validation.
# random_state is pinned because the forest bootstraps rows and samples
# features at random. Without a seed every run yields a slightly different
# score, so earlier unseeded results will not match the seeded one exactly.
rf = RandomForestClassifier(random_state=1)
cv = cross_val_score(rf, X_train_scaled, y_train, cv=5)
print(cv)          # per-fold scores
print(cv.mean())   # mean score across the folds

[0.80337079 0.79213483 0.8258427  0.78089888 0.84180791]
0.8088110201231512


In [40]:
# Support vector classifier baseline, scored with 5-fold cross-validation.
# probability=True enables predict_proba, which a soft-voting ensemble built
# on this estimator needs. The probability estimates come from an internal,
# randomised cross-validation, so random_state is pinned to keep anything
# derived from them reproducible.
# NOTE(review): the recorded fold scores for this cell are all about 0.618,
# which looks like the model predicting a single class for every row.
# Presumably a feature in X_train_scaled is not actually scaled - TODO confirm.
svc = SVC(probability=True, random_state=1)
cv = cross_val_score(svc, X_train_scaled, y_train, cv=5)
print(cv)          # per-fold scores
print(cv.mean())   # mean score across the folds

[0.61797753 0.61797753 0.61797753 0.61797753 0.61581921]
0.6175458642798197


In [41]:
# A voting classifier combines the predictions of several estimators.
#   * "hard" voting: every estimator casts one yes/no vote and the majority
#     wins, so an odd number of voters is preferable to avoid ties.
#   * "soft" voting: the estimators' predicted probabilities are averaged; if
#     the average confidence that a sample is a 1 exceeds 50%, it is labelled 1.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', lr),
        ('knn', knn),
        ('rf', rf),
        ('gnb', gnb),
        ('svc', svc),
    ],
    voting='soft',
)
cv = cross_val_score(estimator=voting_clf, X=X_train_scaled, y=y_train, cv=5)
# First line: the five per-fold scores; second line: their mean.
print(cv, cv.mean(), sep="\n")

[0.7247191  0.78089888 0.79213483 0.78089888 0.80225989]
0.7761823144797816
