In [1]:
# PCA on the whole dataset
# Categorical variable for age
# Combining classifiers
# Voting Classifier
# Has an infant variable?

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
import warnings
import pandas as pd
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', -1)

In [4]:
%run 1_feature_engineering.py
%run 2_test_models.py

In [5]:
train_data = './data/titanic_train.csv'
test_data = './data/titanic_test.csv'

In [6]:
df = import_data(train_data,test_data)
df = feature_engineering(df)
df_train = df[( df['train'] == 1 )]
df_test = df[( df['test'] == 1 )]

In [68]:
sweet_features = [
    'pclass', 'is_alone', 'sex_0', 'sex_1', 'title_1', 'title_2', 'title_3', 
    'title_4', 'title_5', 'title_6', 'family_size_1', 'family_size_2', 'family_size_3', 
    'group_size_1', 'group_size_2', 'group_size_3', 'age_scaled'
]

In [81]:
features = df_train[sweet_features]
label = df_train['survived']
test_models(models, sweet_features, features, label, 5, './logs/log.csv')

Logistic Regression Performance: 83.05%
SVC Performance: 83.39%
Linear SVC Performance: 82.72%
Naive Bayes Performance: 81.7%
KNN Performance: 82.39%
Decision Tree Performance: 81.82%
Random Forest Performance: 82.61%
Gradient Boosting Performance: 83.73%
MLP Performance: 83.62%


#### SVM Hyperparameter Tuning

In [73]:
param_grid = {
    'kernel': ['rbf', 'sigmoid']
    'C': [1,10,100,1000],
    'gamma': ['auto', 1, 0.1, 0.001, 0.0001],
    'shrinking': [True, False]
}
model = GridSearchCV(
    SVC(),
    param_grid,
    verbose=1, 
    cv=5
)
model.fit(features, label)
print('Score: ', model.best_score_)
print('Params: ', model.best_params_)
print('Estimator: ', model.best_estimator_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Score:  0.8338945005611672
Params:  {'C': 1, 'gamma': 0.1}
Estimator:  SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    4.0s finished


#### Linear SVM Hyperparameter Tuning

In [93]:
param_grid = [{
    'penalty': ['l1'],
    'loss': ['squared_hinge'],
    'dual': [False],
    'C':[1,10,100,1000],
    'class_weight': ['balanced', None]      
}, {
    'penalty': ['l2'],
    'loss': ['hinge'],
    'dual': [True],
    'C':[1,10,100,1000],
    'class_weight': ['balanced', None]    
}]
model = GridSearchCV(
    LinearSVC(),
    param_grid,
    verbose=1, 
    cv=50
)

model.fit(features, label)
print('Score: ', model.best_score_)
print('Params: ', model.best_params_)
print('Estimator: ', model.best_estimator_)

Fitting 50 folds for each of 16 candidates, totalling 800 fits
Score:  0.8282828282828283
Params:  {'C': 1, 'class_weight': None, 'dual': False, 'loss': 'squared_hinge', 'penalty': 'l1'}
Estimator:  LinearSVC(C=1, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0)


[Parallel(n_jobs=1)]: Done 800 out of 800 | elapsed:   18.5s finished


#### Logistic Regression Hyperparameter Tuning

In [82]:
param_grid = {
    'C':[1,10,100,1000],
    'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'multi_class': ['ovr', 'multinomial']        
}
model = GridSearchCV(
    LogisticRegression(),
    param_grid,
    verbose=1, 
    cv=50
)

model.fit(features, label)
print('Score: ', model.best_score_)
print('Params: ', model.best_params_)
print('Estimator: ', model.best_estimator_)

Fitting 50 folds for each of 32 candidates, totalling 1600 fits
Score:  0.8305274971941639
Params:  {'C': 1, 'multi_class': 'ovr', 'solver': 'newton-cg'}
Estimator:  LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)


[Parallel(n_jobs=1)]: Done 1600 out of 1600 | elapsed:   59.7s finished


#### MLP Hyperparameter Tuning

In [76]:
param_grid = {
    'solver': ['lbfgs','adam'], 
    'max_iter': [500,1000,1500], 
    'alpha': 10.0 ** -np.arange(1, 7), 
    'hidden_layer_sizes': np.arange(3, 8)
}

model = GridSearchCV(
    MLPClassifier(),
    param_grid,
    verbose=1, 
    cv=3
)

model.fit(features, label)
print('Score: ', model.best_score_)
print('Params: ', model.best_params_)
print('Estimator: ', model.best_estimator_)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=1)]: Done 540 out of 540 | elapsed:  2.6min finished


Score:  0.8372615039281706
Params:  {'alpha': 0.1, 'hidden_layer_sizes': 4, 'max_iter': 500, 'solver': 'adam'}
Estimator:  MLPClassifier(activation='relu', alpha=0.1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=4, learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
