In [100]:
# PCA on the whole dataset
# Categorical variable for age
# Combining classifiers
# Voting Classifier
# Has an infant variable?

In [101]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [102]:
import warnings
import pandas as pd
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', -1)

In [108]:
%run 1_feature_engineering.py
%run 2_test_models.py

In [109]:
train_data = './data/titanic_train.csv'
test_data = './data/titanic_test.csv'

In [110]:
df = import_data(train_data,test_data)
df = feature_engineering(df)
df_train = df[( df['train'] == 1 )]
df_test = df[( df['test'] == 1 )]

In [111]:
sweet_features = [
    'pclass', 'is_alone', 'sex_0', 'sex_1', 'title_1', 'title_2', 'title_3', 
    'title_4', 'title_5', 'title_6', 'family_size_1', 'family_size_2', 'family_size_3', 
    'group_size_1', 'group_size_2', 'group_size_3', 'age_scaled'
]

In [115]:
features = df_train[sweet_features]
label = df_train['survived']
test_models(models, sweet_features, features, label, 50, './logs/log.csv')

Logistic Regression Mean: 83.08%
Logistic Regression STD: 7.895
Logistic Regression Min: 66.67%
Logistic Regression Max: 100.0%
SVC Mean: 83.53%
SVC STD: 8.045
SVC Min: 66.67%
SVC Max: 100.0%
Linear SVC Mean: 82.86%
Linear SVC STD: 7.565
Linear SVC Min: 66.67%
Linear SVC Max: 100.0%
Naive Bayes Mean: 82.75%
Naive Bayes STD: 7.845
Naive Bayes Min: 66.67%
Naive Bayes Max: 100.0%
KNN Mean: 82.87%
KNN STD: 8.835
KNN Min: 61.11%
KNN Max: 100.0%
Decision Tree Mean: 82.07%
Decision Tree STD: 8.525
Decision Tree Min: 61.11%
Decision Tree Max: 100.0%
Random Forest Mean: 82.07%
Random Forest STD: 9.015
Random Forest Min: 61.11%
Random Forest Max: 100.0%
Gradient Boosting Mean: 83.65%
Gradient Boosting STD: 8.525
Gradient Boosting Min: 66.67%
Gradient Boosting Max: 100.0%
MLP Mean: 83.53%
MLP STD: 7.965
MLP Min: 66.67%
MLP Max: 100.0%


#### SVM Hyperparameter Tuning

In [116]:
param_grid = {
    'kernel': ['rbf', 'sigmoid'],
    'C': [1,10,100,1000],
    'gamma': ['auto', 1, 0.1, 0.001, 0.0001],
    'shrinking': [True, False],
    'decision_function_shape': ['ovr', 'ovo']
}
model = GridSearchCV(
    SVC(),
    param_grid,
    verbose=1, 
    cv=50
)
model.fit(features, label)
print('Score: ', model.best_score_)
print('Params: ', model.best_params_)
print('Estimator: ', model.best_estimator_)

Fitting 50 folds for each of 160 candidates, totalling 8000 fits
Score:  0.8383838383838383
Params:  {'C': 1, 'decision_function_shape': 'ovr', 'gamma': 1, 'kernel': 'rbf', 'shrinking': True}
Estimator:  SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


[Parallel(n_jobs=1)]: Done 8000 out of 8000 | elapsed:  8.9min finished


#### Linear SVM Hyperparameter Tuning

In [93]:
param_grid = [{
    'penalty': ['l1'],
    'loss': ['squared_hinge'],
    'dual': [False],
    'C':[1,10,100,1000],
    'class_weight': ['balanced', None]      
}, {
    'penalty': ['l2'],
    'loss': ['hinge'],
    'dual': [True],
    'C':[1,10,100,1000],
    'class_weight': ['balanced', None]    
}]
model = GridSearchCV(
    LinearSVC(),
    param_grid,
    verbose=1, 
    cv=50
)

model.fit(features, label)
print('Score: ', model.best_score_)
print('Params: ', model.best_params_)
print('Estimator: ', model.best_estimator_)

Fitting 50 folds for each of 16 candidates, totalling 800 fits
Score:  0.8282828282828283
Params:  {'C': 1, 'class_weight': None, 'dual': False, 'loss': 'squared_hinge', 'penalty': 'l1'}
Estimator:  LinearSVC(C=1, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0)


[Parallel(n_jobs=1)]: Done 800 out of 800 | elapsed:   18.5s finished


#### Logistic Regression Hyperparameter Tuning

In [82]:
param_grid = {
    'C':[1,10,100,1000],
    'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'multi_class': ['ovr', 'multinomial']        
}
model = GridSearchCV(
    LogisticRegression(),
    param_grid,
    verbose=1, 
    cv=50
)

model.fit(features, label)
print('Score: ', model.best_score_)
print('Params: ', model.best_params_)
print('Estimator: ', model.best_estimator_)

Fitting 50 folds for each of 32 candidates, totalling 1600 fits
Score:  0.8305274971941639
Params:  {'C': 1, 'multi_class': 'ovr', 'solver': 'newton-cg'}
Estimator:  LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)


[Parallel(n_jobs=1)]: Done 1600 out of 1600 | elapsed:   59.7s finished


#### MLP Hyperparameter Tuning

In [76]:
param_grid = {
    'solver': ['lbfgs','adam'], 
    'max_iter': [500,1000,1500], 
    'alpha': 10.0 ** -np.arange(1, 7), 
    'hidden_layer_sizes': np.arange(3, 8)
}

model = GridSearchCV(
    MLPClassifier(),
    param_grid,
    verbose=1, 
    cv=3
)

model.fit(features, label)
print('Score: ', model.best_score_)
print('Params: ', model.best_params_)
print('Estimator: ', model.best_estimator_)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=1)]: Done 540 out of 540 | elapsed:  2.6min finished


Score:  0.8372615039281706
Params:  {'alpha': 0.1, 'hidden_layer_sizes': 4, 'max_iter': 500, 'solver': 'adam'}
Estimator:  MLPClassifier(activation='relu', alpha=0.1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=4, learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)


#### Gradient Boosting Hyperparameter Tuning

In [125]:
param_grid = {
    'learning_rate': [0.1, 0.05, 0.02, 0.01],
    'max_depth': [4, 6, 8],
    'min_samples_leaf': [20, 50,100,150],
    'max_features': [1.0, 0.3, 0.1] 
}

model = GridSearchCV(
    GradientBoostingClassifier(),
    param_grid,
    verbose=1, 
    cv=10
)

model.fit(features, label)
print('Score: ', model.best_score_)
print('Params: ', model.best_params_)
print('Estimator: ', model.best_estimator_)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Score:  0.8518518518518519
Params:  {'learning_rate': 0.1, 'max_depth': 6, 'max_features': 1.0, 'min_samples_leaf': 20}
Estimator:  GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features=1.0, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=20, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


[Parallel(n_jobs=1)]: Done 1440 out of 1440 | elapsed:  1.8min finished


#### Final Models

In [118]:
# Logistic Model

model = LogisticRegression(
    C=1, 
    multi_class='ovr',
    solver='newton-cg'
)

model.fit(features, label)

df_sub = pd.DataFrame()
df_sub['PassengerId'] = df_test['passengerid']
df_sub['Survived'] = model.predict(df_test[sweet_features])
sub_path = './data/submission_logistic.csv'
df_sub.to_csv(sub_path, index=False)

In [123]:
# SVC Model

model = SVC(
    C=1, 
    decision_function_shape='ovr',
    gamma=1,
    kernel='rbf',
    shrinking=True
)

model.fit(features, label)

df_sub = pd.DataFrame()
df_sub['PassengerId'] = df_test['passengerid']
df_sub['Survived'] = model.predict(df_test[sweet_features])
sub_path = './data/submission_SVC.csv'
df_sub.to_csv(sub_path, index=False)

In [122]:
# Linear SVM Model

model = LinearSVC(
    C=1,
    dual=False,
    loss='squared_hinge',
    penalty='l1'
)

model.fit(features, label)

df_sub = pd.DataFrame()
df_sub['PassengerId'] = df_test['passengerid']
df_sub['Survived'] = model.predict(df_test[sweet_features])
sub_path = './data/submission_linearSVC.csv'
df_sub.to_csv(sub_path, index=False)

In [None]:
# Gradient Boosting Classifier

model = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=6,
    min_samples_leaf=20,
    max_features=1.0,
)

model.fit(features, label)

df_sub = pd.DataFrame()
df_sub['PassengerId'] = df_test['passengerid']
df_sub['Survived'] = model.predict(df_test[sweet_features])
sub_path = './data/submission_GBC.csv'
df_sub.to_csv(sub_path, index=False)