In [113]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 50

%matplotlib notebook

In [2]:
train = pd.read_csv('data/preprocessed_train.csv')

test = pd.read_csv('data/preprocessed_test.csv')

submission = pd.DataFrame({'PassengerId': pd.read_csv('data/test.csv')['PassengerId']})

In [3]:
y = train['Survived']

train = train.drop(columns='Survived')

train.head()

Unnamed: 0,Pclass,Age,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other,is_alone
0,1,0.271174,0.014151,0,1,0,0,1,0,0,1,0,0,0
1,3,0.472229,0.139136,1,0,1,0,0,0,0,0,1,0,0
2,1,0.321438,0.015469,1,0,0,0,1,0,1,0,0,0,1
3,3,0.434531,0.103644,1,0,0,0,1,0,0,0,1,0,0
4,1,0.434531,0.015713,0,1,0,0,1,0,0,1,0,0,1


#### Dive data in train and validation: on train tune hyperparams and on validation check tuned model

In [4]:
X_train, X_val, y_train, y_val = train_test_split(train, y, stratify=y, test_size=0.33)

In [5]:
X_train = X_train.reset_index().drop(columns='index')

X_val = X_val.reset_index().drop(columns='index')

In [6]:
y_train = y_train.reset_index().drop(columns='index')

y_val = y_val.reset_index().drop(columns='index')

In [7]:
skf = StratifiedKFold(n_splits=5, random_state=42)

### Logistic Regression

In [10]:
%%time


model = LogisticRegression(random_state=42, n_jobs=-1)

params = {
    'C': [0.45, 0.5, 0.55],
    'solver': ['liblinear'],
}

grid_cv = GridSearchCV(model, params, cv=skf, scoring='accuracy')

grid_cv.fit(X_train, y_train)

print('Best score on CV: ' + str(grid_cv.best_score_))
print('Params of best model: ' + str(grid_cv.best_params_))

Best score on CV: 0.8036912751677853
Params of best model: {'C': 0.55, 'solver': 'liblinear'}
Wall time: 132 ms


In [11]:
log_reg = LogisticRegression(C=0.5, solver='liblinear',class_weight=None, random_state=42, n_jobs=-1)

log_reg.fit(X_train, y_train)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
print(accuracy_score(y_true=y_train, y_pred=log_reg.predict(X_train)))

print(accuracy_score(y_true=y_val, y_pred=log_reg.predict(X_val)))

0.8104026845637584
0.8271186440677966


In [13]:
confusion_matrix(y_val, log_reg.predict(X_val))

array([[158,  24],
       [ 27,  86]], dtype=int64)

In [14]:
log_reg = LogisticRegression(C=0.5, solver='liblinear', class_weight=None, random_state=42, n_jobs=-1)

log_reg.fit(train, y)

# submission['Survived'] = log_reg.predict(test)

# submission.to_csv('predictions/submission_log_reg.csv', index=False)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### SVM

In [15]:
%%time


model = SVC(random_state=42)

params = {
    'C': [2, 8, 9, 10, 11, 12],
    'kernel':  ('linear', 'poly', 'rbf', 'sigmoid'),
    'degree': (2, 3, 4, 5),
}

grid_cv = GridSearchCV(model, params, cv=skf, scoring='accuracy')

grid_cv.fit(X_train, y_train)

print('Best score on CV: ' + str(grid_cv.best_score_))
print('Params of best model: ' + str(grid_cv.best_params_))

Best score on CV: 0.8221476510067114
Params of best model: {'C': 10, 'degree': 2, 'kernel': 'poly'}
Wall time: 9.56 s


In [28]:
svc = SVC(kernel='poly', degree=2, C=4, random_state=42)

svc.fit(X_train, y_train)

SVC(C=4, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False)

In [29]:
print(accuracy_score(y_true=y_train, y_pred=svc.predict(X_train)))

print(accuracy_score(y_true=y_val, y_pred=svc.predict(X_val)))

0.8288590604026845
0.8135593220338984


In [30]:
confusion_matrix(y_val, svc.predict(X_val))

array([[170,  12],
       [ 43,  70]], dtype=int64)

In [32]:
svc = SVC(kernel='poly', degree=2, C=4, random_state=42)

svc.fit(train, y)

submission['Survived'] = svc.predict(test)

submission.to_csv('predictions/submission_svc.csv', index=False)

### RandomForest

In [33]:
%%time


model = RandomForestClassifier(random_state=42, n_jobs=-1)

params = {
    'n_estimators':[15, 25, 35, 45],
    'max_depth':[8],
    'min_samples_leaf': [10],
}

grid_cv = GridSearchCV(model, params, cv=skf, scoring='accuracy')

grid_cv.fit(X_train, y_train)

print('Best score on CV: ' + str(grid_cv.best_score_))
print('Params of best model: ' + str(grid_cv.best_params_))

Best score on CV: 0.7919463087248322
Params of best model: {'max_depth': 8, 'n_estimators': 15, 'min_samples_leaf': 10}
Wall time: 7.32 s


In [98]:
forest = RandomForestClassifier(min_samples_leaf=10, n_estimators=30, max_depth=8, random_state=42, n_jobs=-1)

forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [99]:
print(accuracy_score(y_true=y_train, y_pred=forest.predict(X_train)))

print(accuracy_score(y_true=y_val, y_pred=forest.predict(X_val)))

0.8288590604026845
0.8271186440677966


In [100]:
confusion_matrix(y_val, forest.predict(X_val))

array([[170,  12],
       [ 39,  74]], dtype=int64)

In [101]:
forest = RandomForestClassifier(min_samples_leaf=10, n_estimators=30, max_depth=8, random_state=42, n_jobs=-1)

forest.fit(train, y)

submission['Survived'] = forest.predict(test)

submission.to_csv('predictions/submission_forest.csv', index=False)

### ExtraTrees

In [274]:
%%time


model = ExtraTreesClassifier(random_state=42, n_jobs=-1)

params = {
    'n_estimators':[45, 50, 55, 60],
    'max_depth':[8],
    'min_samples_leaf': [2, 3, 5],
    'min_samples_split': [2,3,4,5]
}

grid_cv = GridSearchCV(model, params, cv=skf, scoring='accuracy')

grid_cv.fit(X_train, y_train)

print('Best score on CV: ' + str(grid_cv.best_score_))
print('Params of best model: ' + str(grid_cv.best_params_))

Best score on CV: 0.8356741573033708
Params of best model: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 8}
Wall time: 1min 34s


In [110]:
extra = ExtraTreesClassifier(min_samples_leaf=10, n_estimators=20, max_depth=8, random_state=42)

extra.fit(X_train, y_train)

print(accuracy_score(y_true=y_train, y_pred=extra.predict(X_train)))

print(accuracy_score(y_true=y_val, y_pred=extra.predict(X_val)))

0.8305369127516778
0.8305084745762712


In [111]:
confusion_matrix(y_val, extra.predict(X_val))

array([[166,  16],
       [ 34,  79]], dtype=int64)

In [112]:
extra = ExtraTreesClassifier(min_samples_leaf=10, n_estimators=20, max_depth=8, random_state=42)

extra.fit(train, y)

submission['Survived'] = extra.predict(test)

submission.to_csv('predictions/submission_extra.csv', index=False)

### XGBoost Classifier

In [120]:
%%time


model = XGBClassifier(n_jobs=-1, random_state=42, silent=False)
params = {
    'n_estimators': [20, 30, 50],
    'max_depth': [8],
    'learning_rate': [0.2],
    'booster': ['gbtree'],
    'eval_metric': 'error'
}

grid_cv = GridSearchCV(model, params, cv=skf, scoring='accuracy')

grid_cv.fit(X_train, y_train)

print('Best score on CV: ' + str(grid_cv.best_score_))
print('Params of best model: ' + str(grid_cv.best_params_))

[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning e

[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning e

[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning e

[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=8
[17:59:29] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning e

[17:59:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 36 extra nodes, 0 pruned nodes, max_depth=8
[17:59:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=8
[17:59:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=8
[17:59:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=8
[17:59:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=7
[17:59:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=7
[17:59:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning e

[17:59:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=8
[17:59:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=8
[17:59:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=8
[17:59:30] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning e

In [454]:
xgboost = XGBClassifier(n_estimators=15, max_depth=5, tree_method='hist', 
                        learning_rate=0.15, min_split_loss=4, min_child_weight=5,
                        random_state=42, n_jobs=-1, reg_lambda=5, reg_alpha=2.5)

xgboost.fit(X_train, y_train)

print(accuracy_score(y_true=y_train, y_pred=xgboost.predict(X_train)))

print(accuracy_score(y_true=y_val, y_pred=xgboost.predict(X_val)))

[19:01:32] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.8221476510067114
0.823728813559322


In [455]:
confusion_matrix(y_val, xgboost.predict(X_val))

array([[159,  23],
       [ 29,  84]], dtype=int64)

In [456]:
xgboost = XGBClassifier(n_estimators=15, max_depth=5, tree_method='hist', 
                        learning_rate=0.15, min_split_loss=4, min_child_weight=5,
                        random_state=42, n_jobs=-1, reg_lambda=5, reg_alpha=2.5)

xgboost.fit(train, y)

submission['Survived'] = xgboost.predict(test)

submission.to_csv('predictions/submission_xgboost.csv', index=False)

[19:02:03] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.


### Blending

In [464]:
for dataset in [X_train, X_val, train, test]:    
    dataset['log_reg'] = log_reg.predict(dataset[['Pclass', 'Age', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C',
                                                   'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss', 'Title_Mr',
                                                   'Title_Mrs', 'Title_Other', 'is_alone']])
    dataset['svc'] = svc.predict(dataset[['Pclass', 'Age', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C',
                                          'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss', 'Title_Mr',
                                          'Title_Mrs', 'Title_Other', 'is_alone']])
    dataset['forest'] = forest.predict(dataset[['Pclass', 'Age', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C',
                                                'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss', 'Title_Mr',
                                                'Title_Mrs', 'Title_Other', 'is_alone']])
    dataset['extra'] = extra.predict(dataset[['Pclass', 'Age', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C',
                                              'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss', 'Title_Mr',
                                              'Title_Mrs', 'Title_Other', 'is_alone']])
    dataset['xgboost'] = xgboost.predict(dataset[['Pclass', 'Age', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C',
                                                  'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss', 'Title_Mr',
                                                  'Title_Mrs', 'Title_Other', 'is_alone']])

In [516]:
meta_alg = RandomForestClassifier(min_samples_leaf=10, n_estimators=20, max_depth=8, random_state=42, n_jobs=-1)

meta_alg.fit(X_train, y_train)

print(accuracy_score(y_true=y_train, y_pred=meta_alg.predict(X_train)))

print(accuracy_score(y_true=y_val, y_pred=meta_alg.predict(X_val)))

0.8473154362416108
0.8440677966101695


In [517]:
confusion_matrix(y_val, meta_alg.predict(X_val))

array([[173,   9],
       [ 37,  76]], dtype=int64)

In [524]:
meta_alg = RandomForestClassifier(min_samples_leaf=10, n_estimators=20, max_depth=8, random_state=42, n_jobs=-1)

meta_alg.fit(train, y)

submission['Survived'] = meta_alg.predict(test)

submission.to_csv('predictions/submission_blending.csv', index=False)