In [1]:
# Common stuff

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
from pylab import rcParams
rcParams['figure.figsize'] = (9, 6)

In [2]:
# Get our data and preprocess it

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

import re

test_passenger_id = test['PassengerId']

RARE_TITLES = ['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

TITLES = dict((title, 'Rare') for title in RARE_TITLES)

TITLES['Mlle'] = 'Miss'
TITLES['Ms'] = 'Miss'
TITLES['Mme'] = 'Mrs'

def get_title(name):    
    match = re.search(' ([A-Za-z]+)\.', name)
    if match:        
        return TITLES.get(match.group(1), match.group(1))
    return ''

for dataset in [test, train]:    
    dataset['Name_Length'] = train['Name'].apply(len)    
    dataset['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)    
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1    
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1    
    dataset['Embarked'] = dataset['Embarked'].fillna('S')    
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)    
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())    
    dataset['Age'] = dataset['Age'].fillna(train['Age'].median())    
    dataset['Title'] = dataset['Name'].apply(get_title)
    dataset['Title'] = dataset['Title'].map( {'Mr': 1, 
                                              'Miss': 2, 
                                              'Mrs': 3,
                                              'Master': 4,
                                              'Rare': 5} )    
    dataset['Title'] = dataset['Title'].fillna(0)    
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)    
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)    
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[dataset['Age'] > 64, 'Age']
    
DROP_COL = ['PassengerId', 'Name', 'Ticket', 'Cabin']

X_train = train.drop(DROP_COL + ['Survived'], axis=1)
y_train = train['Survived']
X_test = test.drop(DROP_COL, axis=1)

display(X_train.head())
display(X_test.head())

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Name_Length,Has_Cabin,FamilySize,IsAlone,Title
0,3,1,1,1,0,0,0,23,0,2,0,1
1,1,0,2,1,0,3,1,51,1,2,0,3
2,3,0,1,0,0,1,0,22,0,1,1,2
3,1,0,2,1,0,3,0,44,1,2,0,3
4,3,1,2,0,0,1,0,24,0,1,1,1


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Name_Length,Has_Cabin,FamilySize,IsAlone,Title
0,3,1,2,0,0,0,2,23,0,1,1,1
1,3,0,2,1,0,0,0,51,1,2,0,3
2,2,1,3,0,0,1,2,22,0,1,1,1
3,3,1,1,0,0,1,0,44,1,1,1,1
4,3,0,1,1,1,1,0,24,0,3,0,3


In [3]:
# Implement XGBoost classifier creator which trains and
# returns the best classifier

import xgboost

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score

def create_xgboost_classifier(X_train, y_train, rs):
    ones_ratio = y_train[y_train == 1].shape[0] * 1.0 / y_train[y_train == 0].shape[0]
    
    param_grid = {    
        'n_estimators': [10, 30, 50, 100, 200, 400, 600, 1000],
        'learning_rate': [0.1],        
        'max_depth': [5],
        'min_child_weight': [2],
        'gamma': [0.1],
        'subsample': [0.8],
        'colsample_bytree': [0.8],
        'scale_pos_weight': [ones_ratio],        
        'reg_alpha': [0.0],
        'reg_lambda': [1.0],
        'random_state' : [rs + 234]
    }

    cv = KFold(n_splits=4, shuffle=True)

    clf = xgboost.XGBClassifier()
    gs = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1)

    gs.fit(X_train, y_train)
    best_params = gs.best_estimator_.get_params()

    param_grid = {
        'max_depth': range(3, 10, 2),
        'min_child_weight': range(1, 6, 2)
    }

    clf = xgboost.XGBClassifier(**best_params)

    gs = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1)

    gs.fit(X_train, y_train)
    best_params = gs.best_estimator_.get_params()

    param_grid = {
        'gamma': [0.1*i for i in range(6)]
    }

    clf = xgboost.XGBClassifier(**best_params)

    gs = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1)

    gs.fit(X_train, y_train)
    best_params = gs.best_estimator_.get_params()

    param_grid = {
        'subsample': [0.5 + 0.1*i for i in range(6)],
        'colsample_bytree': [0.5 + 0.1*i for i in range(6)]
    }

    clf = xgboost.XGBClassifier(**best_params)

    gs = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1)

    gs.fit(X_train, y_train)
    best_params = gs.best_estimator_.get_params()

    param_grid = {
        'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
        'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100]
    }

    clf = xgboost.XGBClassifier(**best_params)

    gs = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1)

    gs.fit(X_train, y_train)
    best_params = gs.best_estimator_.get_params()

    clf = xgboost.XGBClassifier(**best_params)
    best_n_estimators = clf.get_params()['n_estimators']
    best_learning_rate = best_params['learning_rate']
    invariant_composition = best_n_estimators * best_learning_rate
    n_estimators_range = [10, 30, 100, 200, 400, 600, 800, 1000]

    best_score = gs.best_score_

    for n_estimators in n_estimators_range:
        learning_rate = invariant_composition / n_estimators
        clf.set_params(n_estimators=n_estimators, learning_rate=learning_rate)
        scores = []
        for train_idx, test_idx in cv.split(X_train):
            X_train_fold, X_test_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
            y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]
            clf.fit(X_train_fold, y_train_fold)
            preds = clf.predict_proba(X_test_fold)
            score = roc_auc_score(y_test_fold, preds[:,1])
            scores.append(score)
        score = np.mean(score)
        if score > best_score:
            best_n_estimators = n_estimators
            best_learning_rate = learning_rate
            best_score = score

    best_params['n_estimators'] = best_n_estimators
    best_params['learning_rate'] = best_learning_rate
    
    return xgboost.XGBClassifier(**best_params)

In [4]:
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

def cross_val_predict_proba(estimator, X_train, y_train, X_test, random_state=None, n_splits=5):    
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    
    y_test = np.zeros((len(X_test), n_splits), np.float32)
    
    kfold = KFold(n_splits=n_splits, 
                  shuffle=True,
                  random_state=random_state)

    y_predict = np.zeros_like(y_train, np.float32)
    for i, (train_idx, test_idx) in enumerate(kfold.split(y_train)):
        estimator.fit(X_train[train_idx], y_train[train_idx])
        y_predict[test_idx] = estimator.predict_proba(X_train[test_idx])[:, 1]
        y_test[:, i] = estimator.predict_proba(X_test)[:, 1]
    
    return y_predict, np.mean(y_test, axis=1)

# Make an ensemble of XGBoosts

estimators = [create_xgboost_classifier(X_train, y_train, 98),
             create_xgboost_classifier(X_train, y_train, 981),
             create_xgboost_classifier(X_train, y_train, 122981),
             create_xgboost_classifier(X_train, y_train, 5466),
             create_xgboost_classifier(X_train, y_train, 546688),
             create_xgboost_classifier(X_train, y_train, 111223),              
             create_xgboost_classifier(X_train, y_train, 5981),
             create_xgboost_classifier(X_train, y_train, 5122981),
             create_xgboost_classifier(X_train, y_train, 55466),
             create_xgboost_classifier(X_train, y_train, 6546688),
             create_xgboost_classifier(X_train, y_train, 7111223),
             create_xgboost_classifier(X_train, y_train, 95122981),
             create_xgboost_classifier(X_train, y_train, 955466),
             create_xgboost_classifier(X_train, y_train, 86546688),
             create_xgboost_classifier(X_train, y_train, 97111223)]

predicted = [cross_val_predict_proba(est, X_train, y_train, X_test) for est in estimators]

X_train_stack = np.stack([p[0] for p in predicted], axis=1)
X_test_stack = np.stack([p[1] for p in predicted], axis=1)

In [5]:
# Use stacking, clf is an XGBoost classifier that trains on predictions
# of other classifiers

clf = create_xgboost_classifier(pd.DataFrame(X_train_stack), y_train, 30987)
clf.fit(X_train_stack, y_train)
preds = clf.predict_proba(X_train_stack)

# Print stacking predictor score

print(roc_auc_score(y_train, preds[:, 1]))

0.972784115723


In [6]:
# Use stacking predictor to create a submission

predicted = clf.predict(X_test_stack)
with open('submission.txt', 'w') as out:
    out.write('PassengerId,Survived\n')
    for passenger, y in zip(test_passenger_id, predicted):
        out.write('%s,%s\n' % (passenger, y))