# Imports

In [1]:
# Basic imports
import pandas as pd
import numpy as np

# Warnings
import warnings 
warnings.simplefilter("ignore")

In [2]:
# Plot
# Select the backend BEFORE anything imports pyplot (seaborn, pylab and
# jupyterthemes all pull it in); the matplotlib docs recommend this order, and
# calling `use` afterwards is not guaranteed to switch the backend.
import matplotlib
matplotlib.use('nbagg')

from IPython import display
import seaborn as sns

import matplotlib.pylab as plt
from jupyterthemes import jtplot

# Apply the notebook theme to all subsequent figures
jtplot.style('gruvboxd')

# Data Reading

In [3]:
from catboost.datasets import titanic

# Load both Titanic splits and key each frame by passenger id
df_train, df_test = (frame.set_index('PassengerId') for frame in titanic())

# Feature set = whatever columns the test split provides
target = 'Survived'
features = df_test.columns

# Separate the label from the training features
y_train = df_train[target]
df_train = df_train[features]

df_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Preprocessing

In [4]:
from robusta.preprocessing.category import *
from robusta.preprocessing.numeric import *
from robusta.preprocessing.base import *
from robusta.pipeline import *

# Column groups handled by the two preprocessing branches
nums = ['Age', 'Fare', 'SibSp', 'Parch']
cats = ['Pclass', 'Sex', 'Embarked']

# Numeric branch: select columns, fill gaps with the median, then GaussRank
# (optionally followed by ColumnRenamer(prefix='gr_'), currently disabled)
numeric_branch = make_pipeline(
    ColumnSelector(nums),
    Imputer(strategy="median"),
    GaussRank(),
)

# Categorical branch: select columns, fill gaps with the mode, then label-encode
# (optionally followed by ColumnRenamer(prefix='le_'), currently disabled)
category_branch = make_pipeline(
    ColumnSelector(cats),
    Imputer(strategy="most_frequent"),
    LabelEncoder(),
)

data_prep = FeatureUnion([
    ("numeric", numeric_branch),
    ("category", category_branch),
])

# Fit on train only, then apply the fitted transform to test
X_train = data_prep.fit_transform(df_train)
X_test = data_prep.transform(df_test)

X_train.head()

Unnamed: 0_level_0,Age,Fare,SibSp,Parch,Pclass,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,-0.488867,-0.968076,0.600843,-0.214091,2,1,2
2,0.584412,0.853865,0.600843,-0.214091,0,0,0
3,-0.275357,-0.454031,-0.290348,-0.214091,2,0,2
4,0.470091,0.703973,0.600843,-0.214091,0,0,2
5,0.470091,-0.379298,-0.290348,-0.214091,2,1,2


# Stacking

In [5]:
from robusta.model import get_model

task = 'classifier'

# (model name, extra keyword arguments) for every 1st-level model.
# Disabled alternatives: 'Ridge'; 'ET' and 'RF' with n_estimators=100, random_state=0.
model_specs = [
    ('XGB', {}),
    ('LGBM', {}),
    ('RGF', {}),
    ('LogReg', {}),
    ('AdaBoost', {'n_estimators': 100}),
    ('SVM', {'probability': True}),
]

estimators = [get_model(name, task, **params) for name, params in model_specs]

## Functional API (fast)

In [6]:
%%time
from sklearn.model_selection import RepeatedStratifiedKFold
from robusta.stacking import *

# Number of CV folds and the metrics reported per model in the verbose log
cv = 5
scoring = ['accuracy', 'neg_log_loss']

# Functional API: one call yields the stacked train and test frames
# (one column per estimator, as shown by S_train.head below).
# NOTE(review): the positional `None` is presumably a `groups` argument — confirm
# against the robusta.stacking.stack signature.
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, scoring=scoring, 
                        method='predict_proba', verbose=1)

[19:17:29] XGB
[19:17:29] accuracy: 0.8194 ± 0.0224
[19:17:29] neg_log_loss: -0.4195 ± 0.0582

[19:17:29] LGBM
[19:17:29] accuracy: 0.8317 ± 0.0338
[19:17:29] neg_log_loss: -0.4751 ± 0.0941

[19:17:30] RGF
[19:17:30] accuracy: 0.8350 ± 0.0240
[19:17:30] neg_log_loss: -0.4156 ± 0.0521

[19:17:30] LogReg
[19:17:31] accuracy: 0.7924 ± 0.0253
[19:17:31] neg_log_loss: -0.4594 ± 0.0255

[19:17:31] AdaBoost
[19:17:32] accuracy: 0.8070 ± 0.0280
[19:17:32] neg_log_loss: -0.6806 ± 0.0025

[19:17:32] SVM
[19:17:32] accuracy: 0.8227 ± 0.0201
[19:17:33] neg_log_loss: -0.4472 ± 0.0315

CPU times: user 2.46 s, sys: 548 ms, total: 3.01 s
Wall time: 4.14 s


In [7]:
S_train.head(10)

Unnamed: 0_level_0,XGB,LGBM,RGF,LogReg,AdaBoost,SVM
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.08102,0.089169,0.098254,0.069357,0.491123,0.134343
2,0.987915,0.995316,0.983956,0.91147,0.509315,0.886563
3,0.444559,0.303481,0.369598,0.575142,0.503675,0.665692
4,0.985142,0.998688,0.992868,0.86588,0.510308,0.920625
5,0.191789,0.053231,0.180799,0.08304,0.496727,0.158923
6,0.141905,0.172941,0.134258,0.132782,0.49262,0.163086
7,0.210415,0.216723,0.319318,0.363259,0.498767,0.16135
8,0.337544,0.434587,0.439988,0.141994,0.487784,0.206172
9,0.457207,0.317327,0.381881,0.524959,0.498756,0.508756
10,0.967069,0.98789,0.956744,0.897028,0.508373,0.905002


## Scikit-Learn API

In [10]:
from sklearn.metrics import get_scorer
from sklearn.metrics import log_loss

# Blend the 1st-level predictions, optimising the blend for log-loss
blend = get_model('Blend', task, mean='mean', scoring='neg_log_loss', print_progress=5)
blend.fit(S_train, y_train)

# Report the fitted blend's scores. These are in-sample: the blend is scored
# on the same S_train/y_train it was fitted on.
# The loop variable is deliberately NOT called `scoring`, so the notebook-level
# `scoring` list defined in the stacking cell is not overwritten.
print()
for metric in ['accuracy', 'neg_log_loss']:
    scorer = get_scorer(metric)
    print('{}: {}'.format(metric, scorer(blend, S_train, y_train)))

[19:18:08] iters: 1      score: -0.426004      elapsed: 3 ms
[19:18:08] iters: 5      score: -0.426004      elapsed: 118 ms
[19:18:08] iters: 10      score: -0.410779      elapsed: 235 ms
[19:18:08] iters: 15      score: -0.410779      elapsed: 352 ms
[19:18:08] iters: 20      score: -0.409182      elapsed: 466 ms
[19:18:08] iters: 25      score: -0.409182      elapsed: 585 ms
[19:18:08] iters: 30      score: -0.409103      elapsed: 706 ms
[19:18:08] iters: 35      score: -0.408831      elapsed: 825 ms
[19:18:09] iters: 40      score: -0.408831      elapsed: 938 ms
[19:18:09] iters: 45      score: -0.408771      elapsed: 1 sec
[19:18:09] iters: 50      score: -0.408755      elapsed: 1 sec
[19:18:09] iters: 55      score: -0.408755      elapsed: 1 sec
[19:18:09] iters: 60      score: -0.408686      elapsed: 1 sec
[19:18:09] iters: 65      score: -0.408686      elapsed: 1 sec
[19:18:09] iters: 70      score: -0.408649      elapsed: 1 sec
[19:18:09] iters: 75      score: -0.408644      el

In [None]:
%%time
# Pair each column name of the functional-API result with its estimator
est_names = list(S_train.columns)
named_estimators = list(zip(est_names, estimators))

# NOTE(review): method='predict' here, while S_train/S_test were produced by
# stack(..., method='predict_proba'); the S_train.equals(S1_train) check further
# down will likely be False unless the methods match — confirm the intended method.
st1 = Stacker(named_estimators, cv, method='predict')

# Fit first, then transform train and test in separate calls
S1_train = st1.fit(X_train, y_train).transform(X_train)
S1_test = st1.transform(X_test)

## Scikit-Learn API (faster with fit_transform)

In [None]:
%%time
# Same Stacker configuration, but train features come from a single fit_transform call.
# NOTE(review): method='predict' differs from the method='predict_proba' used for
# S_train/S_test, which this result is later compared against — confirm.
st2 = Stacker(named_estimators, cv, method='predict')

S2_train = st2.fit_transform(X_train, y_train)
S2_test = st2.transform(X_test)

In [None]:
S_train.equals(S1_train), S_train.equals(S2_train)

In [None]:
S_test.equals(S1_test), S_test.equals(S2_test)

## Stacking Strategies

### Probability

In [None]:
# Define the 1st-level models explicitly so this cell runs on a fresh kernel
# (these short names were previously used without ever being assigned).
# Model names and keyword arguments mirror the get_model calls in the
# "Stacking" cell, including its disabled RF/ET entries.
xgb = get_model('XGB', task)
lgb = get_model('LGBM', task)
rgf = get_model('RGF', task)
lr = get_model('LogReg', task)
rf = get_model('RF', task, n_estimators=100, random_state=0)
et = get_model('ET', task, n_estimators=100, random_state=0)
ada = get_model('AdaBoost', task, n_estimators=100)

estimators = [xgb, lgb, rgf, lr, rf, et, ada]

# Stack class probabilities from every model
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, method='predict_proba')

S_train.head()

### Test Averaging

In [None]:
%%time
# Probability stacking with test_avg=True.
# NOTE(review): presumably test predictions are averaged over the per-fold models —
# confirm against robusta.stacking.stack.
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, 
                        method='predict_proba', test_avg=True)

In [None]:
S_test.head()

In [None]:
%%time
# Probability stacking with test_avg=False, for comparison with the test_avg=True run.
# NOTE(review): presumably test predictions then come from a model refit on the full
# training set — confirm against robusta.stacking.stack.
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, 
                        method='predict_proba', test_avg=False)

In [None]:
S_test.head()

### Voting Strategies

In [None]:
# Label stacking (method='predict') with voting='soft'.
# NOTE(review): presumably controls how per-fold test predictions are combined
# into labels — confirm against robusta.stacking.stack.
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, 
                        method='predict', voting='soft')

S_test.head()

In [None]:
# Label stacking (method='predict') with voting='hard', for comparison with the
# voting='soft' run.
# NOTE(review): exact voting semantics are not visible here — confirm against
# robusta.stacking.stack.
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, 
                        method='predict', voting='hard')

S_test.head()

### Join Original Features

In [None]:
# Probability stacking with join_X=True.
# NOTE(review): presumably appends the original X columns to the stacked
# predictions — confirm against robusta.stacking.stack.
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, 
                        method='predict_proba', join_X=True)

S_train.head()

# 2nd Level Model

In [None]:
%%time
from robusta.crossval import cross_val_pred, cross_val_score

cv = RepeatedStratifiedKFold(5, 3, random_state=0)
scoring = 'accuracy'

# 1st level models
l1_estimators = [xgb, lgb, rgf, lr, rf, et, ada]
cv_stack = RepeatedStratifiedKFold(5, 1, random_state=666)

S_train, S_test = stack(l1_estimators, cv_stack, X_train, y_train, None, X_test,
                        method='predict')

# 2nd level model
l2_estimator = ridge

scores = cross_val_score(estimator, S_train, y_train, scoring=scoring, cv=cv)
print('{:.4f} ± {:.4f}'.format(np.mean(scores), np.std(scores)))

In [None]:
%%time
# Produce the 2nd-level model's outputs for the stacked train and test features;
# y_oof and y_sub are written to disk in the Submit cell.
# NOTE(review): `cv` is the repeated K-fold defined in the previous cell — confirm
# that cross_val_pred handles repeated splits as intended.
y_oof, y_sub = cross_val_pred(l2_estimator, cv, S_train, y_train, None, S_test, method='predict')

# Submit

In [None]:
import os

# Output folder next to the notebook's working directory; exist_ok=True makes
# the cell safe to re-run and avoids the check-then-create race.
path = os.path.join(os.getcwd(), 'pred')
os.makedirs(path, exist_ok=True)

sub_path = os.path.join(path, '1 sub stacking.csv')
oof_path = os.path.join(path, '1 oof stacking.csv')

# Persist the test predictions (submission) and the out-of-fold predictions
y_sub.to_csv(sub_path, header=True)
y_oof.to_csv(oof_path, header=True)

### Score:
### `[CV] 0.8324 ± 0.0220`
### `[LB] 0.7799 ↑`