# Imports

In [1]:
# Basic imports: core data libraries used throughout the notebook
import pandas as pd
import numpy as np

# Warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation notices raised by libraries imported in later cells.
import warnings 
warnings.simplefilter("ignore")

In [2]:
# Plot
# Select the notebook backend *before* anything imports pyplot: older
# matplotlib releases ignore matplotlib.use() once pyplot (which seaborn and
# pylab pull in) has been loaded, and the resulting warning would be hidden
# by the global warnings filter.
import matplotlib
matplotlib.use('nbagg')

from IPython import display
import seaborn as sns

import matplotlib.pylab as plt
from jupyterthemes import jtplot

# Apply the 'gruvboxd' jupyterthemes plotting style to subsequent figures
jtplot.style('gruvboxd')

# Data Reading

In [3]:
from catboost.datasets import titanic

# Load both Titanic splits and key each frame by passenger id
df_train, df_test = (
    frame.set_index('PassengerId') for frame in titanic()
)

# The columns of the test split define the feature set
target = 'Survived'
features = df_test.columns

# Separate the label from the training features
y_train, df_train = df_train[target], df_train[features]

df_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Preprocessing

In [4]:
from robusta.preprocessing.category import *
from robusta.preprocessing.numeric import *
from robusta.preprocessing.base import *
from robusta.pipeline import *

# Column groups, each handled by its own sub-pipeline
nums = ['Age', 'Fare', 'SibSp', 'Parch']
cats = ['Pclass', 'Sex', 'Embarked']

# Numeric branch: select columns -> median imputation -> GaussRank
numeric_steps = make_pipeline(
    ColumnSelector(nums),
    Imputer(strategy="median"),
    GaussRank(),
)

# Categorical branch: select columns -> most-frequent imputation -> label encoding
category_steps = make_pipeline(
    ColumnSelector(cats),
    Imputer(strategy="most_frequent"),
    LabelEncoder(),
)

# Both branches are combined side by side into one feature frame
data_prep = FeatureUnion([
    ("numeric", numeric_steps),
    ("category", category_steps),
])

# Fit on the training frame only, then reuse the fitted transform on test
X_train = data_prep.fit_transform(df_train)
X_test = data_prep.transform(df_test)

X_train.head()

Unnamed: 0_level_0,Age,Fare,SibSp,Parch,Pclass,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,-0.488867,-0.968076,0.600843,-0.214091,2,1,2
2,0.584412,0.853865,0.600843,-0.214091,0,0,0
3,-0.275357,-0.454031,-0.290348,-0.214091,2,0,2
4,0.470091,0.703973,0.600843,-0.214091,0,0,2
5,0.470091,-0.379298,-0.290348,-0.214091,2,1,2


# Stacking

In [5]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from robusta.model import get_model

task = 'classifier'

# Bind every first-level model to its own name: later cells refer to these
# names directly (xgb, lgb, rgf, lr, rf, et, ada, ridge), so they must be
# defined here for the notebook to run on a fresh kernel.
# NOTE(review): models are created with get_model defaults — confirm these
# match the configuration used to produce the recorded outputs.
xgb = get_model('XGB', task)
lgb = get_model('LGBM', task)
rgf = get_model('RGF', task)
lr = get_model('LogReg', task)
ridge = get_model('Ridge', task)
ada = get_model('AdaBoost', task)
svc = get_model('SVM', task)

# Tree ensembles are built explicitly with 100 trees and a fixed seed
et = ExtraTreesClassifier(n_estimators=100, random_state=0)
rf = RandomForestClassifier(n_estimators=100, random_state=0)

# Order matches the column order of the stacked frames shown below
estimators = [xgb, lgb, rgf, lr, rf, et, ada, svc, ridge]

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


## Functional API (fast)

In [6]:
%%time
# RepeatedStratifiedKFold is imported here but first used in the
# "2nd Level Model" cell further down.
from sklearn.model_selection import RepeatedStratifiedKFold
from robusta.stacking import *

# Cross-validation setting passed to stack() as its second argument
cv = 5

# Build stacked features from each estimator's `predict` output:
# S_train for the training rows, S_test for the test rows.
# NOTE(review): the fifth positional argument (None) is presumably `groups` — confirm.
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, method='predict')

CPU times: user 2.88 s, sys: 109 ms, total: 2.99 s
Wall time: 9.2 s


In [7]:
# Preview the first ten rows of the stacked training predictions
S_train.head(10)

Unnamed: 0_level_0,XGB,LGBM,RGF,LogisticRegression,RandomForest,ExtraTrees,AdaBoost,SVC,Ridge
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1
3,0,0,0,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1
5,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0
9,0,0,0,1,1,1,0,0,1
10,1,1,1,1,1,1,1,1,1


## Scikit-Learn API

In [8]:
%%time
est_names = list(S_train.columns)
named_estimators = list(zip(est_names, estimators))

st1 = Stacker(named_estimators, cv, method='predict')

S1_train = st1.fit(X_train, y_train).transform(X_train)
S1_test = st1.transform(X_test)

CPU times: user 6.05 s, sys: 1.69 s, total: 7.74 s
Wall time: 5.95 s


## Scikit-Learn API (faster with fit_transform)

In [9]:
%%time
# Same Stacker configuration as st1, but the training features come from a
# single fit_transform() call instead of fit() followed by transform()
st2 = Stacker(named_estimators, cv, method='predict')

S2_train = st2.fit_transform(X_train, y_train)
S2_test = st2.transform(X_test)

CPU times: user 4.44 s, sys: 1.15 s, total: 5.59 s
Wall time: 5.01 s


In [10]:
# Compare the functional-API training frame with both Stacker variants
tuple(S_train.equals(candidate) for candidate in (S1_train, S2_train))

(False, False)

In [11]:
# Compare the functional-API test frame with both Stacker variants
tuple(S_test.equals(candidate) for candidate in (S1_test, S2_test))

(False, False)

## Stacking Strategies

### Probability

In [12]:
# Rebind `estimators` to seven models for the probability-based runs;
# svc and ridge from the earlier nine-model list are left out.
# NOTE(review): presumably because they lack predict_proba — confirm.
estimators = [xgb, lgb, rgf, lr, rf, et, ada]

# Stack using each estimator's `predict_proba` output instead of `predict`
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, method='predict_proba')

S_train.head()

Unnamed: 0_level_0,XGB,LGBM,RGF,LogisticRegression,RandomForest,ExtraTrees,AdaBoost
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.08102,0.089169,0.098254,0.069357,0.258333,0.27,0.48378
2,0.987915,0.995316,0.983956,0.91147,0.98,1.0,0.519433
3,0.444559,0.303481,0.369598,0.575142,0.69,0.8,0.506198
4,0.985142,0.998688,0.992868,0.86588,1.0,1.0,0.518904
5,0.191789,0.053231,0.180799,0.08304,0.0,0.0,0.492302


### Test Averaging

In [13]:
%%time
# Probability stacking with test_avg=True.
# NOTE(review): presumably test predictions are averaged over the per-fold
# models — confirm against robusta.stacking.
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, 
                        method='predict_proba', test_avg=True)

CPU times: user 204 ms, sys: 9.39 ms, total: 213 ms
Wall time: 1.22 s


In [14]:
# Preview the test-set stacked features produced with test_avg=True
S_test.head()

Unnamed: 0_level_0,XGB,LGBM,RGF,LogisticRegression,RandomForest,ExtraTrees,AdaBoost
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,0.097115,0.020581,0.077787,0.093394,0.085294,0.07,0.49118
893,0.211838,0.066004,0.202795,0.260892,0.222,0.304,0.459707
894,0.073048,0.107867,0.0865,0.128697,0.252,0.134,0.518382
895,0.140056,0.448253,0.174646,0.125159,0.496,0.622,0.486954
896,0.410389,0.375372,0.352452,0.543252,0.466,0.424,0.500131


In [15]:
%%time
# Same call as the test_avg=True run, with test_avg=False for comparison.
# NOTE(review): presumably test predictions then come from a model refit on
# the full training set — confirm against robusta.stacking.
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, 
                        method='predict_proba', test_avg=False)

CPU times: user 1.92 s, sys: 430 ms, total: 2.35 s
Wall time: 2.06 s


In [16]:
# Preview the test-set stacked features produced with test_avg=False
S_test.head()

Unnamed: 0_level_0,XGB,LGBM,RGF,LogisticRegression,RandomForest,ExtraTrees,AdaBoost
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,0.06803,0.008663,0.054003,0.092539,0.02,0.0,0.491967
893,0.15244,0.06851,0.23635,0.264049,0.17,0.19,0.490154
894,0.074515,0.118151,0.088207,0.127952,0.31,0.13,0.53298
895,0.185541,0.533763,0.165103,0.123053,0.65,0.78,0.486288
896,0.416402,0.333417,0.488737,0.544121,0.51,0.44,0.499855


### Voting Strategies

In [17]:
# Class-label stacking (method='predict') with voting='soft'.
# NOTE(review): `voting` presumably controls how per-fold test predictions are
# combined into a single label — confirm against robusta.stacking.
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, 
                        method='predict', voting='soft')

S_test.head()

Unnamed: 0_level_0,XGB,LGBM,RGF,LogisticRegression,RandomForest,ExtraTrees,AdaBoost
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,0,0,0,0,0,0,0
893,0,0,0,0,0,0,0
894,0,0,0,0,0,0,1
895,0,0,0,0,0,1,0
896,0,0,0,1,0,0,1


In [18]:
# Same class-label stacking call, with voting='hard' for comparison against
# the voting='soft' run.
# NOTE(review): exact semantics of `voting` are not visible here — confirm
# against robusta.stacking.
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, 
                        method='predict', voting='hard')

S_test.head()

Unnamed: 0_level_0,XGB,LGBM,RGF,LogisticRegression,RandomForest,ExtraTrees,AdaBoost
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,0,0,0,0,0,0,0
893,0,0,0,0,0,0,0
894,0,0,0,0,0,0,0
895,0,1,0,0,1,1,0
896,0,0,0,1,0,0,0


### Join Original Features

In [19]:
# Probability stacking with join_X=True: the returned frame holds the original
# feature columns followed by one prediction column per estimator
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, 
                        method='predict_proba', join_X=True)

S_train.head()

Unnamed: 0_level_0,Age,Fare,SibSp,Parch,Pclass,Sex,Embarked,XGB,LGBM,RGF,LogisticRegression,RandomForest,ExtraTrees,AdaBoost
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,-0.488867,-0.968076,0.600843,-0.214091,2,1,2,0.08102,0.089169,0.098254,0.069357,0.258333,0.27,0.48378
2,0.584412,0.853865,0.600843,-0.214091,0,0,0,0.987915,0.995316,0.983956,0.91147,0.98,1.0,0.519433
3,-0.275357,-0.454031,-0.290348,-0.214091,2,0,2,0.444559,0.303481,0.369598,0.575142,0.69,0.8,0.506198
4,0.470091,0.703973,0.600843,-0.214091,0,0,2,0.985142,0.998688,0.992868,0.86588,1.0,1.0,0.518904
5,0.470091,-0.379298,-0.290348,-0.214091,2,1,2,0.191789,0.053231,0.180799,0.08304,0.0,0.0,0.492302


# 2nd Level Model

In [31]:
%%time
from robusta.crossval import cross_val_pred, cross_val_score

cv = RepeatedStratifiedKFold(5, 3, random_state=0)
scoring = 'accuracy'

# 1st level models
l1_estimators = [xgb, lgb, rgf, lr, rf, et, ada]
cv_stack = RepeatedStratifiedKFold(5, 1, random_state=666)

S_train, S_test = stack(l1_estimators, cv_stack, X_train, y_train, None, X_test,
                        method='predict')

# 2nd level model
l2_estimator = ridge

scores = cross_val_score(estimator, S_train, y_train, scoring=scoring, cv=cv)
print('{:.4f} ± {:.4f}'.format(np.mean(scores), np.std(scores)))

0.8324 ± 0.0220
CPU times: user 306 ms, sys: 15.3 ms, total: 322 ms
Wall time: 1.34 s


In [32]:
%%time
# Run the 2nd-level model under the evaluation CV on the stacked features.
# The two returned objects are written out in the Submit cell as the
# out-of-fold (y_oof) and submission (y_sub) predictions.
y_oof, y_sub = cross_val_pred(l2_estimator, cv, S_train, y_train, None, S_test, method='predict')

CPU times: user 1.33 s, sys: 16.8 ms, total: 1.35 s
Wall time: 1.36 s


# Submit

In [33]:
import os

# Predictions are written to a `pred` folder under the current working directory
path = os.path.join(os.getcwd(), 'pred')
# exist_ok=True replaces the exists()-then-mkdir() check, avoiding the
# check-then-create race and keeping the cell safely re-runnable
os.makedirs(path, exist_ok=True)

sub_path = os.path.join(path, '1 sub stacking.csv')
oof_path = os.path.join(path, '1 oof stacking.csv')

# Save test-set predictions and out-of-fold predictions, each with a header row
y_sub.to_csv(sub_path, header=True)
y_oof.to_csv(oof_path, header=True)

### Score:
### `[CV] 0.8324 ± 0.0220`
### `[LB] 0.7799 ↑`