# Imports

In [1]:
# Basic imports
import pandas as pd
import numpy as np

# Warnings
# Install a global "ignore" filter: every warning raised after this point is
# suppressed for the rest of the kernel session, including deprecation and
# convergence warnings from the libraries used below.
import warnings 
warnings.simplefilter("ignore")

In [2]:
# Plot
# Select the notebook backend FIRST. matplotlib.use() is only guaranteed to
# take effect before pyplot is imported, and both seaborn and
# matplotlib.pylab import pyplot. On older matplotlib a late call is ignored
# with nothing but a warning, and warnings are silenced in this notebook.
import matplotlib
matplotlib.use('nbagg')

from IPython import display
import seaborn as sns

import matplotlib.pylab as plt
from jupyterthemes import jtplot

# Apply the 'gruvboxd' jupyterthemes style to subsequent figures
jtplot.style('gruvboxd')

# Data Reading

In [3]:
from catboost.datasets import titanic

# Load the bundled Titanic train/test frames and key both by passenger id
df_train, df_test = titanic()
df_train = df_train.set_index('PassengerId')
df_test = df_test.set_index('PassengerId')

# The feature set is whatever columns the test frame carries; the label is
# pulled out of the train frame before it is narrowed to those columns
target = 'Survived'
features = df_test.columns

y_train = df_train[target]
df_train = df_train[features]

df_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Preprocessing

In [4]:
from robusta.preprocessing.category import *
from robusta.preprocessing.numeric import *
from robusta.preprocessing.base import *
from robusta.pipeline import *

# Columns routed to each preprocessing branch
nums = ['Age', 'Fare', 'SibSp', 'Parch']
cats = ['Pclass', 'Sex', 'Embarked']

# Numeric branch: pick the numeric columns, median-impute, then GaussRank
numeric_branch = make_pipeline(
    ColumnSelector(nums),
    Imputer(strategy="median"),
    GaussRank(),
    #ColumnRenamer(prefix='gr_'),
)

# Categorical branch: pick the categorical columns, impute with the most
# frequent value, then label-encode
category_branch = make_pipeline(
    ColumnSelector(cats),
    Imputer(strategy="most_frequent"),
    LabelEncoder(),
    #ColumnRenamer(prefix='le_'),
)

# Both branches are combined under the same step names as before
data_prep = FeatureUnion([
    ("numeric", numeric_branch),
    ("category", category_branch),
])

# Fit on the training frame only; the test frame reuses the fitted transformer
X_train = data_prep.fit_transform(df_train)
X_test = data_prep.transform(df_test)

X_train.head()

Unnamed: 0_level_0,Age,Fare,SibSp,Parch,Pclass,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,-0.488867,-0.968076,0.600843,-0.214091,2,1,2
2,0.584412,0.853865,0.600843,-0.214091,0,0,0
3,-0.275357,-0.454031,-0.290348,-0.214091,2,0,2
4,0.470091,0.703973,0.600843,-0.214091,0,0,2
5,0.470091,-0.379298,-0.290348,-0.214091,2,1,2


# Stacking

In [5]:
from xgboost import XGBClassifier, XGBRanker
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from rgf import RGFClassifier, RGFRegressor
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC

# Boosting / RGF models, all with library defaults
xgb = XGBClassifier()
lgb = LGBMClassifier()
rgf = RGFClassifier()
ada = AdaBoostClassifier()

# Linear models and the SVM, also with defaults
lr = LogisticRegression()
ridge = RidgeClassifier()
svc = SVC()

# Tree ensembles: 100 trees each, seeded
et = ExtraTreesClassifier(n_estimators=100, random_state=0)
rf = RandomForestClassifier(n_estimators=100, random_state=0)

# First-level model list; the order here is the order handed to the stacker
estimators = [xgb, lgb, rgf, lr, rf, et, ada, svc, ridge]

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


## Functional API (fast)

In [6]:
%%time
from sklearn.model_selection import RepeatedStratifiedKFold
from robusta.stacking import *

cv = 5

S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, method='predict')

AttributeError: module 'robusta.stacking' has no attribute 'make_stacker'

In [7]:
# Preview the first 10 rows of the stacked training features
S_train.head(10)

NameError: name 'S_train' is not defined

## Scikit-Learn API

In [None]:
%%time
est_names = list(S_train.columns)
named_estimators = list(zip(est_names, estimators))

st1 = Stacker(named_estimators, cv, method='predict')

S1_train = st1.fit(X_train, y_train).transform(X_train)
S1_test = st1.transform(X_test)

## Scikit-Learn API (faster with fit_transform)

In [None]:
%%time
# Same configuration as st1 (named estimators, cv, method='predict')
st2 = Stacker(named_estimators, cv, method='predict')

# Train features come from a single fit_transform call instead of
# fit(...).transform(...); test features still use transform on the fitted stacker
S2_train = st2.fit_transform(X_train, y_train)
S2_test = st2.transform(X_test)

In [None]:
# Compare the functional-API train features against both Stacker variants;
# the cell displays the pair of .equals() results
S_train.equals(S1_train), S_train.equals(S2_train)

In [None]:
# Same comparison for the test features
S_test.equals(S1_test), S_test.equals(S2_test)

## Stacking Strategies

### Probability

In [None]:
# Rebinds `estimators` to a shorter list: svc and ridge from the original
# nine-model list are left out (presumably because SVC() as constructed and
# RidgeClassifier do not provide predict_proba -- confirm).
# Later cells that use `estimators` see this seven-model list.
estimators = [xgb, lgb, rgf, lr, rf, et, ada]

# Stack using each estimator's predict_proba output instead of predict
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, method='predict_proba')

S_train.head()

### Test Averaging

In [None]:
%%time
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, 
                        method='predict_proba', test_avg=True)

In [None]:
# Preview the test features produced with test_avg=True
S_test.head()

In [None]:
%%time
S_train, S_test = stack(estimators, cv, X_train, y_train, None, X_test, 
                        method='predict_proba', test_avg=False)

In [None]:
# Preview the test features produced with test_avg=False
S_test.head()

### Voting Strategies

In [None]:
# Label predictions (method='predict') with the 'soft' voting option
S_train, S_test = stack(
    estimators, cv, X_train, y_train, None, X_test,
    method='predict',
    voting='soft',
)

S_test.head()

In [None]:
# Label predictions (method='predict') with the 'hard' voting option
S_train, S_test = stack(
    estimators, cv, X_train, y_train, None, X_test,
    method='predict',
    voting='hard',
)

S_test.head()

### Join Original Features

In [None]:
# Probability stacking with join_X=True (per the section title, the original
# features are joined to the stacked output)
S_train, S_test = stack(
    estimators, cv, X_train, y_train, None, X_test,
    method='predict_proba',
    join_X=True,
)

S_train.head()

# 2nd Level Model

In [None]:
%%time
from robusta.crossval import cross_val_pred, cross_val_score

cv = RepeatedStratifiedKFold(5, 3, random_state=0)
scoring = 'accuracy'

# 1st level models
l1_estimators = [xgb, lgb, rgf, lr, rf, et, ada]
cv_stack = RepeatedStratifiedKFold(4, 10, random_state=666)

S_train, S_test = make_stacker(l1_estimators, cv_stack, X_train, y_train, None, X_test, 
                               method='predict')

# 2nd level model
estimator = ridge

scores = cross_val_score(estimator, S_train, y_train, scoring=scoring, cv=cv)
print('{:.4f} ± {:.4f}'.format(np.mean(scores), np.std(scores)))

In [None]:


# NOTE(review): this cell looks unfinished.
# - `st` is created but never used in this cell, and Stacker receives a bare
#   estimator list here, whereas the Scikit-Learn API cells pass
#   (name, estimator) pairs plus cv and method.
# - make_pipeline() is called with no steps, so `estimator` has no model in it.
# Presumably the intent was a pipeline of `st` followed by a 2nd-level model -- confirm.
st = Stacker(l1_estimators)
estimator = make_pipeline()

# Scores `estimator` on the already-stacked features with the same cv/scoring as the previous cell
scores = cross_val_score(estimator, S_train, y_train, scoring=scoring, cv=cv)
print('{:.4f} ± {:.4f}'.format(np.mean(scores), np.std(scores)))

In [None]:
%%time
# Cross-validated predictions of `estimator` on the stacked features.
# The two results are later written out as the "oof" and "sub" CSV files
# (presumably out-of-fold train predictions and test predictions -- confirm
# against robusta.crossval.cross_val_pred).
y_oof, y_sub = cross_val_pred(estimator, cv, S_train, y_train, None, S_test, method='predict')

# Submit

In [None]:
import os

# Predictions go to ./pred under the current working directory.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is a single
# call, and it does not fail if the folder appears between check and creation.
path = os.path.join(os.getcwd(), 'pred')
os.makedirs(path, exist_ok=True)

sub_path = os.path.join(path, '1 sub stacking.csv')
oof_path = os.path.join(path, '1 oof stacking.csv')

# Write both prediction sets with a header row
y_sub.to_csv(sub_path, header=True)
y_oof.to_csv(oof_path, header=True)

### Score:
### `[CV] 0.8298 ± 0.0222`
### `[LB] 0.7799 ↑`