## Imports

In [1]:
# Basic imports
import pandas as pd
import numpy as np

# Warnings
import warnings 
warnings.simplefilter("ignore")

In [2]:
# Plot
from IPython import display
import seaborn as sns

import matplotlib
import matplotlib.pylab as plt
from jupyterthemes import jtplot

jtplot.style('gruvboxd')
matplotlib.use('nbagg')

## Data Reading

In [3]:
from catboost.datasets import titanic

# Data Reading
df_train, df_test = titanic()
df_train.set_index('PassengerId', inplace=True)
df_test.set_index('PassengerId', inplace=True)

# Split X_train, y_train
target = 'Survived'
features = df_test.columns

y_train = df_train[target]
df_train = df_train[features]

df_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Preprocessing

In [4]:
from robusta.preprocessing.category import *
from robusta.preprocessing.numeric import *
from robusta.preprocessing.base import *
from robusta.pipeline import *

nums = ['Age', 'Fare', 'SibSp', 'Parch']
cats = ['Pclass', 'Sex', 'Embarked']

data_prep = FeatureUnion([
        ("numeric", make_pipeline(
            ColumnSelector(nums),
            Imputer(strategy="median"),
            GaussRank(),
            #ColumnRenamer(prefix='gr_'),
        )),
        ("category", make_pipeline(
            ColumnSelector(cats),
            Imputer(strategy="most_frequent"),
            LabelEncoder(),
            #ColumnRenamer(prefix='le_'),
        )),
])

X_train = data_prep.fit_transform(df_train)
X_test = data_prep.transform(df_test)

X_train.head()

Unnamed: 0_level_0,Age,Fare,SibSp,Parch,Pclass,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,-0.488867,-0.968076,0.600843,-0.214091,2,1,2
2,0.584412,0.853865,0.600843,-0.214091,0,0,0
3,-0.275357,-0.454031,-0.290348,-0.214091,2,0,2
4,0.470091,0.703973,0.600843,-0.214091,0,0,2
5,0.470091,-0.379298,-0.290348,-0.214091,2,1,2


## Fold Preparation

In [5]:
from sklearn.model_selection import KFold
from robusta.resampler import *

encoder = FeatureUnion([
    ('category', make_pipeline(
        ColumnSelector(cats),
        TypeConverter('object'),
        TargetEncoderCV(cv=4).set_params(encoder__smoothing=200.0),
    )),
    ('numeric', make_pipeline(
        ColumnSelector(nums),
        Identity(),
    )),
])

resampler = SMOTE(random_state=50, k_neighbors=30)

fold_pipe = make_pipeline(resampler, encoder)

F_train = fold_pipe.fit_transform(X_train, y_train)

F_train.sample(5, random_state=555)

Unnamed: 0,Embarked,Pclass,Sex,Age,Fare,SibSp,Parch
928,0.453032,0.672597,0.78527,0.491782,1.04038,0.600843,-0.214091
873,0.457019,0.354773,0.273246,0.888832,-0.271711,-0.290348,-0.214091
293,0.460378,0.346921,0.800238,-0.381808,-0.274934,-0.290348,-0.214091
760,0.457019,0.354773,0.273246,-0.013926,0.016949,-0.290348,-0.214091
126,0.474911,0.326256,0.252564,-0.013926,-0.770943,-0.290348,-0.214091


## Model

In [6]:
%%time
from lightgbm import LGBMClassifier

model = LGBMClassifier()
model.fit(X_train, y_train)

CPU times: user 498 ms, sys: 33.8 ms, total: 532 ms
Wall time: 91 ms


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [7]:
from sklearn.model_selection import cross_val_score

cv = 5
scoring = 'accuracy'

estimator = make_pipeline(model)
scores = cross_val_score(estimator, X_train, y_train, cv=cv, scoring=scoring)
print('{:.4f} ± {:.4f} [model]'.format(np.mean(scores), np.std(scores)))

estimator = make_pipeline(encoder, model)
scores = cross_val_score(estimator, X_train, y_train, cv=cv, scoring=scoring)
print('{:.4f} ± {:.4f} [resampler + model]'.format(np.mean(scores), np.std(scores)))

estimator = make_pipeline(resampler, model)
scores = cross_val_score(estimator, X_train, y_train, cv=cv, scoring=scoring)
print('{:.4f} ± {:.4f} [encoder + model]'.format(np.mean(scores), np.std(scores)))

# 0.8216 ± 0.0370 (model)
# 0.8261 ± 0.0336 (resampler + model)
# 0.8249 ± 0.0320 (encoder + model)

0.8317 ± 0.0303 [model]
0.8306 ± 0.0212 [resampler + model]
0.8194 ± 0.0261 [encoder + model]


In [8]:
estimator = make_pipeline(encoder, model)

## Submit (Out-of-Fold & Test Predictions)

In [9]:
%%time
from robusta.crossval import cross_val_pred

y_oof, y_sub = cross_val_pred(estimator, cv, X_train, y_train, None, X_test, test_avg=False)

CPU times: user 2.48 s, sys: 438 ms, total: 2.91 s
Wall time: 4.08 s


In [10]:
import os

path = os.path.join(os.getcwd(), 'pred')
if not os.path.exists(path):
    os.mkdir(path)

sub_path = os.path.join(path, '0 sub baseline.csv')
oof_path = os.path.join(path, '0 oof baseline.csv')

y_sub.to_csv(sub_path, header=True)
y_oof.to_csv(oof_path, header=True)

### Score:
### `[CV] 0.8249 ± 0.0314`
### `[LB] 0.7656`

# Stacking

In [11]:
from xgboost import XGBClassifier, XGBRanker
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from rgf import RGFClassifier, RGFRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

xgb = XGBClassifier()
lgb = LGBMClassifier()
rgf = RGFClassifier()
lr = LogisticRegression()
ridge = RidgeClassifier()
ada = AdaBoostClassifier()
svc = SVC()
et = ExtraTreesClassifier(100, random_state=0)
rf = RandomForestClassifier(100, random_state=0)

#estimators = [xgb, lgb, rgf, estimator, lr, rf, ridge, svc, et]
estimators = [xgb, lgb, rgf, estimator, lr, rf, et]

In [16]:
%%time
from sklearn.model_selection import RepeatedStratifiedKFold
from robusta.stacking import stack, Stacker

cv = RepeatedStratifiedKFold(5, 3, random_state=0)
#cv = KFold(4)
#cv = 2

y_oof, y_sub = stack(estimators, cv, X_train, y_train, None, X_test, test_avg=False, 
                     method='predict_proba', voting='auto', n_jobs=-1)

CPU times: user 3.79 s, sys: 2.07 s, total: 5.85 s
Wall time: 5.21 s


In [17]:
%%time
named_estimators = list(zip(y_oof.columns, estimators))

st1 = Stacker(named_estimators, cv, method='predict_proba', voting='auto', 
              test_avg=False, join_X=True)

S_train = st1.fit_transform(X_train, y_train)
S_test = st1.transform(X_test)

CPU times: user 6.62 s, sys: 3.42 s, total: 10 s
Wall time: 6.82 s


In [18]:
%%time
st2 = Stacker(named_estimators, cv, method='predict_proba', voting='auto', 
              test_avg=False, join_X=True)

s_train = st2.fit(X_train, y_train).transform(X_train)
s_test = st2.transform(X_test)

CPU times: user 12.5 s, sys: 10.3 s, total: 22.8 s
Wall time: 10.5 s


In [19]:
S_train.equals(s_train), S_train.equals(y_oof), S_test.equals(s_test), S_test.equals(y_sub)

(True, False, True, False)

In [20]:
(S_train != s_train).sum()

Age                   0
Fare                  0
SibSp                 0
Parch                 0
Pclass                0
Sex                   0
Embarked              0
XGB                   0
LGBM                  0
RGF                   0
LGBM                  0
LogisticRegression    0
RandomForest          0
ExtraTrees            0
dtype: int64

In [21]:
S_train.head(10)

Unnamed: 0_level_0,Age,Fare,SibSp,Parch,Pclass,Sex,Embarked,XGB,LGBM,RGF,LGBM,LogisticRegression,RandomForest,ExtraTrees
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,-0.488867,-0.968076,0.600843,-0.214091,2,1,2,0.074606,0.074469,0.100775,0.04938,0.070499,0.116667,0.24
2,0.584412,0.853865,0.600843,-0.214091,0,0,0,0.968788,0.997527,0.980779,0.993589,0.881958,0.98,1.0
3,-0.275357,-0.454031,-0.290348,-0.214091,2,0,2,0.60566,0.37048,0.545062,0.236752,0.632198,0.5,0.603333
4,0.470091,0.703973,0.600843,-0.214091,0,0,2,0.983498,0.999444,0.992846,0.998213,0.856154,0.993333,0.996667
5,0.470091,-0.379298,-0.290348,-0.214091,2,1,2,0.124782,0.048336,0.124613,0.059352,0.087369,0.013333,0.0
6,-0.013926,-0.314077,-0.290348,-0.214091,2,1,1,0.15194,0.171703,0.197679,0.156891,0.13409,0.12583,0.075185
7,1.155838,0.661086,-0.290348,-0.214091,0,1,2,0.142181,0.015554,0.106792,0.008695,0.33418,0.123333,0.196667
8,-1.425723,0.175234,1.246354,0.668357,2,1,2,0.574851,0.263423,0.49466,0.460988,0.123309,0.436667,0.266667
9,-0.237116,-0.159855,-0.290348,1.092514,2,0,2,0.438893,0.595554,0.548609,0.680116,0.584233,0.696667,0.79
10,-0.976632,0.454031,0.600843,-0.214091,1,0,0,0.952564,0.971436,0.930828,0.979836,0.882394,0.923333,0.93


In [22]:
S_test.head(10)

Unnamed: 0_level_0,Age,Fare,SibSp,Parch,Pclass,Sex,Embarked,XGB,LGBM,RGF,LGBM,LogisticRegression,RandomForest,ExtraTrees
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
892,0.448051,-0.574816,-0.292952,-0.200294,2,1,1,0.06803,0.008663,0.054003,0.011372,0.092539,0.02,0.0
893,0.867106,-1.447363,0.622205,-0.200294,2,0,2,0.15244,0.06851,0.23635,0.100582,0.264049,0.17,0.19
894,1.503398,-0.233133,-0.292952,-0.200294,1,1,1,0.074515,0.118151,0.088207,0.189492,0.127952,0.31,0.13
895,-0.204219,-0.289902,-0.292952,-0.200294,2,1,2,0.185541,0.533763,0.165103,0.356903,0.123053,0.65,0.78
896,-0.531624,-0.158202,0.622205,0.701239,2,0,2,0.416402,0.333417,0.488737,0.265478,0.544121,0.51,0.44
897,-1.040528,-0.260172,-0.292952,-0.200294,2,1,2,0.43374,0.565023,0.386002,0.203383,0.185263,0.346667,0.13
898,0.306122,-0.904269,-0.292952,-0.200294,2,0,1,0.589998,0.120653,0.39647,0.649165,0.53752,0.198,0.0
899,-0.262442,0.432608,0.622205,0.701239,1,1,2,0.155922,0.18468,0.156049,0.128564,0.196209,0.13,0.1
900,-0.849392,-1.100528,-0.292952,-0.200294,2,0,0,0.767893,0.908908,0.789177,0.81592,0.708097,0.9,0.86
901,-0.629668,0.242106,1.231674,-0.200294,2,1,2,0.040767,0.01103,0.013362,0.004065,0.101231,0.07,0.01


In [25]:
y_sub.merge(s_test)

MergeError: Data columns not unique: Index(['XGB', 'LGBM', 'LGBM', 'RGF', 'LogisticRegression', 'RandomForest',
       'ExtraTrees'],
      dtype='object')