# Imports

In [1]:
# Basic imports
import pandas as pd
import numpy as np

# Warnings
# NOTE(review): the "ignore" action is installed without a category filter, so
# every warning raised after this point is silenced process-wide, including
# deprecation notices from the libraries imported in the following cells.
import warnings 
warnings.simplefilter("ignore")

In [2]:
# Plot
from IPython import display

import matplotlib

# Select the notebook (nbagg) backend *before* anything imports pyplot.
# seaborn and matplotlib.pylab both import pyplot, and on older matplotlib
# releases a matplotlib.use() call issued after that import does not take
# effect (it only warns, and warnings are suppressed in this notebook).
matplotlib.use('nbagg')

import matplotlib.pylab as plt
import seaborn as sns
from jupyterthemes import jtplot

# Apply the jupyterthemes 'gruvboxd' style to subsequent figures.
jtplot.style('gruvboxd')

# Data Reading

In [3]:
from catboost.datasets import titanic

# Load the train/test frames and index each of them by passenger id.
df_train, df_test = (frame.set_index('PassengerId') for frame in titanic())

# The feature set is whatever columns the test frame carries.
target = 'Survived'
features = df_test.columns

# Pull the target out first, then keep only the feature columns for training.
y_train = df_train[target]
df_train = df_train[features]

df_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Preprocessing

In [4]:
from robusta.preprocessing.category import *
from robusta.preprocessing.numeric import *
from robusta.preprocessing.base import *
from robusta.pipeline import *

# Column groups handled by the two preprocessing branches.
nums = ['Age', 'Fare', 'SibSp', 'Parch']
cats = ['Pclass', 'Sex', 'Embarked']

# Numeric branch: select -> median imputation -> GaussRank.
# (A trailing ColumnRenamer(prefix='gr_') step is intentionally left disabled.)
numeric_prep = make_pipeline(
    ColumnSelector(nums),
    Imputer(strategy="median"),
    GaussRank(),
)

# Categorical branch: select -> most-frequent imputation -> LabelEncoder.
# (A trailing ColumnRenamer(prefix='le_') step is intentionally left disabled.)
category_prep = make_pipeline(
    ColumnSelector(cats),
    Imputer(strategy="most_frequent"),
    LabelEncoder(),
)

# Combine both branches; numeric columns come first, then categorical ones.
data_prep = FeatureUnion([
    ("numeric", numeric_prep),
    ("category", category_prep),
])

# Fit on the training frame only, then apply the fitted transform to the test frame.
X_train = data_prep.fit_transform(df_train)
X_test = data_prep.transform(df_test)

X_train.head()

Unnamed: 0_level_0,Age,Fare,SibSp,Parch,Pclass,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,-0.488867,-0.968076,0.600843,-0.214091,2,1,2
2,0.584412,0.853865,0.600843,-0.214091,0,0,0
3,-0.275357,-0.454031,-0.290348,-0.214091,2,0,2
4,0.470091,0.703973,0.600843,-0.214091,0,0,2
5,0.470091,-0.379298,-0.290348,-0.214091,2,1,2


# Fold Preparation

In [5]:
# NOTE(review): KFold is imported here but not used in any cell visible in this notebook.
from sklearn.model_selection import KFold
from robusta.resampler import *

# Feature encoder with two branches:
#   - 'category': the categorical columns are converted to 'object' and passed
#     through TargetEncoderCV (cv=4, inner encoder smoothing set to 200.0);
#   - 'numeric': the numeric columns go through Identity()
#     (presumably a pass-through step -- confirm against robusta).
encoder = FeatureUnion([
    ('category', make_pipeline(
        ColumnSelector(cats),
        TypeConverter('object'),
        TargetEncoderCV(cv=4).set_params(encoder__smoothing=200.0),
    )),
    ('numeric', make_pipeline(
        ColumnSelector(nums),
        Identity(),
    )),
])

# Seeded SMOTE resampler with 30 neighbours.
resampler = SMOTE(random_state=50, k_neighbors=30)

# The resampler is the first step and the encoder the second.
fold_pipe = make_pipeline(resampler, encoder)

F_train = fold_pipe.fit_transform(X_train, y_train)

# NOTE(review): neither fold_pipe nor F_train is referenced by the model or
# cross-validation cells further down; those work on X_train directly.
# Seeded sample so the displayed rows are stable between runs.
F_train.sample(5, random_state=555)

Unnamed: 0,Embarked,Pclass,Sex,Age,Fare,SibSp,Parch
928,0.453032,0.672597,0.78527,0.491782,1.04038,0.600843,-0.214091
873,0.457019,0.354773,0.273246,0.888832,-0.271711,-0.290348,-0.214091
293,0.460378,0.346921,0.800238,-0.381808,-0.274934,-0.290348,-0.214091
760,0.457019,0.354773,0.273246,-0.013926,0.016949,-0.290348,-0.214091
126,0.474911,0.326256,0.252564,-0.013926,-0.770943,-0.290348,-0.214091


# Evaluation

In [6]:
# NOTE(review): RepeatedStratifiedKFold is imported but not used in the cells
# visible in this notebook; `cv` is a plain integer instead.
from sklearn.model_selection import RepeatedStratifiedKFold
from robusta.crossval import *

# Passed as the `cv` argument of every crossval_predict call in this notebook;
# the verbose logs of those calls report five folds (FOLD 0-4).
cv = 5

# Model

In [7]:
%%time
from robusta.model import get_model

# Request the 'LGB' classifier with 100 estimators from robusta's get_model
# (the cross-validation logs identify it as LGBMClassifier).
model = get_model('LGB', 'classifier', n_estimators=100)
# Fit on the preprocessed training set (X_train, not the resampled F_train).
# NOTE(review): presumably crossval_predict refits the model per fold, which
# would make this full fit redundant -- confirm before removing.
model.fit(X_train, y_train)

# Wrap the estimator in a single-step pipeline; this wrapped object is what
# the crossval_predict calls receive.
model = make_pipeline(model)

CPU times: user 448 ms, sys: 17.3 ms, total: 466 ms
Wall time: 72.7 ms


# Submit: cross-validation scores and predictions

In [8]:
%%time
y_oof, y_sub = crossval_predict(model, cv, X_train, y_train, None, X_test, verbose=1, n_jobs=1, 
                                scoring=['roc_auc', 'accuracy', 'neg_log_loss']
                                #scoring='neg_log_loss'
                                #scoring=None
                               )

[20:57:12]  [33m0.8652[0m ± 0.0323 (roc_auc)
[20:57:12]  [33m0.8317[0m ± 0.0338 (accuracy)
[20:57:12] [33m-0.4751[0m ± 0.0941 (neg_log_loss)

CPU times: user 3.83 s, sys: 206 ms, total: 4.04 s
Wall time: 627 ms


In [9]:
%%time
y_oof, y_sub = crossval_predict(model, cv, X_train, y_train, None, X_test, verbose=2, n_jobs=1, 
                                scoring=['roc_auc', 'accuracy', 'neg_log_loss']
                                #scoring='neg_log_loss'
                                #scoring=None
                               )

[20:57:12] LGBMClassifier

[20:57:12] FOLD 0:   0.8404 (roc_auc)   0.7989 (accuracy)  -0.5034 (neg_log_loss)
[20:57:13] FOLD 1:   0.8279 (roc_auc)   0.8212 (accuracy)  -0.6110 (neg_log_loss)
[20:57:13] FOLD 2:   0.8911 (roc_auc)   0.8652 (accuracy)  -0.3935 (neg_log_loss)
[20:57:13] FOLD 3:   0.8624 (roc_auc)   0.8034 (accuracy)  -0.4892 (neg_log_loss)
[20:57:13] FOLD 4:   0.9039 (roc_auc)   0.8701 (accuracy)  -0.3786 (neg_log_loss)

[20:57:13]  [33m0.8652[0m ± 0.0323 (roc_auc)
[20:57:13]  [33m0.8317[0m ± 0.0338 (accuracy)
[20:57:13] [33m-0.4751[0m ± 0.0941 (neg_log_loss)

CPU times: user 5.44 s, sys: 582 ms, total: 6.02 s
Wall time: 1.05 s


In [10]:
%%time
y_oof, y_sub = crossval_predict(model, cv, X_train, y_train, None, X_test, verbose=2, n_jobs=1, 
                                #scoring=['roc_auc', 'accuracy', 'neg_log_loss']
                                #scoring='neg_log_loss'
                                #scoring=None
                               )

[20:57:13] LGBMClassifier

[20:57:13] FOLD 0:   0.8404 (roc_auc)
[20:57:13] FOLD 1:   0.8279 (roc_auc)
[20:57:13] FOLD 2:   0.8911 (roc_auc)
[20:57:13] FOLD 3:   0.8624 (roc_auc)
[20:57:13] FOLD 4:   0.9039 (roc_auc)

[20:57:13]  [33m0.8652[0m ± 0.0323 (roc_auc)

CPU times: user 3.61 s, sys: 194 ms, total: 3.8 s
Wall time: 526 ms


Cross-validated ROC AUC over the 5 folds: `0.8652 ± 0.0323`