## Imports

In [1]:
# Basic imports
import pandas as pd
import numpy as np

# Warnings
import warnings 
warnings.simplefilter("ignore")

In [2]:
# Plot
from IPython import display
import seaborn as sns

import matplotlib
import matplotlib.pylab as plt
from jupyterthemes import jtplot

jtplot.style('gruvboxd')
matplotlib.use('nbagg')

## Data Reading

In [3]:
from catboost.datasets import titanic

# Data Reading
df_train, df_test = titanic()
df_train.set_index('PassengerId', inplace=True)
df_test.set_index('PassengerId', inplace=True)

# Split X_train, y_train
target = 'Survived'
features = df_test.columns

y_train = df_train[target]
df_train = df_train[features]

df_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Preprocessing

In [4]:
from robusta.preprocessing.category import *
from robusta.preprocessing.numeric import *
from robusta.preprocessing.base import *
from robusta.pipeline import *

nums = ['Age', 'Fare', 'SibSp', 'Parch']
cats = ['Pclass', 'Sex', 'Embarked']

data_prep = FeatureUnion([
        ("numeric", make_pipeline(
            ColumnSelector(nums),
            Imputer(strategy="median"),
            GaussRank(),
            ColumnRenamer(prefix='gr_'),
        )),
        ("category", make_pipeline(
            ColumnSelector(cats),
            Imputer(strategy="most_frequent"),
            LabelEncoder(dtype='category'),
            ColumnRenamer(prefix='le_'),
        )),
])

X_train = data_prep.fit_transform(df_train)
X_test = data_prep.transform(df_test)

X_train.head()

Unnamed: 0_level_0,gr_Age,gr_Fare,gr_SibSp,gr_Parch,le_Pclass,le_Sex,le_Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,-0.488867,-0.968076,0.600843,-0.214091,2,1,2
2,0.584412,0.853865,0.600843,-0.214091,0,0,0
3,-0.275357,-0.454031,-0.290348,-0.214091,2,0,2
4,0.470091,0.703973,0.600843,-0.214091,0,0,2
5,0.470091,-0.379298,-0.290348,-0.214091,2,1,2


## Fold Preparation

In [411]:
from sklearn.model_selection import KFold
from robusta.resampler import *

encoder = FeatureUnion([
    ('category', make_pipeline(
        TypeSelector(['category', 'object']), 
        TargetEncoderCV(cv=4).set_params(encoder__smoothing=200.0),
    )),
    ('numeric', make_pipeline(
        TypeSelector(np.number),
        Identity(),
    )),
])

resampler = SMOTE(random_state=100, k_neighbors=200)

fold_pipe = make_pipeline(resampler, encoder)

F_train = fold_pipe.fit_transform(X_train, y_train)

F_train.sample(5, random_state=555)

Unnamed: 0,le_Embarked,le_Pclass,le_Sex,gr_Age,gr_Fare,gr_SibSp,gr_Parch
928,0.5,0.365267,0.5,0.035459,-0.69624,-0.290348,-0.214091
873,0.442888,0.365267,0.258366,0.888832,-0.271711,-0.290348,-0.214091
293,0.442888,0.365267,0.740852,-0.381808,-0.274934,-0.290348,-0.214091
760,0.442888,0.365267,0.258366,-0.013926,0.016949,-0.290348,-0.214091
126,0.463396,0.365267,0.258366,-0.013926,-0.770943,-0.290348,-0.214091


## Model

In [425]:
%%time
from lightgbm import LGBMClassifier

model = LGBMClassifier()

estimator = make_pipeline(encoder, resampler, estimator)
estimator.fit(X_train, y_train)

y_test = estimator.predict(X_test)

CPU times: user 2.03 s, sys: 423 ms, total: 2.45 s
Wall time: 860 ms


In [424]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import *

cv = KFold(5)
scoring = 'accuracy'

estimator = make_pipeline(model)
scores = cross_val_score(estimator, X_train, y_train, cv=cv, scoring=scoring)
print('{:.4f} ± {:.4f} [model]'.format(np.mean(scores), np.std(scores)))

estimator = make_pipeline(resampler, model)
scores = cross_val_score(estimator, X_train, y_train, cv=cv, scoring=scoring)
print('{:.4f} ± {:.4f} [encoder + model]'.format(np.mean(scores), np.std(scores)))

estimator = make_pipeline(encoder, model)
scores = cross_val_score(estimator, X_train, y_train, cv=cv, scoring=scoring)
print('{:.4f} ± {:.4f} [resampler + model]'.format(np.mean(scores), np.std(scores)))

estimator = make_pipeline(resampler, encoder, model)
scores = cross_val_score(estimator, X_train, y_train, cv=cv, scoring=scoring)
print('{:.4f} ± {:.4f} [resampler + encoder + model]'.format(np.mean(scores), np.std(scores)))

estimator = make_pipeline(encoder, resampler, model)
scores = cross_val_score(estimator, X_train, y_train, cv=cv, scoring=scoring)
print('{:.4f} ± {:.4f} [encoder + resampler + model]'.format(np.mean(scores), np.std(scores)))

# 0.8216 ± 0.0370 (estimator)
# 0.8249 ± 0.0320 (encoder + estimator)
# 0.8261 ± 0.0336 (encoder + estimator)

0.8216 ± 0.0370 [estimator]
0.8249 ± 0.0314 [encoder + estimator]
0.8193 ± 0.0276 [resampler + estimator]
0.8216 ± 0.0281 [resampler + encoder + estimator]
0.7890 ± 0.0271 [encoder + resampler + estimator]


In [426]:
estimator = make_pipeline(encoder, model)

## Submit (Out-of-Fold & Test Predictions)

In [427]:
%%time
from sklearn.model_selection import cross_val_predict

y_oof = cross_val_predict(estimator, X_train, y_train, cv=cv)

y_oof = pd.Series(y_oof, index=X_train.index, name=target)
print(y_oof.head())

PassengerId
1    0
2    1
3    0
4    1
5    0
Name: Survived, dtype: int64
CPU times: user 13.9 s, sys: 2.45 s, total: 16.4 s
Wall time: 4.6 s


In [428]:
%%time
from sklearn.model_selection import cross_validate

scores = cross_validate(estimator, X_train, y_train, cv=cv, return_estimator=True)
y_subs = [estimator.predict_proba(X_test)[:,1] for estimator in scores['estimator']]
y_sub = np.rint(np.mean(y_subs, axis=0)).astype(int)

y_sub = pd.Series(y_sub, index=X_test.index, name=target)
print(y_sub.head())

PassengerId
892    0
893    0
894    0
895    0
896    0
Name: Survived, dtype: int64
CPU times: user 27.2 s, sys: 4.76 s, total: 31.9 s
Wall time: 7.79 s


In [429]:
import os

path = os.path.join(os.getcwd(), 'pred')
if not os.path.exists(path):
    os.mkdir(path)

sub_path = os.path.join(path, '0 sub baseline.csv')
oof_path = os.path.join(path, '0 oof baseline.csv')

y_sub.to_csv(sub_path, header=True)
y_oof.to_csv(oof_path, header=True)

### Score:
### `[CV] 0.8249 ± 0.0314`
### `[LB] 0.7656`