## Imports

In [1]:
# Basic imports
import pandas as pd
import numpy as np

# Warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices emitted by
# the libraries used below — consider narrowing it once the notebook is stable.
import warnings 
warnings.simplefilter("ignore")

In [2]:
# Plot
# Select the notebook backend *before* anything imports pyplot: seaborn and
# matplotlib.pylab both pull pyplot in, and older Matplotlib releases ignore a
# `matplotlib.use` call made after that point (the warning they emit is
# suppressed by the global warnings filter set in the first cell).
import matplotlib
matplotlib.use('nbagg')

from IPython import display
import seaborn as sns

import matplotlib.pylab as plt
from jupyterthemes import jtplot

# Apply the 'gruvboxd' jupyterthemes style to subsequent figures
jtplot.style('gruvboxd')

## Data Reading

In [3]:
from catboost.datasets import titanic

# Load the Titanic train/test frames and key each of them by passenger id
df_train, df_test = (frame.set_index('PassengerId') for frame in titanic())

# The feature set is whatever columns the test frame carries
target = 'Survived'
features = df_test.columns

# Pull the label out first, then narrow the training frame to the features
y_train = df_train[target]
df_train = df_train[features]

df_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Data Preparation

In [4]:
from robusta.preprocessing.category import *
from robusta.preprocessing.numeric import *
from robusta.preprocessing import *
from robusta.compose import *

# Column groups; each group is routed through its own preprocessing branch
nums = ['Age', 'Fare', 'SibSp', 'Parch']
cats = ['Pclass', 'Sex', 'Embarked']

# Numeric branch: median imputation, GaussRank, then a 'gr_' column prefix
numeric_branch = make_pipeline(
    ColumnSelector(nums),
    Imputer(strategy="median"),
    GaussRank(),
    ColumnRenamer(prefix='gr_'),
)

# Categorical branch: label encoding to 'category' dtype, then an 'le_' prefix
category_branch = make_pipeline(
    ColumnSelector(cats),
    LabelEncoder(dtype='category'),
    ColumnRenamer(prefix='le_'),
)

data_prep = FeatureUnion([
    ("numeric", numeric_branch),
    ("category", category_branch),
])

# Fit on the training frame only; the test frame reuses the fitted transformers
X_train = data_prep.fit_transform(df_train)
X_test = data_prep.transform(df_test)

X_train.head()

Unnamed: 0_level_0,gr_Age,gr_Fare,gr_SibSp,gr_Parch,le_Pclass,le_Sex,le_Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,-0.488867,-0.968076,0.600843,-0.214091,2,1,2
2,0.584412,0.853865,0.600843,-0.214091,0,0,0
3,-0.275357,-0.454031,-0.290348,-0.214091,2,0,2
4,0.470091,0.703973,0.600843,-0.214091,0,0,2
5,0.470091,-0.379298,-0.290348,-0.214091,2,1,2


## Fold Preparation

In [5]:
# Categorical branch: category/object columns go through TargetEncoderCV
# (cv=4, encoder smoothing set to 2.0)
category_fold = make_pipeline(
    TypeSelector(['category', 'object']),
    TargetEncoderCV(cv=4).set_params(encoder__smoothing=2.0),
)

# Numeric branch: numeric columns go through Identity
# (presumably a pass-through — the sampled output shows the gr_* values unchanged)
numeric_fold = make_pipeline(
    TypeSelector(np.number),
    Identity(),
)

fold_prep = FeatureUnion([
    ('category', category_fold),
    ('numeric', numeric_fold),
])

# Preview of the fold-level features; the target is passed because the
# categorical branch is fitted against it
F_train = fold_prep.fit_transform(X_train, y_train)

F_train.sample(5, random_state=555)

Unnamed: 0_level_0,le_Embarked,le_Pclass,le_Sex,gr_Age,gr_Fare,gr_SibSp,gr_Parch
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
876,0.539062,0.25,0.739669,-0.944113,-1.132927,-0.290348,-0.214091
595,0.336082,0.442857,0.177986,0.56094,0.277085,0.600843,-0.214091
133,0.334711,0.217514,0.741379,0.888832,0.016949,0.600843,-0.214091
233,0.341615,0.5,0.201357,1.332614,-0.02792,-0.290348,-0.214091
506,0.532787,0.62987,0.177986,-0.757656,1.132927,0.600843,-0.214091


## Estimator

In [6]:
from lightgbm import LGBMClassifier

# LightGBM classifier built with the library defaults (no arguments are passed)
model = LGBMClassifier()

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [7]:
%%time
from sklearn.model_selection import cross_val_score
from sklearn.metrics import *

estimator = make_pipeline(fold_prep, model)

cv = 5
objective = 'neg_log_loss'
metric = 'accuracy'

scores = cross_val_score(estimator, X_train, y_train, cv=cv, scoring=objective)

print('{:.4f} ± {:.4f}'.format(np.mean(scores), np.std(scores)))

-0.4780 ± 0.0828
CPU times: user 14.3 s, sys: 3.31 s, total: 17.6 s
Wall time: 6.49 s


## (Fold) Optimizer

In [8]:
%%time
from sklearn.model_selection import GridSearchCV

param = 'featureunion__category__encodercv__encoder__smoothing'
param_space = [1, 5, 10, 50, 100, 150, 200, 250, 300, 500]
space = {param: param_space}

gs = GridSearchCV(estimator, space, cv=cv, scoring=objective)
gs.fit(X_train, y_train)



CPU times: user 3min 15s, sys: 51.4 s, total: 4min 7s
Wall time: 1min 32s


In [9]:
# Tabulate the grid-search results: one row per candidate smoothing value
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_featureunion__category__encodercv__encoder__smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.955264,0.19368,0.341713,0.024875,1,{'featureunion__category__encodercv__encoder__...,-0.492346,-0.629424,-0.405333,-0.474433,...,-0.480619,0.083069,8,-0.135267,-0.11773,-0.132926,-0.133479,-0.138392,-0.131559,0.007173
1,1.056396,0.208835,0.362412,0.005261,5,{'featureunion__category__encodercv__encoder__...,-0.480666,-0.629424,-0.405333,-0.474433,...,-0.478273,0.08287,5,-0.135268,-0.11773,-0.132926,-0.133479,-0.138392,-0.131559,0.007173
2,1.083609,0.149934,0.35766,0.025907,10,{'featureunion__category__encodercv__encoder__...,-0.480666,-0.629424,-0.405333,-0.474433,...,-0.478273,0.08287,5,-0.135268,-0.11773,-0.132926,-0.133479,-0.138392,-0.131559,0.007173
3,1.089239,0.078573,0.3618,0.014518,50,{'featureunion__category__encodercv__encoder__...,-0.517186,-0.630715,-0.386426,-0.474433,...,-0.482092,0.08864,10,-0.137735,-0.11773,-0.139078,-0.133479,-0.138392,-0.133283,0.008018
4,0.952853,0.167342,0.385243,0.036159,100,{'featureunion__category__encodercv__encoder__...,-0.507194,-0.630715,-0.386426,-0.479731,...,-0.481143,0.08789,9,-0.137735,-0.11773,-0.139078,-0.133479,-0.138392,-0.133283,0.008018
5,1.028763,0.185539,0.355319,0.047838,150,{'featureunion__category__encodercv__encoder__...,-0.507194,-0.630715,-0.386426,-0.474433,...,-0.480085,0.087933,7,-0.137735,-0.11773,-0.139078,-0.133479,-0.138392,-0.133283,0.008018
6,1.12907,0.155208,0.358411,0.01508,200,{'featureunion__category__encodercv__encoder__...,-0.481076,-0.630715,-0.388877,-0.474433,...,-0.475228,0.086525,1,-0.136758,-0.11773,-0.137882,-0.133479,-0.137879,-0.132745,0.007679
7,1.157752,0.157898,0.372569,0.0029,250,{'featureunion__category__encodercv__encoder__...,-0.481076,-0.630715,-0.388877,-0.474433,...,-0.475228,0.086525,1,-0.136758,-0.11773,-0.137882,-0.133479,-0.137879,-0.132745,0.007679
8,1.041573,0.132281,0.414848,0.035311,300,{'featureunion__category__encodercv__encoder__...,-0.481076,-0.630715,-0.388877,-0.474433,...,-0.475228,0.086525,1,-0.136758,-0.11773,-0.137882,-0.133479,-0.137879,-0.132745,0.007679
9,1.304625,0.201891,0.358085,0.0487,500,{'featureunion__category__encodercv__encoder__...,-0.483707,-0.630715,-0.388877,-0.474433,...,-0.475756,0.086567,4,-0.136758,-0.11773,-0.137882,-0.133479,-0.137879,-0.132745,0.007679


In [10]:
best_estimator = gs.best_estimator_

# Report accuracy (mean ± std over folds) for the baseline pipeline first,
# then for the tuned pipeline selected by the grid search
for candidate in (estimator, best_estimator):
    scores = cross_val_score(candidate, X_train, y_train, cv=cv, scoring=metric)
    print('{:.4f} ± {:.4f}'.format(np.mean(scores), np.std(scores)))

0.8261 ± 0.0240
0.8272 ± 0.0203


## Submit (Out-of-Fold & Test Predictions)

In [11]:
%%time
from sklearn.model_selection import cross_val_predict

y_oof = cross_val_predict(best_estimator, X_train, y_train, cv=cv)

y_oof = pd.Series(y_oof, index=X_train.index, name=target)
print(y_oof.head())

PassengerId
1    0
2    1
3    0
4    1
5    0
Name: Survived, dtype: int64
CPU times: user 15 s, sys: 3.72 s, total: 18.7 s
Wall time: 8.35 s


In [12]:
%%time
from sklearn.model_selection import cross_validate

scores = cross_validate(best_estimator, X_train, y_train, cv=cv, return_estimator=True)
y_subs = [estimator.predict_proba(X_test)[:,1] for estimator in scores['estimator']]
y_sub = np.rint(np.mean(y_subs, axis=0)).astype(int)

y_sub = pd.Series(y_sub, index=X_test.index, name=target)
print(y_sub.head())

PassengerId
892    0
893    0
894    0
895    1
896    0
Name: Survived, dtype: int64
CPU times: user 22.8 s, sys: 6.22 s, total: 29 s
Wall time: 11.9 s


In [13]:
import os

# Prediction files go into a 'pred' folder under the current working directory
path = os.path.join(os.getcwd(), 'pred')
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the mkdir, and re-running the cell is harmless
os.makedirs(path, exist_ok=True)

sub_path = os.path.join(path, '0 sub baseline.csv')
oof_path = os.path.join(path, '0 oof baseline.csv')

# header=True writes the series name as the CSV column header
y_sub.to_csv(sub_path, header=True)
y_oof.to_csv(oof_path, header=True)