# Import

In [1]:
# Basic
import pandas as pd
import numpy as np

import warnings

warnings.simplefilter('ignore')

# ML Toolkit
from robusta.preprocessing import *
from robusta.pipeline import *
from robusta.crossval import *

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import *

# Model
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

%load_ext memory_profiler

Using TensorFlow backend.


# Data

In [2]:
TARGET = 'income'

from catboost.datasets import adult
from sklearn.preprocessing import LabelBinarizer

train, test = adult()

# Target: split it off using the TARGET constant. Rebinding (instead of
# `inplace=True`) keeps the cell free of in-place mutation.
labels_train = train[TARGET]
labels_test = test[TARGET]

train = train.drop(columns=TARGET)
test = test.drop(columns=TARGET)

# Target Binarization: the label -> code mapping is learned from the training
# labels only and reused for the test labels, so both targets are guaranteed
# to share one encoding (independent `.cat.codes` calls would not be).
target_dtype = pd.CategoricalDtype(labels_train.astype('category').cat.categories)

y_train = labels_train.astype(target_dtype).cat.codes
y_test = labels_test.astype(target_dtype).cat.codes

# A code of -1 marks a test label that never occurs in the training target.
assert (y_test >= 0).all(), "test target contains labels unseen in train"

del labels_train, labels_test, target_dtype

In [3]:
# Preview the training features (the target column was removed in the previous cell).
train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States
32557,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States
32558,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40.0,United-States
32559,22.0,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20.0,United-States


In [4]:
# Integer-coded training target.
y_train

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Length: 32561, dtype: int8

# Task & Models

In [5]:
# Evaluation settings shared by every experiment below.
cv = 5               # forwarded to crossval_predict as its `cv` argument
scoring = 'roc_auc'  # forwarded to crossval_predict as `scoring`

In [6]:
# Both models are created with their library-default hyperparameters.
logreg = LogisticRegression()
lgb = LGBMClassifier()

# Unsupervised Encoding

## Built-in Encoder (LightGBM native categorical support)

In [7]:
# Preprocessing fitted once on the full training frame:
#   * object columns  -> Categorizer()
#   * numeric columns -> StandardScaler (wrapped in PandasTransformer)
prep_pipe = FeatureUnion([
    ('category', make_pipeline(TypeSelector('object'), Categorizer())),
    ('numeric', make_pipeline(TypeSelector(np.number),
                              PandasTransformer(StandardScaler()))),
])

# Fit on train, then apply the fitted transformer to test.
X_train, X_test = prep_pipe.fit_transform(train), prep_pipe.transform(test)

In [8]:
# No per-fold transformation in this experiment
# (presumably Identity returns X unchanged -- confirm against robusta).
fold_pipe = Identity()

# Preview the frame the model will be trained on.
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [9]:
# LightGBM stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:12:04]  LGBMClassifier

[16:12:04]  FOLD  0:   0.9255
[16:12:05]  FOLD  1:   0.9240
[16:12:05]  FOLD  2:   0.9297
[16:12:06]  FOLD  3:   0.9298
[16:12:06]  FOLD  4:   0.9303

[16:12:06]  AVERAGE:   [33m0.9279[0m ± 0.0026



0.9279709821283886

## One-Hot-Encoder

In [10]:
from dask_ml.preprocessing import DummyEncoder

# Preprocessing fitted once on the full training frame:
#   * object columns  -> Categorizer() -> DummyEncoder()
#   * numeric columns -> StandardScaler (wrapped in PandasTransformer)
prep_pipe = FeatureUnion([
    ('category', make_pipeline(TypeSelector('object'),
                               Categorizer(),
                               DummyEncoder())),
    ('numeric', make_pipeline(TypeSelector(np.number),
                              PandasTransformer(StandardScaler()))),
])

# Fit on train, then apply the fitted transformer to test.
X_train, X_test = prep_pipe.fit_transform(train), prep_pipe.transform(test)

In [11]:
# No per-fold transformation in this experiment
# (presumably Identity returns X unchanged -- confirm against robusta).
fold_pipe = Identity()

# Preview the frame the models will be trained on.
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,education_11th,...,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [12]:
# LightGBM stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:12:07]  LGBMClassifier

[16:12:08]  FOLD  0:   0.9271
[16:12:08]  FOLD  1:   0.9243
[16:12:09]  FOLD  2:   0.9282
[16:12:09]  FOLD  3:   0.9292
[16:12:10]  FOLD  4:   0.9304

[16:12:10]  AVERAGE:   [33m0.9278[0m ± 0.0021



0.9279098843889422

In [13]:
# Logistic regression stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:12:10]  LogisticRegression

[16:12:10]  FOLD  0:   0.9042
[16:12:11]  FOLD  1:   0.8998
[16:12:11]  FOLD  2:   0.9082
[16:12:11]  FOLD  3:   0.9108
[16:12:12]  FOLD  4:   0.9115

[16:12:12]  AVERAGE:   [33m0.9069[0m ± 0.0044



0.9054884985910093

## Label Encoder

In [14]:
# Preprocessing fitted once on the full training frame:
#   * object columns  -> LabelEncoder() -> DowncastTransformer()
#   * numeric columns -> StandardScaler (wrapped in PandasTransformer)
prep_pipe = FeatureUnion([
    ('category', make_pipeline(TypeSelector('object'),
                               LabelEncoder(),
                               DowncastTransformer())),
    ('numeric', make_pipeline(TypeSelector(np.number),
                              PandasTransformer(StandardScaler()))),
])

# Fit on train, then apply the fitted transformer to test.
X_train, X_test = prep_pipe.fit_transform(train), prep_pipe.transform(test)

In [15]:
# No per-fold transformation in this experiment
# (presumably Identity returns X unchanged -- confirm against robusta).
fold_pipe = Identity()

# Preview the frame the models will be trained on.
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,6,9,4,0,1,4,1,38,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,5,9,2,3,0,4,1,38,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,3,11,0,5,1,4,1,38,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,3,1,2,5,0,2,1,38,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,3,9,2,9,5,2,0,4,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,3,7,2,12,5,4,0,38,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,3,11,2,6,0,4,1,38,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,3,11,6,0,4,4,0,38,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,3,11,4,0,3,4,1,38,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [16]:
# LightGBM stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:12:12]  LGBMClassifier

[16:12:13]  FOLD  0:   0.9262
[16:12:13]  FOLD  1:   0.9234
[16:12:13]  FOLD  2:   0.9278
[16:12:14]  FOLD  3:   0.9286
[16:12:14]  FOLD  4:   0.9294

[16:12:14]  AVERAGE:   [33m0.9271[0m ± 0.0021



0.9273066435323275

In [17]:
# Logistic regression stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:12:14]  LogisticRegression

[16:12:15]  FOLD  0:   0.8426
[16:12:15]  FOLD  1:   0.8506
[16:12:15]  FOLD  2:   0.8550
[16:12:15]  FOLD  3:   0.8565
[16:12:15]  FOLD  4:   0.8646

[16:12:15]  AVERAGE:   [33m0.8539[0m ± 0.0072



0.8509506114060406

## Frequency Encoder

In [18]:
# Preprocessing fitted once on the full training frame:
#   * object columns  -> FrequencyEncoder() -> SimpleImputer()
#   * numeric columns -> StandardScaler (wrapped in PandasTransformer)
prep_pipe = FeatureUnion([
    ('category', make_pipeline(TypeSelector('object'),
                               FrequencyEncoder(),
                               SimpleImputer())),
    ('numeric', make_pipeline(TypeSelector(np.number),
                              PandasTransformer(StandardScaler()))),
])

# Fit on train, then apply the fitted transformer to test.
X_train, X_test = prep_pipe.fit_transform(train), prep_pipe.transform(test)

In [19]:
# No per-fold transformation in this experiment
# (presumably Identity returns X unchanged -- confirm against robusta).
fold_pipe = Identity()

# Preview the frame the models will be trained on.
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.042246,0.164461,0.328092,0.122729,0.255060,0.854274,0.669205,0.912190,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.082701,0.164461,0.459937,0.132365,0.405178,0.854274,0.669205,0.912190,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,0.738682,0.322502,0.136452,0.044599,0.255060,0.854274,0.669205,0.912190,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,0.738682,0.036086,0.459937,0.044599,0.405178,0.095943,0.669205,0.912190,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,0.738682,0.164461,0.459937,0.134774,0.048156,0.095943,0.330795,0.002971,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.738682,0.032769,0.459937,0.030210,0.048156,0.854274,0.330795,0.912190,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.738682,0.322502,0.459937,0.065174,0.405178,0.854274,0.669205,0.912190,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,0.738682,0.322502,0.030497,0.122729,0.105832,0.854274,0.330795,0.912190,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,0.738682,0.322502,0.328092,0.122729,0.155646,0.854274,0.669205,0.912190,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [20]:
# LightGBM stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:12:16]  LGBMClassifier

[16:12:16]  FOLD  0:   0.9257
[16:12:16]  FOLD  1:   0.9235
[16:12:17]  FOLD  2:   0.9282
[16:12:17]  FOLD  3:   0.9294
[16:12:17]  FOLD  4:   0.9294

[16:12:18]  AVERAGE:   [33m0.9273[0m ± 0.0023



0.9275420538333397

In [21]:
# Logistic regression stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:12:18]  LogisticRegression

[16:12:18]  FOLD  0:   0.8782
[16:12:18]  FOLD  1:   0.8758
[16:12:18]  FOLD  2:   0.8847
[16:12:18]  FOLD  3:   0.8848
[16:12:19]  FOLD  4:   0.8870

[16:12:19]  AVERAGE:   [33m0.8821[0m ± 0.0043



0.883590395485542

## SVDEncoder

In [22]:
# Preprocessing fitted once on the full training frame:
#   * object columns  -> SimpleImputer('mode') -> SVDEncoder(0.99)
#   * numeric columns -> StandardScaler (wrapped in PandasTransformer)
prep_pipe = FeatureUnion([
    ('category', make_pipeline(TypeSelector('object'),
                               SimpleImputer('mode'),
                               SVDEncoder(0.99))),
    ('numeric', make_pipeline(TypeSelector(np.number),
                              PandasTransformer(StandardScaler()))),
])

# Fit on train, then apply the fitted transformer to test.
X_train, X_test = prep_pipe.fit_transform(train), prep_pipe.transform(test)

In [23]:
# No per-fold transformation in this experiment
# (presumably Identity returns X unchanged -- confirm against robusta).
fold_pipe = Identity()

# Preview the frame the models will be trained on.
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,"(workclass,education)_svd1","(workclass,education)_svd2","(workclass,education)_svd3","(workclass,education)_svd4","(workclass,marital-status)_svd1","(workclass,marital-status)_svd2","(workclass,marital-status)_svd3","(workclass,occupation)_svd1","(workclass,occupation)_svd2","(workclass,occupation)_svd3",...,"(native-country,relationship)_svd2","(native-country,race)_svd1","(native-country,race)_svd2","(native-country,sex)_svd1",age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,-0.044457,-0.462690,0.323057,0.490931,-0.052138,0.064967,0.271657,-0.054514,-0.406027,0.256636,...,-0.022181,-0.999686,-0.000088,-0.999634,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,-0.098856,-0.087365,-0.897789,0.107221,-0.109943,0.817447,-0.180208,-0.097113,0.531386,0.734411,...,-0.022181,-0.999686,-0.000088,-0.999634,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,-0.989940,0.107946,0.081374,-0.023666,-0.987102,-0.145197,-0.062052,-0.988299,0.024462,-0.147787,...,-0.022181,-0.999686,-0.000088,-0.999634,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,-0.989940,0.107946,0.081374,-0.023666,-0.987102,-0.145197,-0.062052,-0.988299,0.024462,-0.147787,...,-0.022181,-0.999686,-0.000088,-0.999634,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,-0.989940,0.107946,0.081374,-0.023666,-0.987102,-0.145197,-0.062052,-0.988299,0.024462,-0.147787,...,0.066011,-0.003428,0.005474,-0.003017,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-0.989940,0.107946,0.081374,-0.023666,-0.987102,-0.145197,-0.062052,-0.988299,0.024462,-0.147787,...,-0.022181,-0.999686,-0.000088,-0.999634,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,-0.989940,0.107946,0.081374,-0.023666,-0.987102,-0.145197,-0.062052,-0.988299,0.024462,-0.147787,...,-0.022181,-0.999686,-0.000088,-0.999634,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,-0.989940,0.107946,0.081374,-0.023666,-0.987102,-0.145197,-0.062052,-0.988299,0.024462,-0.147787,...,-0.022181,-0.999686,-0.000088,-0.999634,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,-0.989940,0.107946,0.081374,-0.023666,-0.987102,-0.145197,-0.062052,-0.988299,0.024462,-0.147787,...,-0.022181,-0.999686,-0.000088,-0.999634,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [24]:
# LightGBM stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:12:20]  LGBMClassifier

[16:12:21]  FOLD  0:   0.9256
[16:12:23]  FOLD  1:   0.9251
[16:12:24]  FOLD  2:   0.9282
[16:12:26]  FOLD  3:   0.9287
[16:12:28]  FOLD  4:   0.9296

[16:12:28]  AVERAGE:   [33m0.9274[0m ± 0.0018



0.9275317663289563

In [25]:
# Logistic regression stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:12:28]  LogisticRegression

[16:12:31]  FOLD  0:   0.9031
[16:12:34]  FOLD  1:   0.8990
[16:12:37]  FOLD  2:   0.9064
[16:12:41]  FOLD  3:   0.9086
[16:12:44]  FOLD  4:   0.9110

[16:12:44]  AVERAGE:   [33m0.9056[0m ± 0.0042



0.9039062093243682

# Supervised Encoding

## Naive Bayes Encoder

In [26]:
from dask_ml.preprocessing import OneHotEncoder

# Unsupervised preprocessing fitted once on the full training frame:
#   * object columns  -> Categorizer()
#   * numeric columns -> StandardScaler (wrapped in PandasTransformer)
# The supervised encoding itself is configured separately in `fold_pipe`.
prep_pipe = FeatureUnion([
    ('category', make_pipeline(TypeSelector('object'), Categorizer())),
    ('numeric', make_pipeline(TypeSelector(np.number),
                              PandasTransformer(StandardScaler()))),
])

# Fit on train, then apply the fitted transformer to test.
X_train, X_test = prep_pipe.fit_transform(train), prep_pipe.transform(test)

In [27]:
# Supervised step, kept apart from `prep_pipe` so that it becomes part of the
# estimator handed to crossval_predict:
#   * category columns -> DummyEncoder() -> NaiveBayesEncoder()
#   * numeric columns  -> selected as they are
fold_pipe = FeatureUnion([
    ('category', make_pipeline(TypeSelector('category'),
                               DummyEncoder(),
                               NaiveBayesEncoder())),
    ('numeric', make_pipeline(TypeSelector(np.number))),
])

# Preview the encoding when fitted on the whole training set.
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,education_11th,...,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.0,0.0,0.0,-0.000000,0.000000,0.000000,0.171882,-0.0,-0.0,-0.000000,...,0.0,0.027329,-0.0,0.0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.0,0.0,0.0,-0.000000,0.000000,0.231795,0.000000,-0.0,-0.0,-0.000000,...,0.0,0.027329,-0.0,0.0,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,0.0,0.0,0.0,-0.124881,0.000000,0.000000,0.000000,-0.0,-0.0,-0.000000,...,0.0,0.027329,-0.0,0.0,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,0.0,0.0,0.0,-0.124881,0.000000,0.000000,0.000000,-0.0,-0.0,-1.698886,...,0.0,0.027329,-0.0,0.0,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,0.0,0.0,0.0,-0.124881,0.000000,0.000000,0.000000,-0.0,-0.0,-0.000000,...,0.0,0.000000,-0.0,0.0,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.0,0.0,0.0,-0.124881,0.000000,0.000000,0.000000,-0.0,-0.0,-0.000000,...,0.0,0.027329,-0.0,0.0,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.0,0.0,0.0,-0.124881,0.000000,0.000000,0.000000,-0.0,-0.0,-0.000000,...,0.0,0.027329,-0.0,0.0,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,0.0,0.0,0.0,-0.124881,0.000000,0.000000,0.000000,-0.0,-0.0,-0.000000,...,0.0,0.027329,-0.0,0.0,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,0.0,0.0,0.0,-0.124881,0.000000,0.000000,0.000000,-0.0,-0.0,-0.000000,...,0.0,0.027329,-0.0,0.0,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [28]:
# LightGBM stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:12:45]  LGBMClassifier

[16:12:46]  FOLD  0:   0.9271
[16:12:48]  FOLD  1:   0.9243
[16:12:49]  FOLD  2:   0.9282
[16:12:50]  FOLD  3:   0.9292
[16:12:51]  FOLD  4:   0.9304

[16:12:51]  AVERAGE:   [33m0.9278[0m ± 0.0021



0.9279115571538825

In [29]:
# Logistic regression stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:12:51]  LogisticRegression

[16:12:52]  FOLD  0:   0.9044
[16:12:53]  FOLD  1:   0.8995
[16:12:54]  FOLD  2:   0.9082
[16:12:55]  FOLD  3:   0.9102
[16:12:56]  FOLD  4:   0.9116

[16:12:56]  AVERAGE:   [33m0.9068[0m ± 0.0044



0.9054412848005677

## Target Encoder

In [30]:
# Unsupervised preprocessing fitted once on the full training frame:
#   * object columns  -> selected as they are (encoded later in `fold_pipe`)
#   * numeric columns -> StandardScaler (wrapped in PandasTransformer)
prep_pipe = FeatureUnion([
    ('category', make_pipeline(TypeSelector('object'))),
    ('numeric', make_pipeline(TypeSelector(np.number),
                              PandasTransformer(StandardScaler()))),
])

# Fit on train, then apply the fitted transformer to test.
X_train, X_test = prep_pipe.fit_transform(train), prep_pipe.transform(test)

In [31]:
# Supervised step, kept apart from `prep_pipe` so that it becomes part of the
# estimator handed to crossval_predict:
#   * object columns  -> FastEncoder() -> SimpleImputer('median')
#   * numeric columns -> selected as they are
fold_pipe = FeatureUnion([
    ('category', make_pipeline(TypeSelector('object'),
                               FastEncoder(),
                               SimpleImputer('median'))),
    ('numeric', make_pipeline(TypeSelector(np.number))),
])

# Preview the encoding when fitted on the whole training set.
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.271957,0.414753,0.045961,0.134483,0.103070,0.25586,0.305737,0.245835,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.284927,0.414753,0.446848,0.484014,0.448571,0.25586,0.305737,0.245835,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,0.218673,0.159509,0.104209,0.062774,0.103070,0.25586,0.305737,0.245835,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,0.218673,0.051064,0.446848,0.062774,0.448571,0.12388,0.305737,0.245835,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,0.218673,0.414753,0.446848,0.449034,0.475128,0.12388,0.109461,0.263158,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.218673,0.248360,0.446848,0.304957,0.475128,0.25586,0.109461,0.245835,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.218673,0.159509,0.446848,0.124875,0.448571,0.25586,0.305737,0.245835,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,0.218673,0.159509,0.085599,0.134483,0.063262,0.25586,0.109461,0.245835,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,0.218673,0.159509,0.045961,0.134483,0.013220,0.25586,0.305737,0.245835,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [32]:
# LightGBM stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:12:57]  LGBMClassifier

[16:12:58]  FOLD  0:   0.9260
[16:12:59]  FOLD  1:   0.9253
[16:13:00]  FOLD  2:   0.9291
[16:13:01]  FOLD  3:   0.9295
[16:13:02]  FOLD  4:   0.9295

[16:13:02]  AVERAGE:   [33m0.9279[0m ± 0.0018



0.9278804646355537

In [33]:
# Logistic regression stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:13:02]  LogisticRegression

[16:13:03]  FOLD  0:   0.8996
[16:13:04]  FOLD  1:   0.8960
[16:13:05]  FOLD  2:   0.9038
[16:13:05]  FOLD  3:   0.9046
[16:13:06]  FOLD  4:   0.9069

[16:13:06]  AVERAGE:   [33m0.9022[0m ± 0.0039



0.903143637607185

## CatBoost Encoder

In [34]:
# Unsupervised preprocessing fitted once on the full training frame:
#   * object columns  -> selected as they are (encoded later in `fold_pipe`)
#   * numeric columns -> StandardScaler (wrapped in PandasTransformer)
prep_pipe = FeatureUnion([
    ('category', make_pipeline(TypeSelector('object'))),
    ('numeric', make_pipeline(TypeSelector(np.number),
                              PandasTransformer(StandardScaler()))),
])

# Fit on train, then apply the fitted transformer to test.
X_train, X_test = prep_pipe.fit_transform(train), prep_pipe.transform(test)

In [35]:
# Supervised step, kept apart from `prep_pipe` so that it becomes part of the
# estimator handed to crossval_predict:
#   * object columns  -> CatBoostEncoder()
#   * numeric columns -> selected as they are
fold_pipe = FeatureUnion([
    ('category', make_pipeline(TypeSelector('object'), CatBoostEncoder())),
    ('numeric', make_pipeline(TypeSelector(np.number))),
])

# Preview the encoding when fitted on the whole training set.
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.271933,0.414720,0.045979,0.134511,0.103087,0.255859,0.305734,0.245835,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.284910,0.414720,0.446835,0.483954,0.448555,0.255859,0.305734,0.245835,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,0.218674,0.159516,0.104240,0.062904,0.103087,0.255859,0.305734,0.245835,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,0.218674,0.051225,0.446835,0.062904,0.448555,0.123917,0.305734,0.245835,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,0.218674,0.414720,0.446835,0.448984,0.474978,0.123917,0.109473,0.262925,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.218674,0.248353,0.446835,0.304888,0.474978,0.255859,0.109473,0.245835,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.218674,0.159516,0.446835,0.124933,0.448555,0.255859,0.305734,0.245835,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,0.218674,0.159516,0.085755,0.134511,0.063313,0.255859,0.109473,0.245835,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,0.218674,0.159516,0.045979,0.134511,0.013265,0.255859,0.305734,0.245835,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [36]:
# LightGBM stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:13:07]  LGBMClassifier

[16:13:09]  FOLD  0:   0.9263
[16:13:10]  FOLD  1:   0.9248
[16:13:12]  FOLD  2:   0.9290
[16:13:13]  FOLD  3:   0.9293
[16:13:15]  FOLD  4:   0.9290

[16:13:15]  AVERAGE:   [33m0.9277[0m ± 0.0018



0.9277661834257851

In [37]:
# Logistic regression stacked on top of the per-fold transformer.
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validate on train; the second return value (requested through
# `X_new=X_test`) is scored against the hold-out target below.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Hold-out ROC AUC (shown as the cell output).
roc_auc_score(y_test, y_pred)

[16:13:15]  LogisticRegression

[16:13:16]  FOLD  0:   0.8996
[16:13:17]  FOLD  1:   0.8960
[16:13:18]  FOLD  2:   0.9038
[16:13:19]  FOLD  3:   0.9046
[16:13:20]  FOLD  4:   0.9069

[16:13:20]  AVERAGE:   [33m0.9022[0m ± 0.0039



0.9031477567908506

# Supervised CV Encoding

In [47]:
from joblib import Parallel, delayed
import multiprocessing

from sklearn.base import clone, BaseEstimator, TransformerMixin
from sklearn.model_selection import check_cv

from sklearn.utils.multiclass import type_of_target


class EncoderCV(BaseEstimator, TransformerMixin):
    """Cross-validated wrapper for supervised encoders.

    A fresh clone of ``encoder`` is fitted on the training part of every CV
    split. ``fit_transform`` encodes each row with the clone that was fitted
    without that row (out-of-fold encoding); ``transform`` encodes the data
    with every fitted clone and averages the results column by column.

    Parameters
    ----------
    encoder : encoder instance
        Supervised encoder, cloned once per split. It is used through
        ``encoder.fit(X, y)`` and ``encoder.transform(X)``; ``transform`` must
        return a pandas DataFrame that contains at least the columns of ``X``.

    cv : int, cross-validation generator or an iterable, optional (default=5)
        Determines the cross-validation splitting strategy. The value is
        resolved with ``sklearn.model_selection.check_cv``:

            - integer, to specify the number of folds,
            - CV splitter,
            - an iterable yielding (train, test) splits as arrays of indices.

        The ``classifier`` flag given to ``check_cv`` is True for a binary
        target and False for a continuous one. Any other target type is
        rejected with a ``ValueError``.

    n_jobs : int or None, optional (default=None)
        Passed to ``joblib.Parallel`` for fitting and transforming.

    Attributes
    ----------
    cv_ : cross-validator
        The splitter returned by ``check_cv``.

    encoders_ : list
        One fitted encoder per split. Set by both ``fit`` and
        ``fit_transform``.

    """
    def __init__(self, encoder, cv=5, n_jobs=None):
        self.encoder = encoder
        self.cv = cv
        self.n_jobs = n_jobs

    def fit(self, X, y, groups=None):
        """Fit one clone of ``encoder`` on the training part of every split.

        ``X`` and ``y`` are sliced with ``.iloc`` and therefore must be pandas
        objects. Returns ``self``.
        """
        self.cv_ = self._check_cv(y)

        self.encoders_ = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit)(clone(self.encoder), X, y, trn)
            for trn, _ in self.cv_.split(X, y, groups))

        return self

    def fit_transform(self, X, y, groups=None):
        """Fit the per-split encoders and return the out-of-fold encoding of ``X``.

        The fitted encoders are stored in ``encoders_`` so that ``transform``
        can be called afterwards. The returned frame has the columns of ``X``
        and is re-aligned to ``X.index``, so its row order matches the input
        even when the folds are interleaved or shuffled.
        """
        self.cv_ = self._check_cv(y)

        results = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit_transform)(clone(self.encoder), X, y, trn, oof)
            for trn, oof in self.cv_.split(X, y, groups))

        # Keep the fitted clones: without them a later transform() would fail.
        self.encoders_ = [encoder for encoder, _ in results]
        preds = [pred for _, pred in results]

        return self._mean_preds(preds).reindex(index=X.index)[X.columns]

    def transform(self, X):
        """Encode ``X`` with every fitted encoder and average the results."""
        preds = Parallel(n_jobs=self.n_jobs)(
            delayed(self._transform)(encoder, X)
            for encoder in self.encoders_)

        return self._mean_preds(preds)[X.columns]

    def _mean_preds(self, preds):
        """Average equally named columns of the per-split frames.

        The frames are aligned on their index. Rows missing from a frame
        become NaN and are skipped by the mean.
        """
        stacked = pd.concat(preds, axis=1)
        # Group the duplicated column labels through a transpose:
        # ``groupby(..., axis=1)`` is deprecated in recent pandas releases.
        return stacked.T.groupby(level=0).mean().T

    def _fit(self, encoder, X, y, trn):
        """Fit ``encoder`` on the rows at positions ``trn``; return the fit result."""
        return encoder.fit(X.iloc[trn], y.iloc[trn])

    def _fit_transform(self, encoder, X, y, trn, oof):
        """Fit on rows ``trn`` and encode rows ``oof``.

        Returns the pair ``(fitted encoder, encoded out-of-fold frame)``.
        """
        fitted = encoder.fit(X.iloc[trn], y.iloc[trn])
        return fitted, fitted.transform(X.iloc[oof])

    def _transform(self, encoder, X):
        """Encode ``X`` with an already fitted ``encoder``."""
        return encoder.transform(X)

    def _check_cv(self, y):
        """Resolve ``self.cv`` into a splitter suited to the type of ``y``.

        Raises
        ------
        ValueError
            If ``type_of_target(y)`` is neither 'binary' nor 'continuous'.
        """
        task_type = type_of_target(y)

        if task_type == 'binary':
            classifier = True
        elif task_type == 'continuous':
            classifier = False
        else:
            raise ValueError("Unsupported task type '{}'".format(task_type))

        # `classifier` is keyword-only in current scikit-learn releases.
        return check_cv(self.cv, y, classifier=classifier)

In [48]:
# Out-of-fold encoding of the training set with the default cv=5.
EncoderCV(FastEncoder()).fit_transform(X_train, y_train)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.267686,0.418123,0.046206,0.136650,0.103890,0.255661,0.307378,0.246274,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.285785,0.418123,0.445167,0.482219,0.448823,0.255661,0.307378,0.246274,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,0.217800,0.155037,0.105737,0.061662,0.103890,0.255661,0.307378,0.246274,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,0.217800,0.050429,0.445167,0.061662,0.448823,0.123437,0.307378,0.246274,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,0.217800,0.418123,0.445167,0.447416,0.463277,0.123437,0.106755,0.302632,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.217915,0.250297,0.447476,0.311050,0.478639,0.255935,0.109213,0.245921,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.217915,0.160655,0.447476,0.122884,0.448778,0.255935,0.305756,0.245921,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,0.217915,0.160655,0.079602,0.132965,0.060727,0.255935,0.109213,0.245921,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,0.217915,0.160655,0.048155,0.132965,0.013834,0.255935,0.305756,0.245921,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [49]:
# Reference for comparison: the same encoder fitted and applied on the full training set (no CV).
FastEncoder().fit_transform(X_train, y_train)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.271957,0.414753,0.045961,0.134483,0.103070,0.25586,0.305737,0.245835,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.284927,0.414753,0.446848,0.484014,0.448571,0.25586,0.305737,0.245835,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,0.218673,0.159509,0.104209,0.062774,0.103070,0.25586,0.305737,0.245835,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,0.218673,0.051064,0.446848,0.062774,0.448571,0.12388,0.305737,0.245835,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,0.218673,0.414753,0.446848,0.449034,0.475128,0.12388,0.109461,0.263158,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.218673,0.248360,0.446848,0.304957,0.475128,0.25586,0.109461,0.245835,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.218673,0.159509,0.446848,0.124875,0.448571,0.25586,0.305737,0.245835,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,0.218673,0.159509,0.085599,0.134483,0.063262,0.25586,0.109461,0.245835,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,0.218673,0.159509,0.045961,0.134483,0.013220,0.25586,0.305737,0.245835,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225
