# Import

In [1]:
# Basic
import pandas as pd
import numpy as np

import warnings

warnings.simplefilter('ignore')

# ML Toolkit
from robusta.preprocessing import *
from robusta.pipeline import *
from robusta.crossval import *

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import *

# Model
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

%load_ext memory_profiler

Using TensorFlow backend.


# Data

In [2]:
# Name of the label column; every reference below goes through this constant
TARGET = 'income'

from catboost.datasets import adult
from sklearn.preprocessing import LabelBinarizer

# adult() yields the train and test frames of the dataset
train, test = adult()

# Target: keep the label column of each split aside ...
labels_train = train[TARGET]
labels_test = test[TARGET]

# ... and remove it from the features (rebinding instead of mutating in place)
train = train.drop(columns=TARGET)
test = test.drop(columns=TARGET)

# Target Binarization: integer category codes per split.
# NOTE(review): the codes are derived independently for train and test, so
# this assumes both splits contain the same set of label values — confirm.
y_train = labels_train.astype('category').cat.codes
y_test  = labels_test.astype('category').cat.codes

# The raw label series are not needed once the codes exist
del labels_train, labels_test

In [3]:
# Preview the training features (the 'income' label column was dropped above)
train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States
32557,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States
32558,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40.0,United-States
32559,22.0,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20.0,United-States


In [4]:
# Preview the encoded training target (integer category codes of 'income')
y_train

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Length: 32561, dtype: int8

# Task & Models

In [5]:
# Evaluation protocol shared by every experiment below;
# both values are passed straight to crossval_predict.
scoring = 'roc_auc'
cv = 5

In [6]:
# Two reference learners, both constructed with their library defaults:
# a gradient-boosted tree model and a linear model.
lgb = LGBMClassifier()
logreg = LogisticRegression()

# Unsupervised Encoding

## Inbuilt Encoder

In [7]:
# Categorical branch: pick the object-typed columns and pass them through
# Categorizer (no further encoding in this section).
category_branch = make_pipeline(TypeSelector("object"), Categorizer())

# Numeric branch: pick the numeric columns and standardise them.
numeric_branch = make_pipeline(
    TypeSelector(np.number),
    PandasTransformer(StandardScaler()),
)

# Both branches side by side, categorical columns first.
prep_pipe = FeatureUnion([
    ("category", category_branch),
    ("numeric", numeric_branch),
])

# Fit on the training split only, then apply the fitted pipeline to the test split.
X_train = prep_pipe.fit_transform(train)
X_test = prep_pipe.transform(test)

In [8]:
# No per-fold encoding in this section: Identity() fills the fold-level slot
# of the estimator pipeline (presumably a pass-through step — the preview
# below has the same columns as the prep_pipe output).
fold_pipe = Identity()

# Preview what the model will receive
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [9]:
# LightGBM stacked on the fold-level step
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validate on the training split (per-fold scores are logged) and also
# obtain predictions for X_test; the first returned value is not used here.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# ROC AUC of those predictions against the test labels
roc_auc_score(y_test, y_pred)

[17:17:51]  LGBMClassifier

[17:17:51]  FOLD  0:   0.9255
[17:17:52]  FOLD  1:   0.9240
[17:17:52]  FOLD  2:   0.9297
[17:17:53]  FOLD  3:   0.9298
[17:17:53]  FOLD  4:   0.9303

[17:17:54]  AVERAGE:   [33m0.9279[0m ± 0.0026



0.9279709821283886

## One-Hot-Encoder

In [10]:
from dask_ml.preprocessing import DummyEncoder

# Categorical branch: object columns -> Categorizer -> DummyEncoder
# (the preview below shows one indicator column per category value).
category_branch = make_pipeline(
    TypeSelector("object"),
    Categorizer(),
    DummyEncoder(),
)

# Numeric branch: standardised numeric columns.
numeric_branch = make_pipeline(
    TypeSelector(np.number),
    PandasTransformer(StandardScaler()),
)

prep_pipe = FeatureUnion([
    ("category", category_branch),
    ("numeric", numeric_branch),
])

# Fit on train, reuse the fitted pipeline for test.
X_train = prep_pipe.fit_transform(train)
X_test = prep_pipe.transform(test)

In [11]:
# The dummy encoding already happened in prep_pipe, so the fold-level
# step is just Identity() (presumably a pass-through — TODO confirm).
fold_pipe = Identity()

# Preview the one-hot encoded design matrix
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,education_11th,...,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [12]:
# One-hot features + LightGBM
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:17:54]  LGBMClassifier

[17:17:54]  FOLD  0:   0.9271
[17:17:55]  FOLD  1:   0.9243
[17:17:56]  FOLD  2:   0.9282
[17:17:56]  FOLD  3:   0.9292
[17:17:57]  FOLD  4:   0.9304

[17:17:57]  AVERAGE:   [33m0.9278[0m ± 0.0021



0.9279098843889422

In [13]:
# One-hot features + logistic regression
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:17:57]  LogisticRegression

[17:17:57]  FOLD  0:   0.9042
[17:17:58]  FOLD  1:   0.8998
[17:17:58]  FOLD  2:   0.9082
[17:17:58]  FOLD  3:   0.9108
[17:17:58]  FOLD  4:   0.9115

[17:17:59]  AVERAGE:   [33m0.9069[0m ± 0.0044



0.9054884985910093

## Label Encoder

In [14]:
# Categorical branch: object columns -> LabelEncoder (integer codes, see the
# preview below) -> DowncastTransformer (presumably narrows the dtype — confirm).
category_branch = make_pipeline(
    TypeSelector("object"),
    LabelEncoder(),
    DowncastTransformer(),
)

# Numeric branch: standardised numeric columns.
numeric_branch = make_pipeline(
    TypeSelector(np.number),
    PandasTransformer(StandardScaler()),
)

prep_pipe = FeatureUnion([
    ("category", category_branch),
    ("numeric", numeric_branch),
])

# Fit on train, reuse the fitted pipeline for test.
X_train = prep_pipe.fit_transform(train)
X_test = prep_pipe.transform(test)

In [15]:
# Label encoding was done in prep_pipe; the fold-level step stays Identity()
# (presumably a pass-through — TODO confirm).
fold_pipe = Identity()

# Preview the label-encoded design matrix
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,6,9,4,0,1,4,1,38,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,5,9,2,3,0,4,1,38,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,3,11,0,5,1,4,1,38,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,3,1,2,5,0,2,1,38,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,3,9,2,9,5,2,0,4,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,3,7,2,12,5,4,0,38,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,3,11,2,6,0,4,1,38,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,3,11,6,0,4,4,0,38,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,3,11,4,0,3,4,1,38,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [16]:
# Label-encoded features + LightGBM
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:17:59]  LGBMClassifier

[17:17:59]  FOLD  0:   0.9262
[17:17:59]  FOLD  1:   0.9234
[17:18:00]  FOLD  2:   0.9278
[17:18:00]  FOLD  3:   0.9286
[17:18:00]  FOLD  4:   0.9294

[17:18:01]  AVERAGE:   [33m0.9271[0m ± 0.0021



0.9273066435323275

In [17]:
# Label-encoded features + logistic regression
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:18:01]  LogisticRegression

[17:18:01]  FOLD  0:   0.8426
[17:18:01]  FOLD  1:   0.8506
[17:18:01]  FOLD  2:   0.8550
[17:18:01]  FOLD  3:   0.8565
[17:18:01]  FOLD  4:   0.8646

[17:18:01]  AVERAGE:   [33m0.8539[0m ± 0.0072



0.8509506114060406

## Frequency Encoder

In [18]:
# Categorical branch: object columns -> FrequencyEncoder -> SimpleImputer
# (presumably the imputer covers values the encoder could not map — confirm).
category_branch = make_pipeline(
    TypeSelector("object"),
    FrequencyEncoder(),
    SimpleImputer(),
)

# Numeric branch: standardised numeric columns.
numeric_branch = make_pipeline(
    TypeSelector(np.number),
    PandasTransformer(StandardScaler()),
)

prep_pipe = FeatureUnion([
    ("category", category_branch),
    ("numeric", numeric_branch),
])

# Fit on train, reuse the fitted pipeline for test.
X_train = prep_pipe.fit_transform(train)
X_test = prep_pipe.transform(test)

In [19]:
# Frequency encoding was done in prep_pipe; the fold-level step stays
# Identity() (presumably a pass-through — TODO confirm).
fold_pipe = Identity()

# Preview the frequency-encoded design matrix
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.042246,0.164461,0.328092,0.122729,0.255060,0.854274,0.669205,0.912190,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.082701,0.164461,0.459937,0.132365,0.405178,0.854274,0.669205,0.912190,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,0.738682,0.322502,0.136452,0.044599,0.255060,0.854274,0.669205,0.912190,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,0.738682,0.036086,0.459937,0.044599,0.405178,0.095943,0.669205,0.912190,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,0.738682,0.164461,0.459937,0.134774,0.048156,0.095943,0.330795,0.002971,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.738682,0.032769,0.459937,0.030210,0.048156,0.854274,0.330795,0.912190,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.738682,0.322502,0.459937,0.065174,0.405178,0.854274,0.669205,0.912190,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,0.738682,0.322502,0.030497,0.122729,0.105832,0.854274,0.330795,0.912190,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,0.738682,0.322502,0.328092,0.122729,0.155646,0.854274,0.669205,0.912190,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [20]:
# Frequency-encoded features + LightGBM
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:18:02]  LGBMClassifier

[17:18:02]  FOLD  0:   0.9257
[17:18:02]  FOLD  1:   0.9235
[17:18:03]  FOLD  2:   0.9282
[17:18:03]  FOLD  3:   0.9294
[17:18:03]  FOLD  4:   0.9294

[17:18:03]  AVERAGE:   [33m0.9273[0m ± 0.0023



0.9275420538333397

In [21]:
# Frequency-encoded features + logistic regression
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:18:04]  LogisticRegression

[17:18:04]  FOLD  0:   0.8782
[17:18:04]  FOLD  1:   0.8758
[17:18:04]  FOLD  2:   0.8847
[17:18:04]  FOLD  3:   0.8848
[17:18:04]  FOLD  4:   0.8870

[17:18:04]  AVERAGE:   [33m0.8821[0m ± 0.0043



0.883590395485542

## SVD Encoder

In [22]:
# Categorical branch: object columns -> SimpleImputer('mode') -> SVDEncoder.
# The preview below shows "_svdN" components per pair of categorical columns.
# NOTE(review): the meaning of the positional 0.99 is not visible here — confirm.
category_branch = make_pipeline(
    TypeSelector("object"),
    SimpleImputer('mode'),
    SVDEncoder(0.99),
)

# Numeric branch: standardised numeric columns.
numeric_branch = make_pipeline(
    TypeSelector(np.number),
    PandasTransformer(StandardScaler()),
)

prep_pipe = FeatureUnion([
    ("category", category_branch),
    ("numeric", numeric_branch),
])

# Fit on train, reuse the fitted pipeline for test.
X_train = prep_pipe.fit_transform(train)
X_test = prep_pipe.transform(test)

In [23]:
# SVD encoding was done in prep_pipe; the fold-level step stays Identity()
# (presumably a pass-through — TODO confirm).
fold_pipe = Identity()

# Preview the SVD-encoded design matrix
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,"(workclass,education)_svd1","(workclass,education)_svd2","(workclass,education)_svd3","(workclass,education)_svd4","(workclass,marital-status)_svd1","(workclass,marital-status)_svd2","(workclass,marital-status)_svd3","(workclass,occupation)_svd1","(workclass,occupation)_svd2","(workclass,occupation)_svd3",...,"(native-country,relationship)_svd2","(native-country,race)_svd1","(native-country,race)_svd2","(native-country,sex)_svd1",age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,-0.044457,-0.462690,0.323057,0.490931,-0.052138,0.064967,0.271657,-0.054514,-0.406027,0.256636,...,-0.022181,-0.999686,-0.000088,-0.999634,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,-0.098856,-0.087365,-0.897789,0.107221,-0.109943,0.817447,-0.180208,-0.097113,0.531386,0.734411,...,-0.022181,-0.999686,-0.000088,-0.999634,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,-0.989940,0.107946,0.081374,-0.023666,-0.987102,-0.145197,-0.062052,-0.988299,0.024462,-0.147787,...,-0.022181,-0.999686,-0.000088,-0.999634,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,-0.989940,0.107946,0.081374,-0.023666,-0.987102,-0.145197,-0.062052,-0.988299,0.024462,-0.147787,...,-0.022181,-0.999686,-0.000088,-0.999634,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,-0.989940,0.107946,0.081374,-0.023666,-0.987102,-0.145197,-0.062052,-0.988299,0.024462,-0.147787,...,0.066011,-0.003428,0.005474,-0.003017,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-0.989940,0.107946,0.081374,-0.023666,-0.987102,-0.145197,-0.062052,-0.988299,0.024462,-0.147787,...,-0.022181,-0.999686,-0.000088,-0.999634,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,-0.989940,0.107946,0.081374,-0.023666,-0.987102,-0.145197,-0.062052,-0.988299,0.024462,-0.147787,...,-0.022181,-0.999686,-0.000088,-0.999634,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,-0.989940,0.107946,0.081374,-0.023666,-0.987102,-0.145197,-0.062052,-0.988299,0.024462,-0.147787,...,-0.022181,-0.999686,-0.000088,-0.999634,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,-0.989940,0.107946,0.081374,-0.023666,-0.987102,-0.145197,-0.062052,-0.988299,0.024462,-0.147787,...,-0.022181,-0.999686,-0.000088,-0.999634,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [24]:
# SVD-encoded features + LightGBM
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:18:05]  LGBMClassifier

[17:18:07]  FOLD  0:   0.9256
[17:18:08]  FOLD  1:   0.9251
[17:18:10]  FOLD  2:   0.9282
[17:18:11]  FOLD  3:   0.9287
[17:18:13]  FOLD  4:   0.9296

[17:18:13]  AVERAGE:   [33m0.9274[0m ± 0.0018



0.9275317663289563

In [25]:
# SVD-encoded features + logistic regression
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:18:13]  LogisticRegression

[17:18:16]  FOLD  0:   0.9031
[17:18:18]  FOLD  1:   0.8990
[17:18:21]  FOLD  2:   0.9064
[17:18:23]  FOLD  3:   0.9086
[17:18:26]  FOLD  4:   0.9110

[17:18:26]  AVERAGE:   [33m0.9056[0m ± 0.0042



0.9039062093243682

# Supervised Encoding

## Naive Bayes Encoder

In [26]:
# The fold pipeline in the next cell uses DummyEncoder, so import it here
# rather than relying on the import made in the One-Hot-Encoder section.
# OneHotEncoder stays imported for backward compatibility (no use of it is
# visible in these cells).
from dask_ml.preprocessing import DummyEncoder, OneHotEncoder

# Global preprocessing only converts types and scales; the supervised
# encoding itself is fitted inside each CV fold by fold_pipe (next cell).
prep_pipe = FeatureUnion([
    ("category", make_pipeline(
        TypeSelector("object"),
        Categorizer(),
    )),
    ("numeric", make_pipeline(
        TypeSelector(np.number),
        PandasTransformer(StandardScaler()),
    )),
])

# Fit on train, reuse the fitted pipeline for test.
X_train = prep_pipe.fit_transform(train)
X_test = prep_pipe.transform(test)

In [27]:
# Fold-level categorical branch: category-typed columns -> dummy columns ->
# NaiveBayesEncoder. It is fitted with the target, so it belongs inside the
# per-fold pipeline rather than in prep_pipe.
category_branch = make_pipeline(
    TypeSelector("category"),
    DummyEncoder(),
    NaiveBayesEncoder(),
)

# Numeric columns are only selected here (they were scaled in prep_pipe).
numeric_branch = make_pipeline(TypeSelector(np.number))

fold_pipe = FeatureUnion([
    ("category", category_branch),
    ("numeric", numeric_branch),
])

# Preview: encoder fitted on the whole training split with its target
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,education_11th,...,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.0,0.0,0.0,-0.000000,0.000000,0.000000,0.171882,-0.0,-0.0,-0.000000,...,0.0,0.027329,-0.0,0.0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.0,0.0,0.0,-0.000000,0.000000,0.231795,0.000000,-0.0,-0.0,-0.000000,...,0.0,0.027329,-0.0,0.0,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,0.0,0.0,0.0,-0.124881,0.000000,0.000000,0.000000,-0.0,-0.0,-0.000000,...,0.0,0.027329,-0.0,0.0,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,0.0,0.0,0.0,-0.124881,0.000000,0.000000,0.000000,-0.0,-0.0,-1.698886,...,0.0,0.027329,-0.0,0.0,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,0.0,0.0,0.0,-0.124881,0.000000,0.000000,0.000000,-0.0,-0.0,-0.000000,...,0.0,0.000000,-0.0,0.0,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.0,0.0,0.0,-0.124881,0.000000,0.000000,0.000000,-0.0,-0.0,-0.000000,...,0.0,0.027329,-0.0,0.0,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.0,0.0,0.0,-0.124881,0.000000,0.000000,0.000000,-0.0,-0.0,-0.000000,...,0.0,0.027329,-0.0,0.0,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,0.0,0.0,0.0,-0.124881,0.000000,0.000000,0.000000,-0.0,-0.0,-0.000000,...,0.0,0.027329,-0.0,0.0,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,0.0,0.0,0.0,-0.124881,0.000000,0.000000,0.000000,-0.0,-0.0,-0.000000,...,0.0,0.027329,-0.0,0.0,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [28]:
# Naive-Bayes-encoded features (fitted per fold) + LightGBM
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:18:26]  LGBMClassifier

[17:18:27]  FOLD  0:   0.9271
[17:18:28]  FOLD  1:   0.9243
[17:18:29]  FOLD  2:   0.9282
[17:18:30]  FOLD  3:   0.9292
[17:18:30]  FOLD  4:   0.9304

[17:18:31]  AVERAGE:   [33m0.9278[0m ± 0.0021



0.9279115571538825

In [29]:
# Naive-Bayes-encoded features (fitted per fold) + logistic regression
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:18:31]  LogisticRegression

[17:18:31]  FOLD  0:   0.9044
[17:18:32]  FOLD  1:   0.8995
[17:18:32]  FOLD  2:   0.9082
[17:18:33]  FOLD  3:   0.9102
[17:18:34]  FOLD  4:   0.9116

[17:18:34]  AVERAGE:   [33m0.9068[0m ± 0.0044



0.9054412848005677

## Target Encoder

In [30]:
# Categorical columns are only selected here and left as objects; the
# target-based encoding is applied later by the fold-level pipeline.
category_branch = make_pipeline(TypeSelector("object"))

# Numeric branch: standardised numeric columns.
numeric_branch = make_pipeline(
    TypeSelector(np.number),
    PandasTransformer(StandardScaler()),
)

prep_pipe = FeatureUnion([
    ("category", category_branch),
    ("numeric", numeric_branch),
])

# Fit on train, reuse the fitted pipeline for test.
X_train = prep_pipe.fit_transform(train)
X_test = prep_pipe.transform(test)

In [31]:
# Fold-level categorical branch: object columns -> FastEncoder ->
# SimpleImputer('median') (presumably fills values for categories the encoder
# has not seen — confirm). Fitted with the target, hence per fold.
category_branch = make_pipeline(
    TypeSelector("object"),
    FastEncoder(),
    SimpleImputer('median'),
)

# Numeric columns are only selected here (they were scaled in prep_pipe).
numeric_branch = make_pipeline(TypeSelector(np.number))

fold_pipe = FeatureUnion([
    ("category", category_branch),
    ("numeric", numeric_branch),
])

# Preview: encoder fitted on the whole training split with its target
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.271957,0.414753,0.045961,0.134483,0.103070,0.25586,0.305737,0.245835,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.284927,0.414753,0.446848,0.484014,0.448571,0.25586,0.305737,0.245835,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,0.218673,0.159509,0.104209,0.062774,0.103070,0.25586,0.305737,0.245835,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,0.218673,0.051064,0.446848,0.062774,0.448571,0.12388,0.305737,0.245835,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,0.218673,0.414753,0.446848,0.449034,0.475128,0.12388,0.109461,0.263158,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.218673,0.248360,0.446848,0.304957,0.475128,0.25586,0.109461,0.245835,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.218673,0.159509,0.446848,0.124875,0.448571,0.25586,0.305737,0.245835,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,0.218673,0.159509,0.085599,0.134483,0.063262,0.25586,0.109461,0.245835,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,0.218673,0.159509,0.045961,0.134483,0.013220,0.25586,0.305737,0.245835,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [32]:
# Target-encoded features (fitted per fold) + LightGBM
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:18:34]  LGBMClassifier

[17:18:35]  FOLD  0:   0.9260
[17:18:35]  FOLD  1:   0.9253
[17:18:36]  FOLD  2:   0.9291
[17:18:36]  FOLD  3:   0.9295
[17:18:37]  FOLD  4:   0.9295

[17:18:37]  AVERAGE:   [33m0.9279[0m ± 0.0018



0.9278804646355537

In [33]:
# Target-encoded features (fitted per fold) + logistic regression
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:18:37]  LogisticRegression

[17:18:38]  FOLD  0:   0.8996
[17:18:38]  FOLD  1:   0.8960
[17:18:38]  FOLD  2:   0.9038
[17:18:39]  FOLD  3:   0.9046
[17:18:39]  FOLD  4:   0.9069

[17:18:39]  AVERAGE:   [33m0.9022[0m ± 0.0039



0.903143637607185

## CatBoost Encoder

In [34]:
# Categorical columns are only selected here and left as objects; the
# CatBoost-style encoding is applied later by the fold-level pipeline.
category_branch = make_pipeline(TypeSelector("object"))

# Numeric branch: standardised numeric columns.
numeric_branch = make_pipeline(
    TypeSelector(np.number),
    PandasTransformer(StandardScaler()),
)

prep_pipe = FeatureUnion([
    ("category", category_branch),
    ("numeric", numeric_branch),
])

# Fit on train, reuse the fitted pipeline for test.
X_train = prep_pipe.fit_transform(train)
X_test = prep_pipe.transform(test)

In [35]:
# Fold-level categorical branch: object columns -> CatBoostEncoder.
# Fitted with the target, hence inside the per-fold pipeline.
category_branch = make_pipeline(
    TypeSelector("object"),
    CatBoostEncoder(),
)

# Numeric columns are only selected here (they were scaled in prep_pipe).
numeric_branch = make_pipeline(TypeSelector(np.number))

fold_pipe = FeatureUnion([
    ("category", category_branch),
    ("numeric", numeric_branch),
])

# Preview: encoder fitted on the whole training split with its target
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.240810,0.240810,0.240810,0.240810,0.240810,0.240810,0.240810,0.240810,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.240810,0.120405,0.240810,0.240810,0.240810,0.120405,0.120405,0.120405,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,0.240810,0.240810,0.240810,0.240810,0.120405,0.080270,0.080270,0.080270,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,0.120405,0.240810,0.120405,0.120405,0.120405,0.240810,0.060202,0.060202,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,0.080270,0.080270,0.080270,0.240810,0.240810,0.120405,0.240810,0.240810,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.218668,0.248586,0.446790,0.305216,0.474946,0.255833,0.109410,0.245808,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.218659,0.159387,0.446761,0.124496,0.448514,0.255824,0.305716,0.245800,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,0.218693,0.159467,0.085842,0.134582,0.063332,0.255851,0.109400,0.245826,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,0.218684,0.159452,0.045983,0.134547,0.013268,0.255842,0.305748,0.245817,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [36]:
# CatBoost-encoded features (fitted per fold) + LightGBM
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:18:40]  LGBMClassifier

[17:18:41]  FOLD  0:   0.9232
[17:18:42]  FOLD  1:   0.9239
[17:18:43]  FOLD  2:   0.9264
[17:18:45]  FOLD  3:   0.9273
[17:18:46]  FOLD  4:   0.9272

[17:18:46]  AVERAGE:   [33m0.9256[0m ± 0.0017



0.926483737274702

In [37]:
# CatBoost-encoded features (fitted per fold) + logistic regression
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:18:46]  LogisticRegression

[17:18:47]  FOLD  0:   0.8996
[17:18:48]  FOLD  1:   0.8959
[17:18:49]  FOLD  2:   0.9036
[17:18:49]  FOLD  3:   0.9044
[17:18:50]  FOLD  4:   0.9069

[17:18:50]  AVERAGE:   [33m0.9021[0m ± 0.0039



0.9029801875629508

# Supervised CV Encoding

In [38]:
# Categorical columns are only selected here and left as objects; the
# cross-validated WOE encoding is applied later by the fold-level pipeline.
category_branch = make_pipeline(TypeSelector("object"))

# Numeric branch: standardised numeric columns.
numeric_branch = make_pipeline(
    TypeSelector(np.number),
    PandasTransformer(StandardScaler()),
)

prep_pipe = FeatureUnion([
    ("category", category_branch),
    ("numeric", numeric_branch),
])

# Fit on train, reuse the fitted pipeline for test.
X_train = prep_pipe.fit_transform(train)
X_test = prep_pipe.transform(test)

In [39]:
# Fold-level categorical branch: object columns -> WOEEncoder wrapped in
# EncoderCV -> SimpleImputer('median').
# NOTE(review): EncoderCV presumably fits the wrapped encoder out-of-fold on
# the training rows — confirm against the robusta implementation.
category_branch = make_pipeline(
    TypeSelector("object"),
    EncoderCV(WOEEncoder()),
    SimpleImputer('median'),
)

# Numeric columns are only selected here (they were scaled in prep_pipe).
numeric_branch = make_pipeline(TypeSelector(np.number))

fold_pipe = FeatureUnion([
    ("category", category_branch),
    ("numeric", numeric_branch),
])

# Preview: encoder fitted on the whole training split with its target
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.144024,0.817827,-1.876766,-0.693200,-1.005290,0.079629,0.335856,0.029688,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.233246,0.817827,0.927974,1.077048,0.942769,0.079629,0.335856,0.029688,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,-0.130195,-0.546847,-0.984550,-1.560846,-1.005290,0.079629,0.335856,0.029688,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,-0.130195,-1.767360,0.927974,-1.560846,0.942769,-0.809320,0.335856,0.029688,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,-0.130195,0.817827,0.927974,0.937167,1.001236,-0.809320,-0.975219,0.337226,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-0.129680,0.054113,0.937160,0.355182,1.062637,0.080907,-0.949871,0.027629,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,-0.129680,-0.504769,0.937160,-0.813027,0.942426,0.080907,0.328067,0.027629,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,-0.129680,-0.504769,-1.285617,-0.724931,-1.585133,0.080907,-0.949871,0.027629,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,-0.129680,-0.504769,-1.833681,-0.724931,-3.101250,0.080907,0.328067,0.027629,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [40]:
# CV-wrapped WOE features (fitted per fold) + LightGBM
estimator = make_pipeline(fold_pipe, lgb)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:18:52]  LGBMClassifier

[17:18:53]  FOLD  0:   0.9257
[17:18:55]  FOLD  1:   0.9235
[17:18:58]  FOLD  2:   0.9293
[17:18:59]  FOLD  3:   0.9284
[17:19:01]  FOLD  4:   0.9296

[17:19:02]  AVERAGE:   [33m0.9273[0m ± 0.0023



0.9275041970717831

In [41]:
# CV-wrapped WOE features (fitted per fold) + logistic regression
estimator = make_pipeline(fold_pipe, logreg)

# Cross-validated scores on train, plus predictions for the test split
# (the first returned value is discarded).
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Test-split ROC AUC
roc_auc_score(y_test, y_pred)

[17:19:02]  LogisticRegression

[17:19:03]  FOLD  0:   0.8995
[17:19:05]  FOLD  1:   0.8968
[17:19:06]  FOLD  2:   0.9036
[17:19:07]  FOLD  3:   0.9053
[17:19:09]  FOLD  4:   0.9075

[17:19:09]  AVERAGE:   [33m0.9025[0m ± 0.0039



0.9032467844753196