# Import

In [1]:
# Basic
import pandas as pd
import numpy as np

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides any warnings raised while the models below are fitted.
warnings.simplefilter('ignore')

# Data
from catboost.datasets import adult

# ML Toolkit
# NOTE(review): the star imports hide which module provides LabelBinarizer, TypeSelector,
# KBinsDiscretizer, DummyEncoder, ThermometerEncoder, make_pipeline and crossval
# (presumably the robusta modules — confirm); explicit imports would make that visible.
from robusta.preprocessing import *
from robusta.pipeline import *
from robusta.crossval import *
from sklearn.metrics import *

# Model
from robusta.testing import get_estimator

%load_ext memory_profiler

Using TensorFlow backend.


# Data

In [2]:
TARGET = 'income'

# Adult dataset from catboost, unpacked into a train frame and a test frame
train, test = adult()

# Detach the label column from both frames (pop removes it from the frame in place)
y_train, y_test = train.pop(TARGET), test.pop(TARGET)

# Binarize the label: the encoder is fitted on the train labels and reused for test
lb = LabelBinarizer()
y_train, y_test = lb.fit_transform(y_train), lb.transform(y_test)

In [3]:
# Preview the training features; the TARGET column was popped off when the data was loaded
train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States
32557,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States
32558,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40.0,United-States
32559,22.0,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20.0,United-States


In [4]:
# Inspect the binarized training target
y_train

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: income, Length: 32561, dtype: uint8

# Task & Model

In [5]:
# Evaluation settings passed to every crossval call below:
# `cv` is the cross-validation setting (5), `scoring` the metric name (ROC AUC)
cv, scoring = 5, 'roc_auc'

In [6]:
# Model under test, looked up by name through robusta's testing helper
model_name = 'LogisticRegression'
estimator = get_estimator(model_name)

# Show the estimator together with its hyper-parameters
estimator

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# Baseline

In [7]:
# Baseline features: keep only the numeric columns, with no further preprocessing
numeric_selector = TypeSelector(np.number)
prep_pipe = make_pipeline(numeric_selector)

# Fit the pipeline on train first, then apply the fitted pipeline to test
X_train, X_test = prep_pipe.fit_transform(train), prep_pipe.transform(test)

X_train

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,39.0,77516.0,13.0,2174.0,0.0,40.0
1,50.0,83311.0,13.0,0.0,0.0,13.0
2,38.0,215646.0,9.0,0.0,0.0,40.0
3,53.0,234721.0,7.0,0.0,0.0,40.0
4,28.0,338409.0,13.0,0.0,0.0,40.0
...,...,...,...,...,...,...
32556,27.0,257302.0,12.0,0.0,0.0,38.0
32557,40.0,154374.0,9.0,0.0,0.0,40.0
32558,58.0,151910.0,9.0,0.0,0.0,40.0
32559,22.0,201490.0,9.0,0.0,0.0,20.0


In [8]:
# Cross-validate the estimator on the current X_train (raw numeric baseline features).
# The result is kept under a descriptive name so it can be compared with the other
# experiments; `_` is avoided because IPython reserves it for the last displayed output
# and stops updating it once user code assigns to it.
result_baseline = crossval(estimator, cv, X_train, y_train, scoring=scoring)

[04:34:28]  LogisticRegression

[04:34:28]  FOLD  0:   0.5852
[04:34:29]  FOLD  1:   0.6043
[04:34:29]  FOLD  2:   0.5825
[04:34:29]  FOLD  3:   0.5690
[04:34:29]  FOLD  4:   0.5891

[04:34:29]  AVERAGE:   0.5860 ± 0.0114



# One-Hot-Encoding

In [9]:
# One-hot features: numeric columns -> discretized into intervals -> one indicator column
# per interval (see the `age_(17.0, 19.0]`-style columns in the output).
# KBinsDiscretizer(20): presumably 20 bins per column — TODO confirm the positional argument.
onehot_steps = [
    TypeSelector(np.number),
    KBinsDiscretizer(20),
    DummyEncoder(),
]
prep_pipe = make_pipeline(*onehot_steps)

# NOTE(review): the pipeline is fitted on the whole training frame before cross-validation,
# so the bin edges are learned from rows that later serve as validation folds.
X_train, X_test = prep_pipe.fit_transform(train), prep_pipe.transform(test)

X_train

Unnamed: 0,"age_(17.0, 19.0]","age_(19.0, 22.0]","age_(22.0, 24.0]","age_(24.0, 26.0]","age_(26.0, 28.0]","age_(28.0, 30.0]","age_(30.0, 31.0]","age_(31.0, 33.0]","age_(33.0, 35.0]","age_(35.0, 37.0]",...,"hours-per-week_(18.0, 24.0]","hours-per-week_(24.0, 30.0]","hours-per-week_(30.0, 35.0]","hours-per-week_(35.0, 40.0]","hours-per-week_(40.0, 45.0]","hours-per-week_(45.0, 48.0]","hours-per-week_(48.0, 50.0]","hours-per-week_(50.0, 55.0]","hours-per-week_(55.0, 60.0]","hours-per-week_(60.0, 99.0]"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
32557,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
32558,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
32559,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [10]:
# Cross-validate the estimator on the current X_train (one-hot encoded bins).
# The result is kept under a descriptive name so it can be compared with the other
# experiments; `_` is avoided because IPython reserves it for the last displayed output
# and stops updating it once user code assigns to it.
result_onehot = crossval(estimator, cv, X_train, y_train, scoring=scoring)

[04:34:29]  LogisticRegression

[04:34:29]  FOLD  0:   0.8494
[04:34:29]  FOLD  1:   0.8470
[04:34:29]  FOLD  2:   0.8568
[04:34:29]  FOLD  3:   0.8554
[04:34:29]  FOLD  4:   0.8598

[04:34:30]  AVERAGE:   0.8537 ± 0.0047



# Thermometer Encoding

In [11]:
# Thermometer features: numeric columns -> discretized into intervals -> cumulative
# indicators (in the output, every interval up to the row's own interval is set to 1).
# KBinsDiscretizer(20): presumably 20 bins per column — TODO confirm the positional argument.
thermometer_steps = [
    TypeSelector(np.number),
    KBinsDiscretizer(20),
    ThermometerEncoder(),
]
prep_pipe = make_pipeline(*thermometer_steps)

# NOTE(review): the pipeline is fitted on the whole training frame before cross-validation,
# so the bin edges are learned from rows that later serve as validation folds.
X_train, X_test = prep_pipe.fit_transform(train), prep_pipe.transform(test)

X_train

Unnamed: 0,"age:(17.0, 19.0]","age:(19.0, 22.0]","age:(22.0, 24.0]","age:(24.0, 26.0]","age:(26.0, 28.0]","age:(28.0, 30.0]","age:(30.0, 31.0]","age:(31.0, 33.0]","age:(33.0, 35.0]","age:(35.0, 37.0]",...,"hours-per-week:(18.0, 24.0]","hours-per-week:(24.0, 30.0]","hours-per-week:(30.0, 35.0]","hours-per-week:(35.0, 40.0]","hours-per-week:(40.0, 45.0]","hours-per-week:(45.0, 48.0]","hours-per-week:(48.0, 50.0]","hours-per-week:(50.0, 55.0]","hours-per-week:(55.0, 60.0]","hours-per-week:(60.0, 99.0]"
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,0,0,0,0,0
4,1,1,1,1,1,0,0,0,0,0,...,1,1,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,1,1,1,1,1,0,0,0,0,0,...,1,1,1,1,0,0,0,0,0,0
32557,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,0,0,0,0,0
32558,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,0,0,0,0,0
32559,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [12]:
# Cross-validate the estimator on the current X_train (thermometer encoded bins).
# The result is kept under a descriptive name so it can be compared with the other
# experiments; `_` is avoided because IPython reserves it for the last displayed output
# and stops updating it once user code assigns to it.
result_thermometer = crossval(estimator, cv, X_train, y_train, scoring=scoring)

[04:34:30]  LogisticRegression

[04:34:31]  FOLD  0:   0.8497
[04:34:31]  FOLD  1:   0.8473
[04:34:31]  FOLD  2:   0.8570
[04:34:31]  FOLD  3:   0.8556
[04:34:31]  FOLD  4:   0.8600

[04:34:31]  AVERAGE:   0.8539 ± 0.0047

