# Import

In [1]:
# Basic
import pandas as pd
import numpy as np

import warnings

# Silences every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the ML libraries below.
warnings.simplefilter('ignore')

# ML Toolkit
# The names used later in this notebook (TypeSelector, Categorizer, TargetEncoder,
# FeatureUnion, make_pipeline, crossval_predict, roc_auc_score) are not imported
# explicitly anywhere, so they are expected to arrive through these star imports.
from robusta.preprocessing import *
from robusta.pipeline import *
from robusta.crossval import *
from sklearn.metrics import *

# Model
from lightgbm import LGBMClassifier

# Loads the memory_profiler IPython extension.
%load_ext memory_profiler

Using TensorFlow backend.


# Data

In [2]:
# Name of the label column; every lookup below goes through this constant.
TARGET = 'income'

from catboost.datasets import adult
from sklearn.preprocessing import LabelBinarizer

# adult() returns the train and test frames as a pair.
train, test = adult()

# Pull the label column out of each split.
labels_train = train[TARGET]
labels_test = test[TARGET]

# Feature frames: everything except the label column.
X_train = train.drop(columns=TARGET)
X_test = test.drop(columns=TARGET)

# Binarization: map each label string to its integer category code.
# NOTE(review): the codes are derived independently for each split, so they only
# line up if both splits yield the same sorted label set - TODO confirm.
y_train = labels_train.astype('category').cat.codes
y_test  = labels_test.astype('category').cat.codes

# Drop the raw frames and label series; only X_*/y_* are used from here on.
del train, test, labels_train, labels_test

In [3]:
# Display the training feature frame (the recorded output shows a truncated view).
X_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States
32557,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States
32558,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40.0,United-States
32559,22.0,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20.0,United-States


In [4]:
# Display the integer-coded training labels.
y_train

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Length: 32561, dtype: int8

# Inbuilt Encoder

In [41]:
# Classifier with library-default hyperparameters.
model = LGBMClassifier()

# Single-branch feature union: select the "object" columns, then run them
# through Categorizer. No other branch is present, so the recorded output
# contains only the eight string-valued columns.
fold_pipe = FeatureUnion(
    [("category", make_pipeline(TypeSelector("object"), Categorizer()))]
)

# Full estimator: preprocessing followed by the classifier.
estimator = make_pipeline(fold_pipe, model)

# Fit the preprocessing on the training data and show its output as a sanity check.
fold_pipe.fit_transform(X_train, y_train)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States


In [40]:
# Cross-validation settings.
cv, scoring = 5, 'roc_auc'

# Cross-validate `estimator` on the training split and predict probabilities
# for X_test; only the second returned value is kept.
# NOTE(review): this cell's execution count (40) is lower than that of the
# pipeline cell above it (41), so the recorded log may come from an earlier
# definition of `estimator` - re-run top to bottom to confirm.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# ROC AUC of the X_test predictions against the test labels.
roc_auc_score(y_test, y_pred)

[08:14:19]  LGBMClassifier

[08:14:20]  FOLD  0:   0.8784
[08:14:21]  FOLD  1:   0.8718
[08:14:22]  FOLD  2:   0.8789
[08:14:23]  FOLD  3:   0.8856
[08:14:24]  FOLD  4:   0.8816

[08:14:24]  AVERAGE:   [33m0.8792[0m ± 0.0045



0.8793269776629424

# Target Encoder

In [53]:
from robusta.preprocessing.category import EncoderCV

# Classifier with library-default hyperparameters.
model = LGBMClassifier()

# Single-branch feature union: select the "object" columns, then encode them
# with TargetEncoder wrapped in EncoderCV.
fold_pipe = FeatureUnion(
    [("category", make_pipeline(TypeSelector("object"), EncoderCV(TargetEncoder())))]
)

# Full estimator: preprocessing followed by the classifier.
estimator = make_pipeline(fold_pipe, model)

# NOTE(review): the recorded run of this cell ended with
# "ValueError: Internal error. Please save traceback and inform developers."
# The cause is not visible from this notebook - investigate before relying on
# any result built on this pipeline.
fold_pipe.fit_transform(X_train, y_train)

ValueError: Internal error. Please save traceback and inform developers.

In [50]:
# Cross-validation settings.
cv, scoring = 5, 'roc_auc'

# Cross-validate `estimator` on the training split and predict probabilities
# for X_test; only the second returned value is kept.
# NOTE(review): this cell's execution count (50) precedes the pipeline cell
# above it (53), whose recorded run raised ValueError. The logged scores were
# therefore produced with whatever `estimator` existed at execution 50 -
# re-run top to bottom to confirm.
_, y_pred = crossval_predict(
    estimator, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# ROC AUC of the X_test predictions against the test labels.
roc_auc_score(y_test, y_pred)

[08:16:12]  LGBMClassifier

[08:16:13]  FOLD  0:   0.8767
[08:16:14]  FOLD  1:   0.8716
[08:16:15]  FOLD  2:   0.8781
[08:16:16]  FOLD  3:   0.8850
[08:16:17]  FOLD  4:   0.8806

[08:16:17]  AVERAGE:   [33m0.8784[0m ± 0.0044



0.8793923618625485