# Imports

In [1]:
# Basic
import pandas as pd
import numpy as np
import warnings

warnings.simplefilter('ignore')

# ML Toolkit
from robusta.crossval import *
from robusta.pipeline import *
from robusta.preprocessing import *
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score

# Data
from catboost.datasets import adult

# Model
from lightgbm import LGBMClassifier

Using TensorFlow backend.


# Data

In [2]:
# Name of the label column; every access below goes through this constant.
TARGET = 'income'

# Load the Adult dataset bundled with catboost as (train, test) DataFrames.
train, test = adult()

# Target
# `pop` removes the label column from each frame and returns it, leaving
# `train` / `test` with feature columns only.
labels_train = train.pop(TARGET)
labels_test = test.pop(TARGET)

# Target Binarization
# Encode BOTH splits against the category list learned from train, so a given
# label string always maps to the same integer code in y_train and y_test.
# Encoding each split on its own only lines up when the two splits happen to
# contain an identical set of labels.
target_dtype = pd.api.types.CategoricalDtype(
    labels_train.astype('category').cat.categories
)

# Fail loudly instead of silently producing -1 codes for labels that never
# occurred in train (missing labels are left alone and still encode as -1).
unseen_labels = set(labels_test.dropna().unique()) - set(target_dtype.categories)
if unseen_labels:
    raise ValueError(
        f"test contains {TARGET!r} labels not present in train: {sorted(unseen_labels)}"
    )

y_train = labels_train.astype(target_dtype).cat.codes
y_test = labels_test.astype(target_dtype).cat.codes

del labels_train, labels_test

In [3]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States
4,18.0,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States


# Preprocessing

In [4]:
# Branch for columns selected by TypeSelector("object"), which are then
# passed through Categorizer (presumably string -> categorical; confirm
# against the robusta docs).
category_branch = make_pipeline(
    TypeSelector("object"),
    Categorizer(),
)

# Branch for columns selected by TypeSelector(np.number); no further steps.
numeric_branch = make_pipeline(
    TypeSelector(np.number),
)

# Union of both branches, registered under the names "category" and "numeric".
prep_pipe = FeatureUnion([
    ("category", category_branch),
    ("numeric", numeric_branch),
])

# Fit the preprocessing on train only, then reuse the fitted union on test.
X_train = prep_pipe.fit_transform(train)
X_test = prep_pipe.transform(test)

# Cross-validation schemes

## Simple Holdout

In [5]:
# Fixed seed: without it StratifiedShuffleSplit draws a different 80/20 split
# on every run, so the fold score and the test AUC below would not be
# reproducible after Restart & Run All.
SEED = 42

# Single stratified shuffle split holding out 20% of the training rows.
cv = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
model = LGBMClassifier()
scoring = 'roc_auc'

# crossval_predict returns two values; the first is discarded and the second
# is kept as the prediction for X_new (the test features).
_, y_pred = crossval_predict(model, cv, X_train, y_train, X_new=X_test,
                             scoring=scoring, method='predict_proba',
                             verbose=2, n_jobs=None)

# ROC AUC of the test predictions against the true test labels.
roc_auc_score(y_test, y_pred)

[20:23:34]  LGBMClassifier

[20:23:35]  FOLD  0:   0.9265

[20:23:35]  AVERAGE:   0.9265 ± 0.0000



0.9266619076504115

## Adversarial Validation

In [9]:
# Predictive model scored by crossval_predict below. It is created before
# anything in this cell uses it, so the cell no longer relies on a `model`
# object left over from a previously executed cell.
model = LGBMClassifier()
scoring = 'roc_auc'

# The adversarial splitter gets its own fresh classifier, so the estimator
# used to measure train/test similarity is never the same object as the
# predictive model.
cv = make_adversarial_validation(LGBMClassifier(), X_train, X_test, test_size=0.2)

# crossval_predict returns two values; the first is discarded and the second
# is kept as the prediction for X_new (the test features).
_, y_pred = crossval_predict(model, cv, X_train, y_train, X_new=X_test,
                             scoring=scoring, method='predict_proba',
                             verbose=2, n_jobs=None)

# ROC AUC of the test predictions against the true test labels.
roc_auc_score(y_test, y_pred)

[20:24:05]  LGBMClassifier

[20:24:05]  FOLD  0:   0.9328

[20:24:06]  AVERAGE:   0.9328 ± 0.0000



0.9260441555579393

In [11]:
# Print the built-in help text for the current `cv` object.
help(cv)

Help on AdversarialValidation in module robusta.crossval.schemes object:

class AdversarialValidation(builtins.object)
 |  Adversarial Validation
 |  
 |  Holdout split by the train/test similarity. Inner ``classifier`` must be
 |  already fitted to the concatenated dataset with binary target, where 1 means
 |  test set and 0 – train set. Provides list with single train/oof indices,
 |  where oof – subset of size ``test_size`` with maximum class 1 probability.
 |  
 |  Parameters
 |  ----------
 |  classifier : estimator object
 |      Fitted estimator for train/test similarity measurement.
 |      Class 1 for test set and 0 for train.
 |  
 |  train_size : float, int, or None (default=None)
 |      If float, should be between 0.0 and 1.0 and represent the
 |      proportion of the dataset to include in the train split. If
 |      int, represents the absolute number of train samples. If None,
 |      the value is automatically set to the complement of the test size.
 |  
 |  test_size 

## KFold(5)

In [7]:
# An integer `cv` is handed straight to crossval_predict; how it is expanded
# into folds is decided inside robusta (the log below reports five folds).
cv = 5
model = LGBMClassifier()
scoring = 'roc_auc'

# Keyword arguments for the cross-validated prediction, gathered in one place.
predict_kwargs = dict(
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=None,
)

# Keep only the second returned value: the prediction for the test features.
_, y_pred = crossval_predict(model, cv, X_train, y_train, **predict_kwargs)

# ROC AUC of the test predictions against the true test labels.
roc_auc_score(y_test, y_pred)

[20:23:36]  LGBMClassifier

[20:23:37]  FOLD  0:   0.9258
[20:23:37]  FOLD  1:   0.9245
[20:23:37]  FOLD  2:   0.9299
[20:23:38]  FOLD  3:   0.9295
[20:23:38]  FOLD  4:   0.9307

[20:23:39]  AVERAGE:   0.9281 ± 0.0024



0.9280748399216228