Example of using [`vtreat`](https://github.com/WinVector/pyvtreat) inside a sklearn pipeline.

First we load packages/modules.

In [46]:
import pandas
import numpy
import numpy.random
import vtreat
import vtreat.util
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

Set our pseudorandom state.

In [2]:
# Seed numpy's global generator: make_data() draws from numpy.random,
# so this makes the generated data set repeatable.
numpy.random.seed(2019)

Build our training data.  It is designed to be a data set with categorical variables where common levels are informative and rare levels are not.  So choosing the rarity threshold at which levels get their own encoding can be useful.

In [28]:
def make_data(nrows,
              *,
              ncols=10,
              n_common_levels=5,
              n_rare_levels=10,
              rare_ratio=0.3,
              noise_magnitude=3.3,
              na_rate=0.1):
    """
    Build a synthetic binary-outcome data set of categorical columns.

    Every column is sampled from the same pool of levels: "common" levels
    (c_0, c_1, ...), each assigned an effect of -1 or +1 at random, and
    "rare" levels (r_0, r_1, ...) whose effect is 0.  The numeric outcome is
    the sum of the per-column effects plus scaled standard-normal noise; the
    returned outcome is whether that sum is positive.  All draws come from
    numpy's global pseudorandom state.

    :param nrows: number of rows to generate
    :param ncols: number of categorical columns (named x_0, x_1, ...)
    :param n_common_levels: number of signal-carrying levels (may be 0)
    :param n_rare_levels: number of no-signal levels (may be 0)
    :param rare_ratio: total sampling weight of all rare levels, relative to
        a total weight of 1.0 for all common levels (before normalization)
    :param noise_magnitude: multiplier applied to the standard-normal noise
    :param na_rate: probability that a cell is replaced by NaN; the
        replacement happens after the outcome is computed, so the outcome
        still reflects the original level.  Values <= 0 disable it.
    :return: tuple (d, y > 0) of the explanatory data frame and the boolean outcome
    :raises ValueError: if there are no levels at all to sample from
    """
    # build a system of more common levels, which have signal,
    # and rare levels, which do not have signal
    common_levels = ['c_' + str(i) for i in range(n_common_levels)]
    rare_levels = ['r_' + str(i) for i in range(n_rare_levels)]
    levels = common_levels + rare_levels
    if len(levels) < 1:
        raise ValueError(
            "make_data needs at least one level "
            "(n_common_levels + n_rare_levels must be positive)")
    # an empty group contributes no weights (instead of dividing by zero)
    common_weights = []
    if len(common_levels) > 0:
        common_weights = [1.0 / len(common_levels)] * len(common_levels)
    rare_weights = []
    if len(rare_levels) > 0:
        rare_weights = [rare_ratio / len(rare_levels)] * len(rare_levels)
    probs = numpy.asarray(common_weights + rare_weights)
    probs = probs / sum(probs)
    # common levels get a random -1/+1 effect, rare levels get no effect
    effects = numpy.random.choice(
        [-1, 1],
        size=len(common_levels),
        replace=True).tolist() + [0] * len(rare_levels)
    effects = dict(zip(levels, effects))
    # use this to populate up a data frame
    d = pandas.DataFrame({
        'x_' + str(i): numpy.random.choice(levels,
                                           size=nrows,
                                           replace=True,
                                           p=probs) for i in range(ncols)
    })
    # build y
    y = noise_magnitude * numpy.random.normal(size=nrows)
    for i in range(ncols):
        y = y + d[d.columns[i]].map(effects)
    # introduce some NaNs (after y is built, so y is unaffected by them)
    if na_rate > 0:
        for i in range(ncols):
            idx = numpy.where(
                numpy.random.choice([False, True],
                                    size=nrows,
                                    replace=True,
                                    p=[1 - na_rate, na_rate]))[0]
            if len(idx) > 0:
                d.loc[idx, d.columns[i]] = numpy.nan
    return d, y > 0

# 500 synthetic rows: d_x is the frame of categorical columns,
# d_y is the boolean outcome (make_data returns y > 0).
d_x, d_y = make_data(500)

In [29]:
d_x.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9
0,c_0,r_7,c_0,c_4,c_2,r_8,c_3,c_3,c_2,r_5
1,c_0,c_0,,r_8,,c_0,,c_4,r_4,c_2
2,c_1,c_0,c_2,c_4,c_2,r_8,r_9,r_5,,c_1
3,r_7,c_2,c_2,c_2,r_8,c_3,c_2,c_3,,r_0
4,c_0,r_3,r_1,c_3,c_1,c_2,c_0,c_2,r_2,c_3


In [None]:
d_y.head()

In [None]:
# vtreat treatment for a two-class outcome; d_y is boolean, and True is the target class.
# NOTE(review): 'indicator_min_fraction' is presumably the minimum level frequency
# for a level to get its own indicator column -- confirm against the vtreat docs.
transform = vtreat.BinomialOutcomeTreatment(
    params={'filter_to_recommended': False, 'indicator_min_fraction': 0.01},
    outcome_target=True,
)

# Two-step pipeline: vtreat variable preparation feeding a logistic regression.
clf = Pipeline(steps=[
    ('preprocessor', transform),
    ('classifier', LogisticRegression(solver='lbfgs')),
])

# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    d_x,
    d_y,
    test_size=0.2,
)

Set up a [cross-validated grid search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) for hyperparameters, including the indicator coding strategy.

In [None]:
# Candidate hyperparameters; '<step>__<param>' keys address a parameter
# of the named pipeline step.
parameters = {
    'preprocessor__indicator_min_fraction': [0.01, 0.1],
    'classifier__C': [0.1, 1, 10],
}

# NOTE(review): in the output recorded in this notebook, fitting this search raised
# "TypeError: __init__() got an unexpected keyword argument 'indicator_min_fraction'".
# Presumably sklearn could not rebuild the vtreat step from its get_params() --
# confirm with the installed vtreat version.
cgm = GridSearchCV(estimator=clf, param_grid=parameters)

In [None]:
cgm.fit(X_train, y_train)

In [None]:
clf.fit(X_train, y_train)

In [None]:
transform.score_frame_

In [None]:
sum(clf.predict(X_test))

In [None]:
clf.score(X_test, y_test)

In [None]:
sum(y_test)

In [None]:
# Score on the same rows the pipeline was fit on; the recorded run of this cell
# shows a vtreat warning ("possibly called transform on same data used to fit").
clf.score(X_train, y_train)

The above fit is an over-fit (not achievable without data leakage). Notice vtreat gave us a warning.

In [None]:
print(clf)

In [None]:
print(transform.get_feature_names())

In [None]:
print(transform.get_params())


In [None]:
print(clf.get_params())


0    False
1    False
2     True
3     True
4     True
dtype: bool

In [47]:
# vtreat treatment for a two-class outcome; d_y is boolean, and True is the target class.
# NOTE(review): 'indicator_min_fraction' is presumably the minimum level frequency
# for a level to get its own indicator column -- confirm against the vtreat docs.
transform = vtreat.BinomialOutcomeTreatment(
    params={'filter_to_recommended': False, 'indicator_min_fraction': 0.01},
    outcome_target=True,
)

# Two-step pipeline: vtreat variable preparation feeding a logistic regression.
clf = Pipeline(steps=[
    ('preprocessor', transform),
    ('classifier', LogisticRegression(solver='lbfgs')),
])

# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    d_x,
    d_y,
    test_size=0.2,
)

Set up a [cross-validated grid search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) for hyperparameters, including the indicator coding strategy.

In [49]:
# Candidate hyperparameters; '<step>__<param>' keys address a parameter
# of the named pipeline step.
parameters = {
    'preprocessor__indicator_min_fraction': [0.01, 0.1],
    'classifier__C': [0.1, 1, 10],
}

# NOTE(review): in the output recorded in this notebook, fitting this search raised
# "TypeError: __init__() got an unexpected keyword argument 'indicator_min_fraction'".
# Presumably sklearn could not rebuild the vtreat step from its get_params() --
# confirm with the installed vtreat version.
cgm = GridSearchCV(estimator=clf, param_grid=parameters)

In [50]:
cgm.fit(X_train, y_train)

TypeError: __init__() got an unexpected keyword argument 'indicator_min_fraction'

In [None]:
clf.fit(X_train, y_train)

In [41]:
transform.score_frame_

Unnamed: 0,variable,orig_variable,treatment,y_aware,has_range,PearsonR,R2,significance,vcount,default_threshold,recommended
0,x_0_is_bad,x_0,missing_indicator,False,True,-0.002085,0.000005,0.966685,10.0,0.025000,False
1,x_1_is_bad,x_1,missing_indicator,False,True,0.056129,0.003484,0.282761,10.0,0.025000,False
2,x_2_is_bad,x_2,missing_indicator,False,True,0.009422,0.000105,0.851933,10.0,0.025000,False
3,x_3_is_bad,x_3,missing_indicator,False,True,0.093783,0.009219,0.080603,10.0,0.025000,False
4,x_4_is_bad,x_4,missing_indicator,False,True,-0.027012,0.000934,0.578109,10.0,0.025000,False
...,...,...,...,...,...,...,...,...,...,...,...
183,x_9_lev_r_8,x_9,indicator_code,False,True,0.007594,0.000067,0.881575,158.0,0.001582,False
184,x_9_lev_r_3,x_9,indicator_code,False,True,0.007594,0.000067,0.881575,158.0,0.001582,False
185,x_9_lev_r_4,x_9,indicator_code,False,True,0.017574,0.000341,0.736983,158.0,0.001582,False
186,x_9_lev_r_1,x_9,indicator_code,False,True,-0.041389,0.003805,0.261654,158.0,0.001582,False


In [42]:
sum(clf.predict(X_test))

7

In [43]:
clf.score(X_test, y_test)

0.91

In [44]:
sum(y_test)

8

In [45]:
# Score on the same rows the pipeline was fit on; the recorded output below
# shows a vtreat warning ("possibly called transform on same data used to fit").
clf.score(X_train, y_train)

(this causes over-fit, please use fit_transform() instead)
  "possibly called transform on same data used to fit\n" +


0.8825

The above fit is an over-fit (not achievable without data leakage). Notice vtreat gave us a warning.

In [None]:
print(clf)

In [37]:
print(transform.get_feature_names())

['x_0_is_bad', 'x_1_is_bad', 'x_2_is_bad', 'x_3_is_bad', 'x_4_is_bad', 'x_5_is_bad', 'x_6_is_bad', 'x_7_is_bad', 'x_8_is_bad', 'x_9_is_bad', 'x_0_logit_code', 'x_0_prevalence_code', 'x_0_lev_c_3', 'x_0_lev_c_2', 'x_0_lev_c_0', 'x_0_lev_c_1', 'x_0_lev_c_4', 'x_0_lev__NA_', 'x_1_logit_code', 'x_1_prevalence_code', 'x_1_lev_c_0', 'x_1_lev_c_3', 'x_1_lev_c_2', 'x_1_lev_c_4', 'x_1_lev_c_1', 'x_2_logit_code', 'x_2_prevalence_code', 'x_2_lev_c_1', 'x_2_lev_c_2', 'x_2_lev_c_4', 'x_2_lev_c_3', 'x_3_logit_code', 'x_3_prevalence_code', 'x_3_lev_c_2', 'x_3_lev_c_3', 'x_3_lev_c_0', 'x_3_lev_c_1', 'x_3_lev_c_4', 'x_4_logit_code', 'x_4_prevalence_code', 'x_4_lev_c_3', 'x_4_lev_c_4', 'x_4_lev_c_1', 'x_4_lev_c_0', 'x_4_lev_c_2', 'x_5_logit_code', 'x_5_prevalence_code', 'x_5_lev_c_3', 'x_5_lev_c_2', 'x_5_lev_c_1', 'x_5_lev_c_0', 'x_5_lev__NA_', 'x_5_lev_c_4', 'x_6_logit_code', 'x_6_prevalence_code', 'x_6_lev_c_1', 'x_6_lev_c_3', 'x_6_lev_c_0', 'x_6_lev_c_2', 'x_6_lev_c_4', 'x_7_logit_code', 'x_7_prevale

In [38]:
print(transform.get_params())


{'indicator_min_fraction': 0.1}


In [39]:
print(clf.get_params())


{'memory': None, 'steps': [('preprocessor', vtreat.vtreat_api.BinomialOutcomeTreatment(outcome_target=True, )), ('classifier', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False))], 'verbose': False, 'preprocessor': vtreat.vtreat_api.BinomialOutcomeTreatment(outcome_target=True, ), 'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False), 'preprocessor__indicator_min_fraction': 0.1, 'classifier__C': 1.0, 'classifier__class_weight': None, '