Translating the R sequences from https://arxiv.org/abs/1611.09477 into Python vtreat https://github.com/WinVector/pyvtreat

Note: for these small examples it is not deterministic whether the impact/logit codes show up, as they can often get an unlucky cross-validation split.

R original

Python translation

In [28]:
## ----VOpsSimpleDataFrameD------------------------------------------------

In [29]:
import pandas
import numpy
import vtreat # https://github.com/WinVector/pyvtreat

# Small example frame used by every cell below: a categorical input `x` and a
# numeric input `z` (each with one missing value), plus a boolean outcome `y`.
# Missing values are spelled `numpy.nan`; the `numpy.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
d = pandas.DataFrame({
    'x': ['a', 'a', 'b', 'b', numpy.nan],
    'z': [0, 1, 2, numpy.nan, 4],
    'y': [True, True, False, True, True],
})
# Float (0.0 / 1.0) copy of the outcome, used by the numeric-outcome examples.
d['yN'] = numpy.asarray(d["y"], dtype=float)
d

Unnamed: 0,x,z,y,yN
0,a,0.0,True,1.0
1,a,1.0,True,1.0
2,b,2.0,False,0.0
3,b,,True,1.0
4,,4.0,True,1.0


In [30]:
# Check the column dtypes; in the recorded output `z` is float64 (it holds a NaN)
# and the derived outcome `yN` is float64.
d.dtypes

x      object
z     float64
y        bool
yN    float64
dtype: object

In [31]:
## ----VTypesN1, results='hide'--------------------------------------------

In [32]:
# Design a treatment plan for the numeric outcome `yN` from inputs `x` and `z`.
# NOTE(review): filter_to_recommended=False presumably keeps variables that are
# not flagged `recommended` in the score frame -- confirm against vtreat docs.
treatments = vtreat.NumericOutcomeTreatment(
    outcome_name='yN',
    params=vtreat.vtreat_parameters({'filter_to_recommended': False}),
)
# fit hands back the treatment object, which is what the cell displays.
treatments.fit(d[['x', 'z']], d['yN'])

<vtreat.NumericOutcomeTreatment at 0x1a197e4630>

In [33]:
## ----VTypesN1s-----------------------------------------------------------

In [34]:
# Per-variable report from the fitted plan: one row per derived variable, with
# its source column, treatment type, significance, and `recommended` flag.
treatments.score_frame_

Unnamed: 0,variable,orig_variable,treatment,y_aware,has_range,PearsonR,significance,vcount,recommended
0,x_is_bad,x,missing_indicator,False,True,0.25,0.685038,2.0,False
1,z_is_bad,z,missing_indicator,False,True,0.25,0.685038,2.0,False
2,z,z,clean_copy,False,True,-0.094491,0.879869,1.0,True
3,x_impact_code,x,impact_code,True,False,,1.0,1.0,False
4,x_deviance_code,x,deviance_code,True,False,,1.0,1.0,False
5,x_prevalence_code,x,prevalence_code,False,True,-0.25,0.685038,1.0,True
6,x_lev_a,x,indicator_code,False,True,0.408248,0.495025,3.0,False
7,x_lev_b,x,indicator_code,False,True,-0.612372,0.272228,3.0,True
8,x_lev__NA_,x,indicator_code,False,True,0.25,0.685038,3.0,False


In [35]:
## ----VTypesN1p-----------------------------------------------------------

In [36]:
# Apply the fitted numeric-outcome plan to the whole frame `d` and show the result.
# In the recorded output the outcome column `yN` is carried through, while
# x_impact_code / x_deviance_code (has_range False in the score frame) are absent.
dTreated = treatments.transform(d)
dTreated

Unnamed: 0,yN,x_is_bad,z_is_bad,z,x_prevalence_code,x_lev_a,x_lev_b,x_lev__NA_
0,1.0,0.0,0.0,0.0,0.2,1,0,0
1,1.0,0.0,0.0,1.0,0.2,1,0,0
2,0.0,0.0,0.0,2.0,0.2,0,1,0
3,1.0,0.0,1.0,1.75,0.2,0,1,0
4,1.0,1.0,0.0,4.0,0.0,0,0,1


R original

Python translation

In [37]:
## ----VTypesC1, results='hide'--------------------------------------------

In [38]:
# Design a treatment plan for the boolean outcome `y`, treating True as the
# target class, from inputs `x` and `z`.
# NOTE(review): filter_to_recommended=False presumably keeps variables that are
# not flagged `recommended` in the score frame -- confirm against vtreat docs.
treatments = vtreat.BinomialOutcomeTreatment(
    outcome_name='y',
    outcome_target=True,
    params=vtreat.vtreat_parameters({'filter_to_recommended': False}),
)
# fit hands back the treatment object, which is what the cell displays.
treatments.fit(d[['x', 'z']], d['y'])

<vtreat.BinomialOutcomeTreatment at 0x1a197fd048>

In [39]:
## ----VTypesC1s-----------------------------------------------------------

In [40]:
# Score frame for the binomial plan; in the recorded output the y-aware variable
# is `x_logit_code` (has_range False, not recommended).
treatments.score_frame_

Unnamed: 0,variable,orig_variable,treatment,y_aware,has_range,PearsonR,significance,vcount,recommended
0,x_is_bad,x,missing_indicator,False,True,0.25,0.685038,2.0,False
1,z_is_bad,z,missing_indicator,False,True,0.25,0.685038,2.0,False
2,z,z,clean_copy,False,True,-0.094491,0.879869,1.0,True
3,x_logit_code,x,logit_code,True,False,,1.0,1.0,False
4,x_prevalence_code,x,prevalence_code,False,True,-0.25,0.685038,1.0,True
5,x_lev_a,x,indicator_code,False,True,0.408248,0.495025,3.0,False
6,x_lev_b,x,indicator_code,False,True,-0.612372,0.272228,3.0,True
7,x_lev__NA_,x,indicator_code,False,True,0.25,0.685038,3.0,False


In [41]:
## ----VTypesC1p-----------------------------------------------------------

In [42]:
# Apply the fitted binomial plan to the whole frame `d` and show the result.
# In the recorded output the outcome column `y` is carried through and
# `x_logit_code` does not appear.
dTreated = treatments.transform(d)
dTreated

Unnamed: 0,y,x_is_bad,z_is_bad,z,x_prevalence_code,x_lev_a,x_lev_b,x_lev__NA_
0,True,0.0,0.0,0.0,0.2,1,0,0
1,True,0.0,0.0,1.0,0.2,1,0,0
2,False,0.0,0.0,2.0,0.2,0,1,0
3,True,0.0,1.0,1.75,0.2,0,1,0
4,True,1.0,0.0,4.0,0.0,0,0,1


R original

Python translation

In [43]:
## ----VTypesZ1, results='hide'--------------------------------------------

In [44]:
# Outcome-free treatment: only the input columns are passed to fit.
treatments = vtreat.UnsupervisedTreatment()
# fit hands back the treatment object, which is what the cell displays.
treatments.fit(d[['x', 'z']])

<vtreat.UnsupervisedTreatment at 0x1a19806c18>

In [45]:
## ----VTypesZ1p-----------------------------------------------------------

In [46]:
# Apply the unsupervised plan to `d`; in the recorded output only columns
# derived from `x` and `z` appear (no `y` / `yN`).
dTreated = treatments.transform(d)
dTreated

Unnamed: 0,x_is_bad,z_is_bad,z,x_prevalence_code,x_lev_a,x_lev_b,x_lev__NA_
0,0.0,0.0,0.0,0.2,1,0,0
1,0.0,0.0,1.0,0.2,1,0,0
2,0.0,0.0,2.0,0.2,0,1,0
3,0.0,1.0,1.75,0.2,0,1,0
4,1.0,0.0,4.0,0.0,0,0,1


R original

Python translation

In [47]:
# Numeric-outcome plan again, this time fitting and preparing the training
# frame in one call.
# NOTE(review): per the note at the top of this notebook, the y-aware codes
# depend on a cross-validation split, so whether they appear can vary by run.
treatments = vtreat.NumericOutcomeTreatment(
    outcome_name='yN',
    params=vtreat.vtreat_parameters({'filter_to_recommended': False}),
)
# fit_transform returns the prepared frame, which is what the cell displays.
treatments.fit_transform(d[['x', 'z']], d['yN'])

Unnamed: 0,x_is_bad,z_is_bad,z,x_prevalence_code,x_lev_a,x_lev_b,x_lev__NA_
0,0.0,0.0,0.0,0.2,1,0,0
1,0.0,0.0,1.0,0.2,1,0,0
2,0.0,0.0,2.0,0.2,0,1,0
3,0.0,1.0,1.75,0.2,0,1,0
4,1.0,0.0,4.0,0.0,0,0,1


In [48]:
# Score frame produced by the fit_transform call on the numeric outcome `yN`.
treatments.score_frame_

Unnamed: 0,variable,orig_variable,treatment,y_aware,has_range,PearsonR,significance,vcount,recommended
0,x_is_bad,x,missing_indicator,False,True,0.25,0.685038,2.0,False
1,z_is_bad,z,missing_indicator,False,True,0.25,0.685038,2.0,False
2,z,z,clean_copy,False,True,-0.094491,0.879869,1.0,True
3,x_impact_code,x,impact_code,True,False,,1.0,1.0,False
4,x_deviance_code,x,deviance_code,True,False,,1.0,1.0,False
5,x_prevalence_code,x,prevalence_code,False,True,-0.25,0.685038,1.0,True
6,x_lev_a,x,indicator_code,False,True,0.408248,0.495025,3.0,False
7,x_lev_b,x,indicator_code,False,True,-0.612372,0.272228,3.0,True
8,x_lev__NA_,x,indicator_code,False,True,0.25,0.685038,3.0,False


R original

Python translation

In [49]:
## ----VTypesCFN2, results='hide'------------------------------------------

In [50]:
# Binomial-outcome plan (target class True), fitting and preparing the training
# frame in one call.
# NOTE(review): per the note at the top of this notebook, the y-aware codes
# depend on a cross-validation split, so whether they appear can vary by run.
treatments = vtreat.BinomialOutcomeTreatment(
    outcome_name='y',
    outcome_target=True,
    params=vtreat.vtreat_parameters({'filter_to_recommended': False}),
)
# fit_transform returns the prepared frame, which is what the cell displays.
treatments.fit_transform(d[['x', 'z']], d['y'])

Unnamed: 0,x_is_bad,z_is_bad,z,x_prevalence_code,x_lev_a,x_lev_b,x_lev__NA_
0,0.0,0.0,0.0,0.2,1,0,0
1,0.0,0.0,1.0,0.2,1,0,0
2,0.0,0.0,2.0,0.2,0,1,0
3,0.0,1.0,1.75,0.2,0,1,0
4,1.0,0.0,4.0,0.0,0,0,1


In [51]:
# Score frame produced by the fit_transform call on the boolean outcome `y`.
treatments.score_frame_

Unnamed: 0,variable,orig_variable,treatment,y_aware,has_range,PearsonR,significance,vcount,recommended
0,x_is_bad,x,missing_indicator,False,True,0.25,0.685038,2.0,False
1,z_is_bad,z,missing_indicator,False,True,0.25,0.685038,2.0,False
2,z,z,clean_copy,False,True,-0.094491,0.879869,1.0,True
3,x_logit_code,x,logit_code,True,False,,1.0,1.0,False
4,x_prevalence_code,x,prevalence_code,False,True,-0.25,0.685038,1.0,True
5,x_lev_a,x,indicator_code,False,True,0.408248,0.495025,3.0,False
6,x_lev_b,x,indicator_code,False,True,-0.612372,0.272228,3.0,True
7,x_lev__NA_,x,indicator_code,False,True,0.25,0.685038,3.0,False


R original

Python translation

In [52]:
import vtreat.cross_plan

# Cross-validation plan for 10 rows in 3 folds: the result is a list of dicts,
# each with 'train' and 'app' row-index lists that do not overlap (see output).
# NOTE(review): the indices look shuffled, so splits presumably differ between
# runs -- confirm.
vtreat.cross_plan.k_way_cross_plan(10, 3)

[{'train': [0, 1, 3, 7, 8, 9], 'app': [2, 4, 5, 6]},
 {'train': [2, 3, 4, 5, 6, 8, 9], 'app': [0, 1, 7]},
 {'train': [0, 1, 2, 4, 5, 6, 7], 'app': [3, 8, 9]}]

In [53]:
# Degenerate request (2 rows, 1 fold): the recorded plan has a single entry whose
# 'train' and 'app' sets are the same rows.
vtreat.cross_plan.k_way_cross_plan(2, 1)

[{'train': [0, 1], 'app': [0, 1]}]

In [54]:
# Degenerate request (1 row, 0 folds): the recorded plan still has one entry,
# training and applying on the single row.
vtreat.cross_plan.k_way_cross_plan(1, 0)

[{'train': [0], 'app': [0]}]

In [55]:
# Empty input (0 rows, 0 folds): the recorded plan is one entry with empty
# 'train' and 'app' lists rather than an error.
vtreat.cross_plan.k_way_cross_plan(0, 0)

[{'train': [], 'app': []}]

R original

Python translation

In [56]:
# We currently do not have a parallel option for the Python version of vtreat.