Translating the R sequences from https://arxiv.org/abs/1611.09477 into Python vtreat https://github.com/WinVector/pyvtreat

Note: for these small examples it is not deterministic whether the impact/logit codes show up, as they can often be affected by an unlucky cross-validation split.

R original

Python translation

In [1]:
## ----VOpsSimpleDataFrameD------------------------------------------------

In [2]:
import pandas
import numpy
import vtreat # https://github.com/WinVector/pyvtreat

# Small example frame: a categorical column `x` and a numeric column `z`,
# each with one missing value, plus a boolean outcome `y`.
# `numpy.nan` is used instead of the `numpy.NaN` alias, which was removed in
# NumPy 2.0 and raises AttributeError there.
d = pandas.DataFrame({
    'x': ['a', 'a', 'b', 'b', numpy.nan],
    'z': [0, 1, 2, numpy.nan, 4],
    'y': [True, True, False, True, True],
})
# Float (0.0/1.0) copy of the boolean outcome, used as the numeric target.
d['yN'] = numpy.asarray(d["y"], dtype=float)
d

Unnamed: 0,x,z,y,yN
0,a,0.0,True,1.0
1,a,1.0,True,1.0
2,b,2.0,False,0.0
3,b,,True,1.0
4,,4.0,True,1.0


In [3]:
# Show the column types of the example frame (x: object, z/yN: float64, y: bool).
d.dtypes

x      object
z     float64
y        bool
yN    float64
dtype: object

In [4]:
## ----VTypesN1, results='hide'--------------------------------------------

In [5]:
# Numeric-outcome treatment plan targeting `yN`. `filter_to_recommended` is
# switched off so that every derived variable is kept in the result.
treatments = vtreat.NumericOutcomeTreatment(
    outcome_name='yN',
    params=vtreat.vtreat_parameters(
        {'filter_to_recommended': False}
    ),
)
# Fit on the two input columns only; the outcome is passed separately.
treatments.fit(d[['x', 'z']], d['yN'])

<vtreat.NumericOutcomeTreatment at 0x1a1449d748>

In [6]:
## ----VTypesN1s-----------------------------------------------------------

In [7]:
# Display the fitted plan's score frame: one row per derived variable, with
# its treatment type, PearsonR, significance and `recommended` flag.
treatments.score_frame_

Unnamed: 0,variable,treatment,y_aware,has_range,PearsonR,significance,vcount,recommended
0,x_is_bad,missing_indicator,False,True,0.25,0.685038,2.0,False
1,z_is_bad,missing_indicator,False,True,0.25,0.685038,2.0,False
2,z,clean_copy,False,True,-0.094491,0.879869,1.0,False
3,x_impact_code,impact_code,True,True,0.468462,0.426133,1.0,False
4,x_deviance_code,deviance_code,True,True,-0.508002,0.382203,1.0,False
5,x_prevalence_code,prevalence_code,False,True,-0.25,0.685038,1.0,False
6,x_lev_b,indicator_code,False,True,-0.612372,0.272228,3.0,False
7,x_lev_a,indicator_code,False,True,0.408248,0.495025,3.0,False
8,x_lev__NA_,indicator_code,False,True,0.25,0.685038,3.0,False


In [8]:
## ----VTypesN1p-----------------------------------------------------------

In [9]:
# Apply the fitted numeric-outcome plan to the whole frame.
# Per the displayed output: the outcome column yN is carried through, and the
# missing z value is filled with 1.75 (the mean of the observed z values)
# while z_is_bad flags that row.
dTreated = treatments.transform(d)
dTreated

Unnamed: 0,yN,x_is_bad,z_is_bad,z,x_impact_code,x_deviance_code,x_prevalence_code,x_lev_b,x_lev_a,x_lev__NA_
0,1.0,0.0,0.0,0.0,0.1976562,0.031623,0.2,0,1,0
1,1.0,0.0,0.0,1.0,0.1976562,0.031623,0.2,0,1,0
2,0.0,0.0,0.0,2.0,-0.04322323,0.707814,0.2,1,0,0
3,1.0,0.0,1.0,1.75,-0.04322323,0.707814,0.2,1,0,0
4,1.0,1.0,0.0,4.0,-1.110223e-16,0.500999,0.0,0,0,1


R original

Python translation

In [10]:
## ----VTypesC1, results='hide'--------------------------------------------

In [11]:
# Binomial-outcome treatment plan: `y` is the outcome column and `True` is the
# target class. `filter_to_recommended` is switched off so that every derived
# variable is kept in the result.
treatments = vtreat.BinomialOutcomeTreatment(
    outcome_name='y',
    outcome_target=True,
    params=vtreat.vtreat_parameters(
        {'filter_to_recommended': False}
    ),
)
# Fit on the two input columns only; the outcome is passed separately.
treatments.fit(d[['x', 'z']], d['y'])

<vtreat.BinomialOutcomeTreatment at 0x1a145182e8>

In [12]:
## ----VTypesC1s-----------------------------------------------------------

In [13]:
# Display the score frame of the binomial plan; per the output, x gets a
# logit_code variable here rather than impact/deviance codes.
treatments.score_frame_

Unnamed: 0,variable,treatment,y_aware,has_range,PearsonR,significance,vcount,recommended
0,x_is_bad,missing_indicator,False,True,0.25,0.685038,2.0,False
1,z_is_bad,missing_indicator,False,True,0.25,0.685038,2.0,False
2,z,clean_copy,False,True,-0.094491,0.879869,1.0,False
3,x_logit_code,logit_code,True,True,0.475449,0.418291,1.0,False
4,x_prevalence_code,prevalence_code,False,True,-0.25,0.685038,1.0,False
5,x_lev_b,indicator_code,False,True,-0.612372,0.272228,3.0,False
6,x_lev_a,indicator_code,False,True,0.408248,0.495025,3.0,False
7,x_lev__NA_,indicator_code,False,True,0.25,0.685038,3.0,False


In [14]:
## ----VTypesC1p-----------------------------------------------------------

In [15]:
# Apply the fitted binomial-outcome plan to the whole frame.
# Per the displayed output: the boolean outcome column y is carried through
# alongside the derived variables.
dTreated = treatments.transform(d)
dTreated

Unnamed: 0,y,x_is_bad,z_is_bad,z,x_logit_code,x_prevalence_code,x_lev_b,x_lev_a,x_lev__NA_
0,True,0.0,0.0,0.0,0.2207971,0.2,0,1,0
1,True,0.0,0.0,1.0,0.2207971,0.2,0,1,0
2,False,0.0,0.0,2.0,-0.05554341,0.2,1,0,0
3,True,0.0,1.0,1.75,-0.05554341,0.2,1,0,0
4,True,1.0,0.0,4.0,-1.387779e-16,0.0,0,0,1


R original

Python translation

In [16]:
## ----VTypesZ1, results='hide'--------------------------------------------

In [17]:
# Outcome-free treatment plan: fitted from the input columns x and z alone,
# with no y passed to fit().
treatments = vtreat.UnsupervisedTreatment()
treatments.fit(d[['x', 'z']])

<vtreat.UnsupervisedTreatment at 0x1a14518630>

In [18]:
## ----VTypesZ1p-----------------------------------------------------------

In [19]:
# Apply the unsupervised plan to the whole frame.
# Per the displayed output: only variables derived from x and z appear, and no
# outcome-aware codes (impact/logit) are produced.
dTreated = treatments.transform(d)
dTreated

Unnamed: 0,x_is_bad,z_is_bad,z,x_prevalence_code,x_lev_b,x_lev_a,x_lev__NA_
0,0.0,0.0,0.0,0.2,0,1,0
1,0.0,0.0,1.0,0.2,0,1,0
2,0.0,0.0,2.0,0.2,1,0,0
3,0.0,1.0,1.75,0.2,1,0,0
4,1.0,0.0,4.0,0.0,0,0,1


R original

Python translation

In [20]:
# Numeric-outcome plan again, this time fitted and applied in a single
# fit_transform() call. `filter_to_recommended` is off so that every derived
# variable is returned.
treatments = vtreat.NumericOutcomeTreatment(
    outcome_name='yN',
    params=vtreat.vtreat_parameters(
        {'filter_to_recommended': False}
    ),
)
# Only x and z are passed as inputs, so the displayed frame has no yN column.
treatments.fit_transform(d[['x', 'z']], d['yN'])

Unnamed: 0,x_is_bad,z_is_bad,z,x_impact_code,x_deviance_code,x_prevalence_code,x_lev_b,x_lev_a,x_lev__NA_
0,0.0,0.0,0.0,0.1976562,0.031623,0.2,0,1,0
1,0.0,0.0,1.0,0.1976562,0.031623,0.2,0,1,0
2,0.0,0.0,2.0,-0.04322323,0.707814,0.2,1,0,0
3,0.0,1.0,1.75,-0.04322323,0.707814,0.2,1,0,0
4,1.0,0.0,4.0,-1.110223e-16,0.500999,0.0,0,0,1


In [21]:
# Display the score frame produced by the fit_transform() call above.
treatments.score_frame_

Unnamed: 0,variable,treatment,y_aware,has_range,PearsonR,significance,vcount,recommended
0,x_is_bad,missing_indicator,False,True,0.25,0.685038,2.0,False
1,z_is_bad,missing_indicator,False,True,0.25,0.685038,2.0,False
2,z,clean_copy,False,True,-0.094491,0.879869,1.0,False
3,x_impact_code,impact_code,True,True,0.468462,0.426133,1.0,False
4,x_deviance_code,deviance_code,True,True,-0.508002,0.382203,1.0,False
5,x_prevalence_code,prevalence_code,False,True,-0.25,0.685038,1.0,False
6,x_lev_b,indicator_code,False,True,-0.612372,0.272228,3.0,False
7,x_lev_a,indicator_code,False,True,0.408248,0.495025,3.0,False
8,x_lev__NA_,indicator_code,False,True,0.25,0.685038,3.0,False


R original

Python translation

In [22]:
## ----VTypesCFN2, results='hide'------------------------------------------

In [23]:
# Binomial-outcome plan (target class `True`), fitted and applied in a single
# fit_transform() call. `filter_to_recommended` is off so that every derived
# variable is returned.
treatments = vtreat.BinomialOutcomeTreatment(
    outcome_name='y',
    outcome_target=True,
    params=vtreat.vtreat_parameters(
        {'filter_to_recommended': False}
    ),
)
# Only x and z are passed as inputs, so the displayed frame has no y column.
treatments.fit_transform(d[['x', 'z']], d['y'])

Unnamed: 0,x_is_bad,z_is_bad,z,x_logit_code,x_prevalence_code,x_lev_b,x_lev_a,x_lev__NA_
0,0.0,0.0,0.0,0.2207971,0.2,0,1,0
1,0.0,0.0,1.0,0.2207971,0.2,0,1,0
2,0.0,0.0,2.0,-0.05554341,0.2,1,0,0
3,0.0,1.0,1.75,-0.05554341,0.2,1,0,0
4,1.0,0.0,4.0,-1.387779e-16,0.0,0,0,1


In [24]:
# Display the score frame produced by the binomial fit_transform() call above.
treatments.score_frame_

Unnamed: 0,variable,treatment,y_aware,has_range,PearsonR,significance,vcount,recommended
0,x_is_bad,missing_indicator,False,True,0.25,0.685038,2.0,False
1,z_is_bad,missing_indicator,False,True,0.25,0.685038,2.0,False
2,z,clean_copy,False,True,-0.094491,0.879869,1.0,False
3,x_logit_code,logit_code,True,True,0.475449,0.418291,1.0,False
4,x_prevalence_code,prevalence_code,False,True,-0.25,0.685038,1.0,False
5,x_lev_b,indicator_code,False,True,-0.612372,0.272228,3.0,False
6,x_lev_a,indicator_code,False,True,0.408248,0.495025,3.0,False
7,x_lev__NA_,indicator_code,False,True,0.25,0.685038,3.0,False


R original

Python translation

In [25]:
import vtreat.util

# Build a 3-way cross plan over 10 rows: a list with one dict per fold, each
# holding 'train' and 'app' row-index lists. In the displayed output the 'app'
# lists are disjoint and together cover rows 0-9.
# NOTE(review): the index assignment looks randomized — confirm before relying
# on the exact output being reproducible.
vtreat.util.k_way_cross_plan(10, 3)

[{'train': [0, 2, 3, 4, 6, 8], 'app': [1, 5, 7, 9]},
 {'train': [1, 2, 3, 5, 6, 7, 9], 'app': [0, 4, 8]},
 {'train': [0, 1, 4, 5, 7, 8, 9], 'app': [2, 3, 6]}]

In [26]:
# Degenerate case, one fold over 2 rows: per the output, every row is used
# for both 'train' and 'app'.
vtreat.util.k_way_cross_plan(2, 1)

[{'train': [0, 1], 'app': [0, 1]}]

In [27]:
# Degenerate case, 1 row and 0 requested folds: per the output, a single plan
# entry is still returned with row 0 in both 'train' and 'app'.
vtreat.util.k_way_cross_plan(1, 0)

[{'train': [0], 'app': [0]}]

In [28]:
# Degenerate case, no rows: per the output, a single plan entry with empty
# 'train' and 'app' lists is returned.
vtreat.util.k_way_cross_plan(0, 0)

[{'train': [], 'app': []}]

R original

Python translation

In [29]:
# We currently do not have a parallel option for the Python version of vtreat.