diff --git a/DESCRIPTION b/DESCRIPTION index 5179ab0a..5a330052 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: vtreat Type: Package Title: A Statistically Sound 'data.frame' Processor/Conditioner -Version: 1.4.5 -Date: 2019-09-11 +Version: 1.4.6 +Date: 2019-09-22 Authors@R: c( person("John", "Mount", email = "jmount@win-vector.com", role = c("aut", "cre")), person("Nina", "Zumel", email = "nzumel@win-vector.com", role = c("aut")), diff --git a/README.md b/README.md index c26e0401..ae1698c1 100644 --- a/README.md +++ b/README.md @@ -274,7 +274,7 @@ Trivial example: ``` r library("vtreat") packageVersion("vtreat") - # [1] '1.4.5' + # [1] '1.4.6' citation('vtreat') # # To cite package 'vtreat' in publications use: @@ -304,14 +304,14 @@ dTestC <- data.frame(x=c('a', 'b', 'c', NA), z=c(10, 20, 30, NA)) treatmentsC <- designTreatmentsC(dTrainC, colnames(dTrainC), 'y', TRUE, verbose=FALSE) print(treatmentsC$scoreFrame[, c('origName', 'varName', 'code', 'rsq', 'sig', 'extraModelDegrees')]) - # origName varName code rsq sig extraModelDegrees - # 1 x x_catP catP 1.030137e-01 0.32099590 2 - # 2 x x_catB catB 1.125399e-05 0.99172381 2 - # 3 z z clean 2.376018e-01 0.13176020 0 - # 4 z z_isBAD isBAD 2.960654e-01 0.09248399 0 - # 5 x x_lev_NA lev 2.960654e-01 0.09248399 0 - # 6 x x_lev_x_a lev 1.300057e-01 0.26490379 0 - # 7 x x_lev_x_b lev 6.067337e-03 0.80967242 0 + # origName varName code rsq sig extraModelDegrees + # 1 x x_catP catP 0.111456141 0.30194137 2 + # 2 x x_catB catB 0.033761011 0.56994212 2 + # 3 z z clean 0.237601767 0.13176020 0 + # 4 z z_isBAD isBAD 0.296065432 0.09248399 0 + # 5 x x_lev_NA lev 0.296065432 0.09248399 0 + # 6 x x_lev_x_a lev 0.130005705 0.26490379 0 + # 7 x x_lev_x_b lev 0.006067337 0.80967242 0 # help("prepare") @@ -349,9 +349,9 @@ treatmentsN = designTreatmentsN(dTrainN, colnames(dTrainN), 'y', verbose=FALSE) print(treatmentsN$scoreFrame[, c('origName', 'varName', 'code', 'rsq', 'sig', 'extraModelDegrees')]) # origName 
varName code rsq sig extraModelDegrees - # 1 x x_catP catP 2.105263e-01 0.2528101 2 - # 2 x x_catN catN 3.205128e-03 0.8940756 2 - # 3 x x_catD catD 6.666667e-02 0.5369633 2 + # 1 x x_catP catP 2.197309e-01 0.2413478 2 + # 2 x x_catN catN 7.286735e-02 0.5179131 2 + # 3 x x_catD catD 2.227248e-01 0.2377286 2 # 4 z z clean 2.880952e-01 0.1701892 0 # 5 z z_isBAD isBAD 3.333333e-01 0.1339746 0 # 6 x x_lev_NA lev 3.333333e-01 0.1339746 0 diff --git a/cran-comments.md b/cran-comments.md index 14c670c2..1bb18b9a 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -6,34 +6,34 @@ ### OSX - R CMD check --as-cran vtreat_1.4.5.tar.gz + R CMD check --as-cran vtreat_1.4.6.tar.gz * using R version 3.6.0 (2019-04-26) * using platform: x86_64-apple-darwin15.6.0 (64-bit) * using session charset: UTF-8 * using option ‘--as-cran’ * checking for file ‘vtreat/DESCRIPTION’ ... OK * checking extension type ... Package - * this is package ‘vtreat’ version ‘1.4.5’ + * this is package ‘vtreat’ version ‘1.4.6’ * checking CRAN incoming feasibility ... Note_to_CRAN_maintainers Maintainer: ‘John Mount ’ Status: OK + ### Windows rhub::check_for_cran() - 595#> * using R Under development (unstable) (2019-08-30 r77101) - 596#> * using platform: x86_64-w64-mingw32 (64-bit) - 597#> * using session charset: ISO8859-1 - 598#> * using option '--as-cran' - 599#> * checking for file 'vtreat/DESCRIPTION' ... OK - 600#> * checking extension type ... Package - 601#> * this is package 'vtreat' version '1.4.5' - 602#> * checking CRAN incoming feasibility ... Note_to_CRAN_maintainers - 603#> Maintainer: 'John Mount ' - 645#> * checking sizes of PDF files under 'inst/doc' ... 
NOTE - 646#> Unable to find GhostScript executable to run checks on size reduction - 660#> * DONE - 661#> Status: 1 NOTE + 623#> * using R Under development (unstable) (2019-09-18 r77193) + 624#> * using platform: x86_64-w64-mingw32 (64-bit) + 625#> * using session charset: ISO8859-1 + 626#> * using option '--as-cran' + 627#> * checking for file 'vtreat/DESCRIPTION' ... OK + 628#> * checking extension type ... Package + 629#> * this is package 'vtreat' version '1.4.6' + 630#> * checking CRAN incoming feasibility ... Note_to_CRAN_maintainers + 631#> Maintainer: 'John Mount ' + 673#> * checking sizes of PDF files under 'inst/doc' ... NOTE + 674#> Unable to find GhostScript executable to run checks on size reduction + 689#> Status: 1 NOTE GhostScript NOTE is a property of the test environment, not the package. ## Downstream dependencies diff --git a/docs/CONTRIBUTING.html b/docs/CONTRIBUTING.html index 87ba787c..17569226 100644 --- a/docs/CONTRIBUTING.html +++ b/docs/CONTRIBUTING.html @@ -64,7 +64,7 @@ vtreat - 1.4.5 + 1.4.6 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 125a151f..5b1e5b96 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -64,7 +64,7 @@ vtreat - 1.4.5 + 1.4.6 diff --git a/docs/articles/MultiClassVtreat.html b/docs/articles/MultiClassVtreat.html index ad446153..1900c60d 100644 --- a/docs/articles/MultiClassVtreat.html +++ b/docs/articles/MultiClassVtreat.html @@ -30,7 +30,7 @@ vtreat - 1.4.5 + 1.4.6 @@ -113,7 +113,7 @@

Multi Class vtreat

John Mount

-

2019-09-11

+

2019-09-22

Source: vignettes/MultiClassVtreat.Rmd diff --git a/docs/articles/SavingTreamentPlans.html b/docs/articles/SavingTreamentPlans.html index ed3fbe76..0fd4a5e5 100644 --- a/docs/articles/SavingTreamentPlans.html +++ b/docs/articles/SavingTreamentPlans.html @@ -30,7 +30,7 @@ vtreat - 1.4.5 + 1.4.6 @@ -113,7 +113,7 @@

Saving Treatment Plans

John Mount

-

2019-09-11

+

2019-09-22

Source: vignettes/SavingTreamentPlans.Rmd diff --git a/docs/articles/VariableImportance.html b/docs/articles/VariableImportance.html index c9c788f5..4b32b388 100644 --- a/docs/articles/VariableImportance.html +++ b/docs/articles/VariableImportance.html @@ -30,7 +30,7 @@ vtreat - 1.4.5 + 1.4.6 @@ -113,7 +113,7 @@

vtreat Variable Importance

John Mount

-

2019-09-11

+

2019-09-22

Source: vignettes/VariableImportance.Rmd @@ -142,9 +142,9 @@

2019-09-11

d, varlist = c("x", "x_noise"), outcomename = "y") -
## [1] "vtreat 1.4.5 start initial treatment design Wed Sep 11 08:19:33 2019"
-## [1] " start cross frame work Wed Sep 11 08:19:33 2019"
-## [1] " vtreat::mkCrossFrameNExperiment done Wed Sep 11 08:19:33 2019"
+
## [1] "vtreat 1.4.6 start initial treatment design Sun Sep 22 15:17:05 2019"
+## [1] " start cross frame work Sun Sep 22 15:17:05 2019"
+## [1] " vtreat::mkCrossFrameNExperiment done Sun Sep 22 15:17:05 2019"
sf <- cfe$treatments$scoreFrame
 knitr::kable(sf[, c("varName", "rsq", "sig")])
diff --git a/docs/articles/index.html b/docs/articles/index.html index 054e7725..d697ceea 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -64,7 +64,7 @@ vtreat - 1.4.5 + 1.4.6 diff --git a/docs/articles/vtreat.html b/docs/articles/vtreat.html index cd6d9230..829a5777 100644 --- a/docs/articles/vtreat.html +++ b/docs/articles/vtreat.html @@ -30,7 +30,7 @@ vtreat - 1.4.5 + 1.4.6 @@ -113,7 +113,7 @@

vtreat package

John Mount, Nina Zumel

-

2019-09-11

+

2019-09-22

Source: vignettes/vtreat.Rmd @@ -208,7 +208,7 @@

2019-09-11

Trivial example:

library("vtreat")
 packageVersion("vtreat")
-
## [1] '1.4.5'
+
## [1] '1.4.6'
citation('vtreat')
## 
 ## To cite package 'vtreat' in publications use:
@@ -237,14 +237,14 @@ 

2019-09-11

treatmentsC <- designTreatmentsC(dTrainC, colnames(dTrainC), 'y', TRUE, verbose=FALSE) print(treatmentsC$scoreFrame[, c('origName', 'varName', 'code', 'rsq', 'sig', 'extraModelDegrees')])
-
##   origName   varName  code         rsq        sig extraModelDegrees
-## 1        x    x_catP  catP 0.060049677 0.44862725                 2
-## 2        x    x_catB  catB 0.127625394 0.26932340                 2
-## 3        z         z clean 0.237601767 0.13176020                 0
-## 4        z   z_isBAD isBAD 0.296065432 0.09248399                 0
-## 5        x  x_lev_NA   lev 0.296065432 0.09248399                 0
-## 6        x x_lev_x_a   lev 0.130005705 0.26490379                 0
-## 7        x x_lev_x_b   lev 0.006067337 0.80967242                 0
+
##   origName   varName  code          rsq        sig extraModelDegrees
+## 1        x    x_catP  catP 1.559780e-01 0.22202097                 2
+## 2        x    x_catB  catB 1.142159e-05 0.99166241                 2
+## 3        z         z clean 2.376018e-01 0.13176020                 0
+## 4        z   z_isBAD isBAD 2.960654e-01 0.09248399                 0
+## 5        x  x_lev_NA   lev 2.960654e-01 0.09248399                 0
+## 6        x x_lev_x_a   lev 1.300057e-01 0.26490379                 0
+## 7        x x_lev_x_b   lev 6.067337e-03 0.80967242                 0
# help("prepare")
 
 dTrainCTreated <- prepare(treatmentsC, dTrainC, pruneSig=1.0, scale=TRUE)
@@ -278,9 +278,9 @@ 

2019-09-11

verbose=FALSE) print(treatmentsN$scoreFrame[, c('origName', 'varName', 'code', 'rsq', 'sig', 'extraModelDegrees')])
##   origName   varName  code          rsq       sig extraModelDegrees
-## 1        x    x_catP  catP 2.105263e-01 0.2528101                 2
-## 2        x    x_catN  catN 4.310345e-03 0.8772535                 2
-## 3        x    x_catD  catD 2.302479e-01 0.2288609                 2
+## 1        x    x_catP  catP 1.764706e-01 0.3001022                 2
+## 2        x    x_catN  catN 3.533569e-03 0.8888051                 2
+## 3        x    x_catD  catD 3.743113e-01 0.1069707                 2
 ## 4        z         z clean 2.880952e-01 0.1701892                 0
 ## 5        z   z_isBAD isBAD 3.333333e-01 0.1339746                 0
 ## 6        x  x_lev_NA   lev 3.333333e-01 0.1339746                 0
diff --git a/docs/articles/vtreatCrossFrames.html b/docs/articles/vtreatCrossFrames.html
index 149ecaea..3eeaed0d 100644
--- a/docs/articles/vtreatCrossFrames.html
+++ b/docs/articles/vtreatCrossFrames.html
@@ -30,7 +30,7 @@
       
       
         vtreat
-        1.4.5
+        1.4.6
       
     
 
@@ -113,7 +113,7 @@
       

vtreat cross frames

John Mount, Nina Zumel

-

2019-09-11

+

2019-09-22

Source: vignettes/vtreatCrossFrames.Rmd @@ -166,13 +166,13 @@

'y',TRUE, rareCount=0 # Note: usually want rareCount>0, setting to zero to illustrate problem )

-
## [1] "vtreat 1.4.5 inspecting inputs Wed Sep 11 08:19:38 2019"
-## [1] "designing treatments Wed Sep 11 08:19:38 2019"
-## [1] " have initial level statistics Wed Sep 11 08:19:38 2019"
-## [1] " scoring treatments Wed Sep 11 08:19:39 2019"
-## [1] "have treatment plan Wed Sep 11 08:19:39 2019"
-## [1] "rescoring complex variables Wed Sep 11 08:19:39 2019"
-## [1] "done rescoring complex variables Wed Sep 11 08:19:39 2019"
+
## [1] "vtreat 1.4.6 inspecting inputs Sun Sep 22 15:17:10 2019"
+## [1] "designing treatments Sun Sep 22 15:17:10 2019"
+## [1] " have initial level statistics Sun Sep 22 15:17:10 2019"
+## [1] " scoring treatments Sun Sep 22 15:17:11 2019"
+## [1] "have treatment plan Sun Sep 22 15:17:11 2019"
+## [1] "rescoring complex variables Sun Sep 22 15:17:11 2019"
+## [1] "done rescoring complex variables Sun Sep 22 15:17:11 2019"
dTrainTreated <- vtreat::prepare(treatments,dTrain,
   pruneSig=c() # Note: usually want pruneSig to be a small fraction, setting to null to illustrate problems
 )
@@ -250,13 +250,13 @@ 

'y',TRUE, rareCount=0 # Note: usually want rareCount>0, setting to zero to illustrate problem )

-
## [1] "vtreat 1.4.5 inspecting inputs Wed Sep 11 08:19:39 2019"
-## [1] "designing treatments Wed Sep 11 08:19:39 2019"
-## [1] " have initial level statistics Wed Sep 11 08:19:39 2019"
-## [1] " scoring treatments Wed Sep 11 08:19:39 2019"
-## [1] "have treatment plan Wed Sep 11 08:19:39 2019"
-## [1] "rescoring complex variables Wed Sep 11 08:19:39 2019"
-## [1] "done rescoring complex variables Wed Sep 11 08:19:39 2019"
+
## [1] "vtreat 1.4.6 inspecting inputs Sun Sep 22 15:17:11 2019"
+## [1] "designing treatments Sun Sep 22 15:17:11 2019"
+## [1] " have initial level statistics Sun Sep 22 15:17:11 2019"
+## [1] " scoring treatments Sun Sep 22 15:17:11 2019"
+## [1] "have treatment plan Sun Sep 22 15:17:11 2019"
+## [1] "rescoring complex variables Sun Sep 22 15:17:11 2019"
+## [1] "done rescoring complex variables Sun Sep 22 15:17:11 2019"
dTrainTreated <- vtreat::prepare(treatments,dTrain,
   pruneSig=pruneSig)
 newvars <- setdiff(colnames(dTrainTreated),'y')
@@ -361,9 +361,9 @@ 

'y',TRUE, rareCount=0 # Note: usually want rareCount>0, setting to zero to illustrate problems )

-
## [1] "vtreat 1.4.5 start initial treatment design Wed Sep 11 08:19:40 2019"
-## [1] " start cross frame work Wed Sep 11 08:19:40 2019"
-## [1] " vtreat::mkCrossFrameCExperiment done Wed Sep 11 08:19:41 2019"
+
## [1] "vtreat 1.4.6 start initial treatment design Sun Sep 22 15:17:12 2019"
+## [1] " start cross frame work Sun Sep 22 15:17:12 2019"
+## [1] " vtreat::mkCrossFrameCExperiment done Sun Sep 22 15:17:13 2019"
treatments <- prep$treatments
 
 knitr::kable(treatments$scoreFrame[,c('varName','sig')])
diff --git a/docs/articles/vtreatGrouping.html b/docs/articles/vtreatGrouping.html index 6a5e84ad..8d6f8aa5 100644 --- a/docs/articles/vtreatGrouping.html +++ b/docs/articles/vtreatGrouping.html @@ -30,7 +30,7 @@ vtreat - 1.4.5 + 1.4.6 @@ -113,7 +113,7 @@

Grouping Example

Nina Zumel, Nate Sutton

-

2019-09-11

+

2019-09-22

Source: vignettes/vtreatGrouping.Rmd diff --git a/docs/articles/vtreatOverfit.html b/docs/articles/vtreatOverfit.html index 8cd4532b..4f99afcc 100644 --- a/docs/articles/vtreatOverfit.html +++ b/docs/articles/vtreatOverfit.html @@ -30,7 +30,7 @@ vtreat - 1.4.5 + 1.4.6 @@ -113,7 +113,7 @@

vtreat overfit

John Mount, Nina Zumel

-

2019-09-11

+

2019-09-22

Source: vignettes/vtreatOverfit.Rmd @@ -145,13 +145,13 @@

treatments <- vtreat::designTreatmentsC(dTrain,'x','y',TRUE, rareCount=0 # Note: usually want rareCount>0, setting to zero to illustrate problem ) -
## [1] "vtreat 1.4.5 inspecting inputs Wed Sep 11 08:19:45 2019"
-## [1] "designing treatments Wed Sep 11 08:19:45 2019"
-## [1] " have initial level statistics Wed Sep 11 08:19:45 2019"
-## [1] " scoring treatments Wed Sep 11 08:19:45 2019"
-## [1] "have treatment plan Wed Sep 11 08:19:45 2019"
-## [1] "rescoring complex variables Wed Sep 11 08:19:45 2019"
-## [1] "done rescoring complex variables Wed Sep 11 08:19:45 2019"
+
## [1] "vtreat 1.4.6 inspecting inputs Sun Sep 22 15:17:17 2019"
+## [1] "designing treatments Sun Sep 22 15:17:17 2019"
+## [1] " have initial level statistics Sun Sep 22 15:17:17 2019"
+## [1] " scoring treatments Sun Sep 22 15:17:17 2019"
+## [1] "have treatment plan Sun Sep 22 15:17:17 2019"
+## [1] "rescoring complex variables Sun Sep 22 15:17:17 2019"
+## [1] "done rescoring complex variables Sun Sep 22 15:17:17 2019"
dTrainTreated <- vtreat::prepare(treatments,dTrain,
   pruneSig=c() # Note: usually want pruneSig to be a small fraction, setting to null to illustrate problem
 )
@@ -245,13 +245,13 @@ 

rareCount=0, # Note set this to something larger, like 5 rareSig=c() # Note set this to something like 0.3 )

-
## [1] "vtreat 1.4.5 inspecting inputs Wed Sep 11 08:19:45 2019"
-## [1] "designing treatments Wed Sep 11 08:19:45 2019"
-## [1] " have initial level statistics Wed Sep 11 08:19:45 2019"
-## [1] " scoring treatments Wed Sep 11 08:19:45 2019"
-## [1] "have treatment plan Wed Sep 11 08:19:45 2019"
-## [1] "rescoring complex variables Wed Sep 11 08:19:45 2019"
-## [1] "done rescoring complex variables Wed Sep 11 08:19:46 2019"
+
## [1] "vtreat 1.4.6 inspecting inputs Sun Sep 22 15:17:18 2019"
+## [1] "designing treatments Sun Sep 22 15:17:18 2019"
+## [1] " have initial level statistics Sun Sep 22 15:17:18 2019"
+## [1] " scoring treatments Sun Sep 22 15:17:18 2019"
+## [1] "have treatment plan Sun Sep 22 15:17:18 2019"
+## [1] "rescoring complex variables Sun Sep 22 15:17:18 2019"
+## [1] "done rescoring complex variables Sun Sep 22 15:17:18 2019"
dTrainTreated <- vtreat::prepare(treatments,dTrain,
                                  pruneSig=c() # Note: set this to filter, like 0.05 or 1/nvars
 )
@@ -320,9 +320,9 @@ 

xdat <- vtreat::mkCrossFrameCExperiment(dTrain,'x','y',TRUE, rareCount=0, # Note set this to something larger, like 5 rareSig=c())

-
## [1] "vtreat 1.4.5 start initial treatment design Wed Sep 11 08:19:46 2019"
-## [1] " start cross frame work Wed Sep 11 08:19:46 2019"
-## [1] " vtreat::mkCrossFrameCExperiment done Wed Sep 11 08:19:46 2019"
+
## [1] "vtreat 1.4.6 start initial treatment design Sun Sep 22 15:17:18 2019"
+## [1] " start cross frame work Sun Sep 22 15:17:18 2019"
+## [1] " vtreat::mkCrossFrameCExperiment done Sun Sep 22 15:17:18 2019"
treatments <- xdat$treatments
 print(treatments$scoreFrame)
##   varName varMoves          rsq          sig needsSplit extraModelDegrees
diff --git a/docs/articles/vtreatRareLevels.html b/docs/articles/vtreatRareLevels.html
index 7392407f..bf6d64c5 100644
--- a/docs/articles/vtreatRareLevels.html
+++ b/docs/articles/vtreatRareLevels.html
@@ -30,7 +30,7 @@
       
       
         vtreat
-        1.4.5
+        1.4.6
       
     
 
@@ -113,7 +113,7 @@
       

vtreat Rare Levels

John Mount

-

2019-09-11

+

2019-09-22

Source: vignettes/vtreatRareLevels.Rmd diff --git a/docs/articles/vtreatScaleMode.html b/docs/articles/vtreatScaleMode.html index a736e45c..9de17b8e 100644 --- a/docs/articles/vtreatScaleMode.html +++ b/docs/articles/vtreatScaleMode.html @@ -30,7 +30,7 @@ vtreat - 1.4.5 + 1.4.6 @@ -113,7 +113,7 @@

vtreat scale mode

Win-Vector LLC

-

2019-09-11

+

2019-09-22

Source: vignettes/vtreatScaleMode.Rmd @@ -183,12 +183,12 @@

2019-09-11

slopeFrame$badSlope <- ifelse(is.na(slopeFrame$slope), TRUE, abs(slopeFrame$slope - 1) > 1.e-8) print(slopeFrame)
-
##     varName          mean slope        sig badSlope
-## 1    x_catP  1.850372e-17     1 0.03392101    FALSE
-## 2    x_catB  1.387779e-17     1 1.00000000    FALSE
-## 3  x_lev_NA -6.938894e-18     1 0.20766228    FALSE
-## 4 x_lev_x_a  0.000000e+00     1 0.40972582    FALSE
-## 5 x_lev_x_b  0.000000e+00    NA 1.00000000     TRUE
+
##     varName          mean slope       sig badSlope
+## 1    x_catP  1.850372e-17     1 0.1547700    FALSE
+## 2    x_catB  1.387779e-17     1 0.5160763    FALSE
+## 3  x_lev_NA -6.938894e-18     1 0.2076623    FALSE
+## 4 x_lev_x_a  0.000000e+00     1 0.4097258    FALSE
+## 5 x_lev_x_b  0.000000e+00    NA 1.0000000     TRUE

The above claims are true with the exception of the derived variable x_lev_x_b. This is because the outcome variable y has an identical distribution when the original variable x==‘b’ and when x!=‘b’ (y is TRUE half the time in both cases). This means y is perfectly independent of x==‘b’ and the regression slope must be zero (thus, cannot be 1). vtreat now treats this as needing to scale by a multiplicative factor of zero. Note also that the significance level associated with x_lev_x_b is large, making this variable easy to prune. The varMoves and significance facts in treatmentsC$scoreFrame are about the un-scaled frame (where x_lev_x_b does in fact move).

For a good discussion of the application of y-aware scaling to Principal Components Analysis please see here.

Previous versions of vtreat (0.5.22 and earlier) would copy variables that could not be sensibly scaled into the treated frame unaltered. This was considered the “most faithful” thing to do. However, we now feel that this practice was not safe for many downstream procedures, such as principal components analysis and geometric clustering.

@@ -263,9 +263,9 @@

cEraw <- vtreat::mkCrossFrameNExperiment(dTrainN, c('x1','x2','x3'),'y', scale=TRUE) -
## [1] "vtreat 1.4.5 start initial treatment design Wed Sep 11 08:19:52 2019"
-## [1] " start cross frame work Wed Sep 11 08:19:52 2019"
-## [1] " vtreat::mkCrossFrameNExperiment done Wed Sep 11 08:19:52 2019"
+
## [1] "vtreat 1.4.6 start initial treatment design Sun Sep 22 15:17:25 2019"
+## [1] " start cross frame work Sun Sep 22 15:17:25 2019"
+## [1] " vtreat::mkCrossFrameNExperiment done Sun Sep 22 15:17:25 2019"
## [1] "x1" "x2" "x3"
@@ -285,9 +285,9 @@

cEscaled <- vtreat::mkCrossFrameNExperiment(dTrainN, c('x1','x2','x3'),'yScaled', scale=TRUE) -
## [1] "vtreat 1.4.5 start initial treatment design Wed Sep 11 08:19:52 2019"
-## [1] " start cross frame work Wed Sep 11 08:19:52 2019"
-## [1] " vtreat::mkCrossFrameNExperiment done Wed Sep 11 08:19:52 2019"
+
## [1] "vtreat 1.4.6 start initial treatment design Sun Sep 22 15:17:25 2019"
+## [1] " start cross frame work Sun Sep 22 15:17:25 2019"
+## [1] " vtreat::mkCrossFrameNExperiment done Sun Sep 22 15:17:25 2019"
## [1] "x1" "x2" "x3"
diff --git a/docs/articles/vtreatSignificance.html b/docs/articles/vtreatSignificance.html index 97d55cd6..a6c95e80 100644 --- a/docs/articles/vtreatSignificance.html +++ b/docs/articles/vtreatSignificance.html @@ -30,7 +30,7 @@ vtreat - 1.4.5 + 1.4.6 @@ -113,7 +113,7 @@

vtreat significance

John Mount, Nina Zumel

-

2019-09-11

+

2019-09-22

Source: vignettes/vtreatSignificance.Rmd @@ -167,13 +167,13 @@

2019-09-11

## 2 FALSE lev002 lev002F ## 252 FALSE lev002 lev002F
treatmentsC <- vtreat::designTreatmentsC(d,c('catVarNoise','catVarPerfect'),'y',TRUE)
-
## [1] "vtreat 1.4.5 inspecting inputs Wed Sep 11 08:19:54 2019"
-## [1] "designing treatments Wed Sep 11 08:19:54 2019"
-## [1] " have initial level statistics Wed Sep 11 08:19:54 2019"
-## [1] " scoring treatments Wed Sep 11 08:19:54 2019"
-## [1] "have treatment plan Wed Sep 11 08:19:54 2019"
-## [1] "rescoring complex variables Wed Sep 11 08:19:54 2019"
-## [1] "done rescoring complex variables Wed Sep 11 08:19:54 2019"
+
## [1] "vtreat 1.4.6 inspecting inputs Sun Sep 22 15:17:28 2019"
+## [1] "designing treatments Sun Sep 22 15:17:28 2019"
+## [1] " have initial level statistics Sun Sep 22 15:17:28 2019"
+## [1] " scoring treatments Sun Sep 22 15:17:28 2019"
+## [1] "have treatment plan Sun Sep 22 15:17:28 2019"
+## [1] "rescoring complex variables Sun Sep 22 15:17:28 2019"
+## [1] "done rescoring complex variables Sun Sep 22 15:17:28 2019"
# Estimate effect significance (not coefficient significance).
 estSigGLM <- function(xVar,yVar,numberOfHiddenDegrees=0) {
   d <- data.frame(x=xVar,y=yVar,stringsAsFactors = FALSE)
diff --git a/docs/articles/vtreatSplitting.html b/docs/articles/vtreatSplitting.html
index c7254d53..5109a087 100644
--- a/docs/articles/vtreatSplitting.html
+++ b/docs/articles/vtreatSplitting.html
@@ -30,7 +30,7 @@
       
       
         vtreat
-        1.4.5
+        1.4.6
       
     
@@ -113,7 +113,7 @@

vtreat splitting

John Mount, Nina Zumel

-

2019-09-11

+

2019-09-22

Source: vignettes/vtreatSplitting.Rmd @@ -262,26 +262,26 @@

## overlap
## [[1]]
 ## [[1]]$train
-## [1] 1 3 4 5
+## [1] 1 2 4 5
 ## 
 ## [[1]]$app
-## [1] 2
+## [1] 3
 ## 
 ## 
 ## [[2]]
 ## [[2]]$train
-## [1] 2 3 4
+## [1] 1 2 3
 ## 
 ## [[2]]$app
-## [1] 5 1
+## [1] 5 4
 ## 
 ## 
 ## [[3]]
 ## [[3]]$train
-## [1] 1 2 5
+## [1] 3 4 5
 ## 
 ## [[3]]$app
-## [1] 3 4
+## [1] 2 1
 ## 
 ## 
 ## attr(,"splitmethod")
diff --git a/docs/articles/vtreatVariableTypes.html b/docs/articles/vtreatVariableTypes.html
index e03b9046..cd3ef55d 100644
--- a/docs/articles/vtreatVariableTypes.html
+++ b/docs/articles/vtreatVariableTypes.html
@@ -30,7 +30,7 @@
       
       
         vtreat
-        1.4.5
+        1.4.6
       
     
 
@@ -113,7 +113,7 @@
       

Variable Types

Win-Vector LLC

-

2019-09-11

+

2019-09-22

Source: vignettes/vtreatVariableTypes.Rmd @@ -145,18 +145,18 @@

z=c(1,2,3,4,NA,6),y=c(FALSE,FALSE,TRUE,FALSE,TRUE,TRUE), stringsAsFactors = FALSE) treatmentsC <- designTreatmentsC(dTrainC,colnames(dTrainC),'y',TRUE)

-
## [1] "vtreat 1.4.5 inspecting inputs Wed Sep 11 08:19:58 2019"
-## [1] "designing treatments Wed Sep 11 08:19:58 2019"
-## [1] " have initial level statistics Wed Sep 11 08:19:58 2019"
-## [1] " scoring treatments Wed Sep 11 08:19:59 2019"
-## [1] "have treatment plan Wed Sep 11 08:19:59 2019"
-## [1] "rescoring complex variables Wed Sep 11 08:19:59 2019"
-## [1] "done rescoring complex variables Wed Sep 11 08:19:59 2019"
+
## [1] "vtreat 1.4.6 inspecting inputs Sun Sep 22 15:17:33 2019"
+## [1] "designing treatments Sun Sep 22 15:17:33 2019"
+## [1] " have initial level statistics Sun Sep 22 15:17:33 2019"
+## [1] " scoring treatments Sun Sep 22 15:17:33 2019"
+## [1] "have treatment plan Sun Sep 22 15:17:33 2019"
+## [1] "rescoring complex variables Sun Sep 22 15:17:33 2019"
+## [1] "done rescoring complex variables Sun Sep 22 15:17:33 2019"
scoreColsToPrint <- c('origName','varName','code','rsq','sig','extraModelDegrees')
 print(treatmentsC$scoreFrame[,scoreColsToPrint])
##   origName   varName  code        rsq       sig extraModelDegrees
-## 1        x    x_catP  catP 0.24340634 0.1547700                 2
-## 2        x    x_catB  catB 0.05070201 0.5160763                 2
+## 1        x    x_catP  catP 0.11457614 0.3289524                 2
+## 2        x    x_catB  catB 0.12081050 0.3161341                 2
 ## 3        z         z clean 0.25792985 0.1429977                 0
 ## 4        z   z_isBAD isBAD 0.19087450 0.2076623                 0
 ## 5        x  x_lev_NA   lev 0.19087450 0.2076623                 0
@@ -172,7 +172,7 @@ 

# Map significances back to original variables
 aggregate(sig~origName,data=treatmentsC$scoreFrame,FUN=min)
##   origName       sig
-## 1        x 0.1547700
+## 1        x 0.2076623
 ## 2        z 0.1429977

In the scoreFrame the sig column is the significance of the single-variable logistic regression using the named variable (plus a constant term), and the rsq column is the “pseudo-r-squared” or portion of deviance explained (please see here for some notes).

Essentially a derived variable name is built by concatenating an original variable name and a treatment type (also recorded in the code column for convenience). The codes give the different ‘vtreat’ variable types (or really meanings, as all derived variables are numeric).

@@ -199,23 +199,23 @@

z=c(1,2,3,4,NA,6),y=as.numeric(c(FALSE,FALSE,TRUE,FALSE,TRUE,TRUE)), stringsAsFactors = FALSE) treatmentsN <- designTreatmentsN(dTrainN,colnames(dTrainN),'y')

-
## [1] "vtreat 1.4.5 inspecting inputs Wed Sep 11 08:19:59 2019"
-## [1] "designing treatments Wed Sep 11 08:19:59 2019"
-## [1] " have initial level statistics Wed Sep 11 08:19:59 2019"
-## [1] " scoring treatments Wed Sep 11 08:19:59 2019"
-## [1] "have treatment plan Wed Sep 11 08:19:59 2019"
-## [1] "rescoring complex variables Wed Sep 11 08:19:59 2019"
-## [1] "done rescoring complex variables Wed Sep 11 08:19:59 2019"
+
## [1] "vtreat 1.4.6 inspecting inputs Sun Sep 22 15:17:33 2019"
+## [1] "designing treatments Sun Sep 22 15:17:33 2019"
+## [1] " have initial level statistics Sun Sep 22 15:17:33 2019"
+## [1] " scoring treatments Sun Sep 22 15:17:33 2019"
+## [1] "have treatment plan Sun Sep 22 15:17:33 2019"
+## [1] "rescoring complex variables Sun Sep 22 15:17:33 2019"
+## [1] "done rescoring complex variables Sun Sep 22 15:17:33 2019"
print(treatmentsN$scoreFrame[,scoreColsToPrint])
-
##   origName   varName  code          rsq       sig extraModelDegrees
-## 1        x    x_catP  catP 4.385965e-01 0.1518345                 2
-## 2        x    x_catN  catN 1.110223e-16 1.0000000                 2
-## 3        x    x_catD  catD 1.111111e-01 0.5185185                 2
-## 4        z         z clean 3.045045e-01 0.2562868                 0
-## 5        z   z_isBAD isBAD 2.000000e-01 0.3739010                 0
-## 6        x  x_lev_NA   lev 2.000000e-01 0.3739010                 0
-## 7        x x_lev_x_a   lev 1.111111e-01 0.5185185                 0
-## 8        x x_lev_x_b   lev 0.000000e+00 1.0000000                 0
+
##   origName   varName  code       rsq       sig extraModelDegrees
+## 1        x    x_catP  catP 0.2857143 0.2745766                 2
+## 2        x    x_catN  catN 0.1052632 0.5304117                 2
+## 3        x    x_catD  catD 0.1111111 0.5185185                 2
+## 4        z         z clean 0.3045045 0.2562868                 0
+## 5        z   z_isBAD isBAD 0.2000000 0.3739010                 0
+## 6        x  x_lev_NA   lev 0.2000000 0.3739010                 0
+## 7        x x_lev_x_a   lev 0.1111111 0.5185185                 0
+## 8        x x_lev_x_b   lev 0.0000000 1.0000000                 0

The treatment of numeric targets is similar to that of categorical targets. In the numeric case the possible derived variable types are: