diff --git a/DESCRIPTION b/DESCRIPTION index 0e71ccc8..10b3aec0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,8 @@ Suggests: knitr, parallel, rmarkdown, - dplyr + dplyr, + ggplot2 LazyData: true VignetteBuilder: knitr RoxygenNote: 5.0.1 diff --git a/cran-comments.md b/cran-comments.md index 88a7e2ab..c232dfda 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -7,25 +7,8 @@ * using R version 3.3.1 (2016-06-21) -- "Bug in Your Hair" * using platform: x86_64-apple-darwin14.5.0 (64-bit) -There were no ERRORs, or WARNINGSs. - -There was one NOTE: - - * checking for unstated dependencies in vignettes ... NOTE - '::' or ':::' import not declared from: ‘caret’ - 'library' or 'require' calls not declared from: - ‘caret’ ‘ggplot2’ - - -Both of the above calls are guarded by requireNamespace calls are are -there to illustrate how a user can use the additional caret or ggplot2 -packages. caret and ggplot2 are not used (even optionally) in the -vtreat package, other than being also present in some unit tests -(again optional and to test things for users using caret or -data.table). These packages are deliberately not in suggests as the -vtreat package does not use them or even optionally alter its behavior -if they are available. - +R CMD check results +0 errors | 0 warnings | 0 notes Note_to_CRAN_maintainers diff --git a/inst/doc/vtreat.html b/inst/doc/vtreat.html index 2c4d9668..cce0aa6e 100644 --- a/inst/doc/vtreat.html +++ b/inst/doc/vtreat.html @@ -12,7 +12,7 @@ - + vtreat package @@ -70,7 +70,7 @@

vtreat package

John Mount, Nina Zumel

-

2016-06-27

+

2016-07-11

@@ -138,13 +138,13 @@

A Categorical Outcome Example

## 3 c 30 ## 4 <NA> NA
treatmentsC <- designTreatmentsC(dTrainC,colnames(dTrainC),'y',TRUE)
-
## [1] "desigining treatments Mon Jun 27 08:57:28 2016"
-## [1] "design var x Mon Jun 27 08:57:28 2016"
-## [1] "design var z Mon Jun 27 08:57:28 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:28 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:28 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:28 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:28 2016"
+
## [1] "desigining treatments Mon Jul 11 10:06:51 2016"
+## [1] "design var x Mon Jul 11 10:06:51 2016"
+## [1] "design var z Mon Jul 11 10:06:51 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:51 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:51 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:51 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:51 2016"
print(treatmentsC)
## $treatments
 ## $treatments[[1]]
@@ -184,15 +184,18 @@ 

A Categorical Outcome Example

## $outcomename ## [1] "y" ## +## $vtreatVersion +## [1] '0.5.26' +## +## $splitmethod +## [1] "oneway" +## ## $outcomeTarget ## [1] TRUE ## ## $outcomeType ## [1] "Binary" ## -## $vtreatVersion -## [1] '0.5.26' -## ## attr(,"class") ## [1] "treatmentplan"
print(treatmentsC$treatments[[1]])
@@ -259,13 +262,13 @@

A Numeric Outcome Example

## 3 c 30 ## 4 <NA> NA
treatmentsN = designTreatmentsN(dTrainN,colnames(dTrainN),'y')
-
## [1] "desigining treatments Mon Jun 27 08:57:28 2016"
-## [1] "design var x Mon Jun 27 08:57:28 2016"
-## [1] "design var z Mon Jun 27 08:57:28 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:28 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:28 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:28 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:28 2016"
+
## [1] "desigining treatments Mon Jul 11 10:06:51 2016"
+## [1] "design var x Mon Jul 11 10:06:51 2016"
+## [1] "design var z Mon Jul 11 10:06:51 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:51 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:51 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:51 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:51 2016"
print(treatmentsN)
## $treatments
 ## $treatments[[1]]
@@ -310,12 +313,15 @@ 

A Numeric Outcome Example

## $outcomename ## [1] "y" ## -## $outcomeType -## [1] "Numeric" -## ## $vtreatVersion ## [1] '0.5.26' ## +## $splitmethod +## [1] "oneway" +## +## $outcomeType +## [1] "Numeric" +## ## attr(,"class") ## [1] "treatmentplan"
dTrainNTreated <- prepare(treatmentsN,dTrainN,
diff --git a/inst/doc/vtreatCrossFrames.html b/inst/doc/vtreatCrossFrames.html
index 8768fdb9..215614eb 100644
--- a/inst/doc/vtreatCrossFrames.html
+++ b/inst/doc/vtreatCrossFrames.html
@@ -12,7 +12,7 @@
 
 
 
-
+
 
 vtreat cross frames
 
@@ -70,7 +70,7 @@
 
 

vtreat cross frames

John Mount, Nina Zumel

-

2016-06-27

+

2016-07-11

@@ -117,16 +117,16 @@

The Wrong Way

'y',TRUE, rareCount=0 # Note: usually want rareCount>0, setting to zero to illustrate problem )
-
## [1] "desigining treatments Mon Jun 27 08:57:29 2016"
-## [1] "design var xBad1 Mon Jun 27 08:57:29 2016"
-## [1] "design var xBad2 Mon Jun 27 08:57:29 2016"
-## [1] "design var xBad3 Mon Jun 27 08:57:29 2016"
-## [1] "design var xGood1 Mon Jun 27 08:57:29 2016"
-## [1] "design var xGood2 Mon Jun 27 08:57:29 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:29 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:29 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:29 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:29 2016"
+
## [1] "desigining treatments Mon Jul 11 10:06:52 2016"
+## [1] "design var xBad1 Mon Jul 11 10:06:52 2016"
+## [1] "design var xBad2 Mon Jul 11 10:06:52 2016"
+## [1] "design var xBad3 Mon Jul 11 10:06:52 2016"
+## [1] "design var xGood1 Mon Jul 11 10:06:52 2016"
+## [1] "design var xGood2 Mon Jul 11 10:06:52 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:52 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:52 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:52 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:52 2016"
dTrainTreated <- vtreat::prepare(treatments,dTrain,
   pruneSig=c() # Note: usually want pruneSig to be a small fraction, setting to null to illustrate problems
 )
@@ -195,16 +195,16 @@ 

The Right Way: A Calibration Set

'y',TRUE, rareCount=0 # Note: usually want rareCount>0, setting to zero to illustrate problem )
-
## [1] "desigining treatments Mon Jun 27 08:57:29 2016"
-## [1] "design var xBad1 Mon Jun 27 08:57:29 2016"
-## [1] "design var xBad2 Mon Jun 27 08:57:29 2016"
-## [1] "design var xBad3 Mon Jun 27 08:57:29 2016"
-## [1] "design var xGood1 Mon Jun 27 08:57:30 2016"
-## [1] "design var xGood2 Mon Jun 27 08:57:30 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:30 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:30 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:30 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:30 2016"
+
## [1] "desigining treatments Mon Jul 11 10:06:52 2016"
+## [1] "design var xBad1 Mon Jul 11 10:06:52 2016"
+## [1] "design var xBad2 Mon Jul 11 10:06:52 2016"
+## [1] "design var xBad3 Mon Jul 11 10:06:52 2016"
+## [1] "design var xGood1 Mon Jul 11 10:06:53 2016"
+## [1] "design var xGood2 Mon Jul 11 10:06:53 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:53 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:53 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:53 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:53 2016"
dTrainTreated <- vtreat::prepare(treatments,dTrain,
   pruneSig=pruneSig)
 newvars <- setdiff(colnames(dTrainTreated),'y')
diff --git a/inst/doc/vtreatGrouping.R b/inst/doc/vtreatGrouping.R
index ad805739..30a6745a 100644
--- a/inst/doc/vtreatGrouping.R
+++ b/inst/doc/vtreatGrouping.R
@@ -2,6 +2,9 @@
 knitr::opts_chunk$set(fig.width = 7)
 
 ## ----echo=FALSE, message=FALSE, warning=FALSE----------------------------
+library(vtreat)
+set.seed(23255)
+
 have_ggplot = requireNamespace("ggplot2", quietly=TRUE)
 have_dplyr = requireNamespace("dplyr", quietly=TRUE)
 if(have_ggplot) {
@@ -11,11 +14,7 @@ if(have_dplyr) {
   library(dplyr)
 }
 
-
-library(vtreat)
-set.seed(23255)
-
-## ----functions, echo=FALSE-----------------------------------------------
+## ----echo=FALSE, message=FALSE, warning=FALSE----------------------------
 #
 # takes the frame (d) and the outcome column (d$conc)
 # from the global environment
@@ -49,8 +48,6 @@ showGroupingBehavior = function(groupcol, title) {
   }
 }
 
-
-
 ## ----data----------------------------------------------------------------
 # panel data for concentration in multiple subjects 
 d <- datasets::Theoph
diff --git a/inst/doc/vtreatGrouping.Rmd b/inst/doc/vtreatGrouping.Rmd
index 2227b3bd..bcf4afa7 100644
--- a/inst/doc/vtreatGrouping.Rmd
+++ b/inst/doc/vtreatGrouping.Rmd
@@ -14,6 +14,9 @@ knitr::opts_chunk$set(fig.width = 7)
 ```
 
 ```{r echo=FALSE, message=FALSE, warning=FALSE}
+library(vtreat)
+set.seed(23255)
+
 have_ggplot = requireNamespace("ggplot2", quietly=TRUE)
 have_dplyr = requireNamespace("dplyr", quietly=TRUE)
 if(have_ggplot) {
@@ -22,15 +25,9 @@ if(have_ggplot) {
 if(have_dplyr) {
   library(dplyr)
 }
-
-
-library(vtreat)
-set.seed(23255)
 ```
 
-This vignette shows an example use of _y_-stratified sampling with a grouping restriction in `vtreat`.
-
-```{r functions, echo=FALSE}
+```{r echo=FALSE, message=FALSE, warning=FALSE}
 #
 # takes the frame (d) and the outcome column (d$conc)
 # from the global environment
@@ -63,10 +60,10 @@ showGroupingBehavior = function(groupcol, title) {
     print(plt)
   }
 }
-
-
 ```
 
+This vignette shows an example use of _y_-stratified sampling with a grouping restriction in `vtreat`.
+
 For this example, we will use the `Theosph` dataset: data from an experiment on the pharmacokinetics of theophylline. We will demonstrate the desired effects of _y_-stratification while also respecting a grouping constraint. 
 
 ## The Data
diff --git a/inst/doc/vtreatGrouping.html b/inst/doc/vtreatGrouping.html
index 0c48056d..6a3b4d82 100644
--- a/inst/doc/vtreatGrouping.html
+++ b/inst/doc/vtreatGrouping.html
@@ -12,7 +12,7 @@
 
 
 
-
+
 
 Grouping Example
 
@@ -70,7 +70,7 @@
 
 

Grouping Example

Nina Zumel, Nate Sutton

-

2016-06-27

+

2016-07-11

@@ -150,18 +150,18 @@

Partitioning the Data for Modeling

print(table(Subject=d$Subject, groupid=d$stratSplit))
##        groupid
 ## Subject 1 2 3
-##      6  4 4 3
-##      7  4 6 1
-##      8  3 4 4
-##      11 1 5 5
-##      3  3 2 6
-##      2  5 2 4
-##      4  3 3 5
-##      9  5 4 2
-##      12 3 4 4
-##      10 6 2 3
-##      1  2 5 4
-##      5  5 3 3
+## 6 6 3 2 +## 7 6 2 3 +## 8 3 6 2 +## 11 2 3 6 +## 3 4 4 3 +## 2 3 4 4 +## 4 3 4 4 +## 9 3 5 3 +## 12 3 2 6 +## 10 5 3 3 +## 1 3 4 4 +## 5 3 4 4

We can see this partition didn’t preserve the Subject grouping.

Finally, we can try vtreat’s group-preserving split, which also tries to y-stratify as much as possible (by stratifying on the mean y observation from each group).

# stratify by patient and outcome
@@ -175,15 +175,15 @@ 

Partitioning the Data for Modeling

print(table(Subject=d$Subject, groupid=d$subjectSplit))
##        groupid
 ## Subject  1  2  3
-##      6  11  0  0
-##      7   0  0 11
-##      8   0 11  0
+##      6   0 11  0
+##      7  11  0  0
+##      8   0  0 11
 ##      11  0 11  0
-##      3  11  0  0
-##      2   0  0 11
-##      4   0 11  0
-##      9  11  0  0
-##      12  0  0 11
+##      3   0  0 11
+##      2  11  0  0
+##      4  11  0  0
+##      9   0  0 11
+##      12  0 11  0
 ##      10 11  0  0
 ##      1   0 11  0
 ##      5   0  0 11
@@ -203,9 +203,9 @@

Group-preserving, y-stratified Partition

## [1] "Group by patient, stratify on y"
 ## [1] "Group means:"
 ##        1        2        3 
-## 4.859091 5.040455 4.981818 
-## [1] "Standard deviation of group means: 0.0925499641694611"
-

+## 4.901364 4.971364 5.008636 +## [1] "Standard deviation of group means: 0.0544620574572734" +

diff --git a/inst/doc/vtreatOverfit.html b/inst/doc/vtreatOverfit.html index e38f3425..94712729 100644 --- a/inst/doc/vtreatOverfit.html +++ b/inst/doc/vtreatOverfit.html @@ -12,7 +12,7 @@ - + vtreat overfit @@ -70,7 +70,7 @@

vtreat overfit

John Mount, Nina Zumel

-

2016-06-27

+

2016-07-11

@@ -96,12 +96,12 @@

Bad Practice: Using the same data to treat and to train

treatments <- vtreat::designTreatmentsC(dTrain,'x','y',TRUE, rareCount=0 # Note: usually want rareCount>0, setting to zero to illustrate problem ) -
## [1] "desigining treatments Mon Jun 27 08:57:33 2016"
-## [1] "design var x Mon Jun 27 08:57:33 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:33 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:33 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:33 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:33 2016"
+
## [1] "desigining treatments Mon Jul 11 10:06:56 2016"
+## [1] "design var x Mon Jul 11 10:06:56 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:56 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:56 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:56 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:56 2016"
dTrainTreated <- vtreat::prepare(treatments,dTrain,
   pruneSig=c() # Note: usually want pruneSig to be a small fraction, setting to null to illustrate problem
 )
@@ -193,12 +193,12 @@ 

Correct Practice: Use different data to treat and train

rareCount=0, # Note set this to something larger, like 5 rareSig=c() # Note set this to something like 0.3 )
-
## [1] "desigining treatments Mon Jun 27 08:57:33 2016"
-## [1] "design var x Mon Jun 27 08:57:33 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:34 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:34 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:34 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:34 2016"
+
## [1] "desigining treatments Mon Jul 11 10:06:56 2016"
+## [1] "design var x Mon Jul 11 10:06:56 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:56 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:56 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:56 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:56 2016"
dTrainTreated <- vtreat::prepare(treatments,dTrain,
                                  pruneSig=c() # Note: set this to filter, like 0.05 or 1/nvars
 )
diff --git a/inst/doc/vtreatScaleMode.html b/inst/doc/vtreatScaleMode.html
index e3207bb9..b7439c33 100644
--- a/inst/doc/vtreatScaleMode.html
+++ b/inst/doc/vtreatScaleMode.html
@@ -12,7 +12,7 @@
 
 
 
-
+
 
 vtreat scale mode
 
@@ -70,7 +70,7 @@
 
 

vtreat scale mode

Win-Vector LLC

-

2016-06-27

+

2016-07-11

diff --git a/inst/doc/vtreatSignificance.html b/inst/doc/vtreatSignificance.html index 6d59e22a..26c8d57c 100644 --- a/inst/doc/vtreatSignificance.html +++ b/inst/doc/vtreatSignificance.html @@ -12,7 +12,7 @@ - + vtreat significance @@ -70,7 +70,7 @@

vtreat significance

John Mount, Nina Zumel

-

2016-06-27

+

2016-07-11

@@ -119,13 +119,13 @@

2016-06-27

## 2 FALSE lev002 lev002F ## 252 FALSE lev002 lev002F
treatmentsC <- vtreat::designTreatmentsC(d,c('catVarNoise','catVarPerfect'),'y',TRUE)
-
## [1] "desigining treatments Mon Jun 27 08:57:35 2016"
-## [1] "design var catVarNoise Mon Jun 27 08:57:35 2016"
-## [1] "design var catVarPerfect Mon Jun 27 08:57:35 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:35 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:35 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:35 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:35 2016"
+
## [1] "desigining treatments Mon Jul 11 10:06:57 2016"
+## [1] "design var catVarNoise Mon Jul 11 10:06:57 2016"
+## [1] "design var catVarPerfect Mon Jul 11 10:06:57 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:57 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:57 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:57 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:58 2016"
# Estimate effect significance (not coeficient significance).
 estSigGLM <- function(xVar,yVar,numberOfHiddenDegrees=0) {
   d <- data.frame(x=xVar,y=yVar,stringsAsFactors = FALSE)
diff --git a/inst/doc/vtreatSplitting.R b/inst/doc/vtreatSplitting.R
index cb7e71c5..dfe241e1 100644
--- a/inst/doc/vtreatSplitting.R
+++ b/inst/doc/vtreatSplitting.R
@@ -5,26 +5,10 @@ knitr::opts_chunk$set(fig.width = 7)
 ## ------------------------------------------------------------------------
 vtreat::oneWayHoldout(3,NULL,NULL,NULL)
 
-## ------------------------------------------------------------------------
-splitFn <- function(nRows,nSplits,dframe,y) {
-  if(requireNamespace("caret",quietly=TRUE)) {
-    fullSeq <- seq_len(nRows)
-    part <- caret::createFolds(y=y,k=nSplits)
-    lapply(part,
-           function(appi) { 
-             list(train=setdiff(fullSeq,appi),app=appi)
-           })
-  } else {
-    NULL # fall back to vtreat implementation
-  }
-}
-
-## ------------------------------------------------------------------------
-vtreat::buildEvalSets(25,y=1:25,splitFunction=splitFn)
-
 ## ----warning=FALSE-------------------------------------------------------
 library('vtreat')
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+haveGGPlot2 <- requireNamespace("ggplot2",quietly=TRUE)
+if(haveGGPlot2) {
   library('ggplot2')
 }
 
@@ -58,7 +42,7 @@ d$simpleGroup <- vtreat::getSplitPlanAppLabels(nrow(d),pSimple)
 tapply(d$y,d$simpleGroup,mean)
 # standard error of mean(y)
 sd(tapply(d$y,d$simpleGroup,mean))
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+if(haveGGPlot2) {
   # plot the distribution of y in each fold
   ggplot(data=d,aes(x=y,color=as.factor(simpleGroup))) + 
     geom_density() + ggtitle('simple (unstratified) grouping')
@@ -67,7 +51,7 @@ if(requireNamespace("ggplot2",quietly=TRUE)) {
 tapply(d$y,d$stratGroup,mean)
 # standard error of mean(y)
 sd(tapply(d$y,d$stratGroup,mean))
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+if(haveGGPlot2) {
   # plot the distribution of y in each fold
   ggplot(data=d,aes(x=y,color=as.factor(stratGroup))) + 
     geom_density() + ggtitle('y-stratified grouping')
diff --git a/inst/doc/vtreatSplitting.Rmd b/inst/doc/vtreatSplitting.Rmd
index 2d5dac42..8358e904 100644
--- a/inst/doc/vtreatSplitting.Rmd
+++ b/inst/doc/vtreatSplitting.Rmd
@@ -87,30 +87,7 @@ As we can see `vtreat::oneWayHoldout` builds three split sets where in each set
 
 The function `buildEvalSets` takes one of the above splitting functions as input and returns a cross-validation plan that instantiates the desired splitting, while also guarding against corner cases. You can also explicitly specify the splitting plan when designing a vtreat variable treatment plan using `designTreatments[N\C]` or `mkCrossFrame[N\C]Experiment`.
 
-For issues beyond stratification the user may want to supply their own splitting plan.  For example to wrap [`caret::createFolds`](http://topepo.github.io/caret/index.html) as a splitting function we would write the following function definition.
-
-```{r}
-splitFn <- function(nRows,nSplits,dframe,y) {
-  if(requireNamespace("caret",quietly=TRUE)) {
-    fullSeq <- seq_len(nRows)
-    part <- caret::createFolds(y=y,k=nSplits)
-    lapply(part,
-           function(appi) { 
-             list(train=setdiff(fullSeq,appi),app=appi)
-           })
-  } else {
-    NULL # fall back to vtreat implementation
-  }
-}
-```
-
-This function can then be passed into any `vtreat` operation that takes a `splitFunction` argument (such as `mkCrossFrameNExperiment`, `designTreatmentsN`, and many more).  For example we can pass the user defined `splitFn` into `vtreat::buildEvalSets` as follows:
-
-```{r}
-vtreat::buildEvalSets(25,y=1:25,splitFunction=splitFn)
-```
-
-As stated above, the vtreat library code will try to use the user function for splitting, but will fall back to an appropriate vtreat function in corner cases that the user function may not handle (for example, too few rows, too few groups, and so on). Thus the user code can assume it is in a reasonable situation (and even safely return NULL if it can’t deal with the situation it is given).
+For issues beyond stratification the user may want to supply their own splitting plan by writing a splitting function. Such a function can then be passed into any `vtreat` operation that takes a `splitFunction` argument (such as `mkCrossFrameNExperiment`, `designTreatmentsN`, and many more).  For example, a user-defined function `splitFn` can be passed to `vtreat::buildEvalSets` through its `splitFunction` argument.
 
 The file [outOfSample.R](https://github.com/WinVector/vtreat/blob/master/R/outOfSample.R) is full of worked examples.  In particular we would suggest running the code displayed when you type any of:
 
@@ -124,10 +101,12 @@ For example from `help(kWayStratifiedY)` we can see that the distribution of `y`
 
 ```{r warning=FALSE}
 library('vtreat')
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+haveGGPlot2 <- requireNamespace("ggplot2",quietly=TRUE)
+if(haveGGPlot2) {
   library('ggplot2')
 }
 ```
+
 ```{r}
 set.seed(23255)
 d <- data.frame(y=sin(1:100))
@@ -158,7 +137,7 @@ d$simpleGroup <- vtreat::getSplitPlanAppLabels(nrow(d),pSimple)
 tapply(d$y,d$simpleGroup,mean)
 # standard error of mean(y)
 sd(tapply(d$y,d$simpleGroup,mean))
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+if(haveGGPlot2) {
   # plot the distribution of y in each fold
   ggplot(data=d,aes(x=y,color=as.factor(simpleGroup))) + 
     geom_density() + ggtitle('simple (unstratified) grouping')
@@ -167,7 +146,7 @@ if(requireNamespace("ggplot2",quietly=TRUE)) {
 tapply(d$y,d$stratGroup,mean)
 # standard error of mean(y)
 sd(tapply(d$y,d$stratGroup,mean))
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+if(haveGGPlot2) {
   # plot the distribution of y in each fold
   ggplot(data=d,aes(x=y,color=as.factor(stratGroup))) + 
     geom_density() + ggtitle('y-stratified grouping')
diff --git a/inst/doc/vtreatSplitting.html b/inst/doc/vtreatSplitting.html
index 4eaa5bba..1f1d10af 100644
--- a/inst/doc/vtreatSplitting.html
+++ b/inst/doc/vtreatSplitting.html
@@ -12,7 +12,7 @@
 
 
 
-
+
 
 vtreat splitting
 
@@ -70,7 +70,7 @@
 
 

vtreat splitting

John Mount, Nina Zumel

-

2016-06-27

+

2016-07-11

@@ -157,48 +157,7 @@

Examples

  • makekWayCrossValidationGroupedByColumn: k-way y-stratified cross-validation that preserves grouping (for example, all rows corresponding to a single customer or patient, etc). This is a complex splitting plan, and only recommended when absolutely needed.
  • The function buildEvalSets takes one of the above splitting functions as input and returns a cross-validation plan that instantiates the desired splitting, while also guarding against corner cases. You can also explicitly specify the splitting plan when designing a vtreat variable treatment plan using designTreatments[N\C] or mkCrossFrame[N\C]Experiment.

    -

    For issues beyond stratification the user may want to supply their own splitting plan. For example to wrap caret::createFolds as a splitting function we would write the following function definition.

    -
    splitFn <- function(nRows,nSplits,dframe,y) {
    -  if(requireNamespace("caret",quietly=TRUE)) {
    -    fullSeq <- seq_len(nRows)
    -    part <- caret::createFolds(y=y,k=nSplits)
    -    lapply(part,
    -           function(appi) { 
    -             list(train=setdiff(fullSeq,appi),app=appi)
    -           })
    -  } else {
    -    NULL # fall back to vtreat implementation
    -  }
    -}
    -

    This function can then be passed into any vtreat operation that takes a splitFunction argument (such as mkCrossFrameNExperiment, designTreatmentsN, and many more). For example we can pass the user defined splitFn into vtreat::buildEvalSets as follows:

    -
    vtreat::buildEvalSets(25,y=1:25,splitFunction=splitFn)
    -
    ## $Fold1
    -## $Fold1$train
    -##  [1]  1  3  4  6  7 10 11 12 13 15 16 17 18 20 23 24 25
    -## 
    -## $Fold1$app
    -## [1]  2  5  8  9 14 19 21 22
    -## 
    -## 
    -## $Fold2
    -## $Fold2$train
    -##  [1]  2  5  6  7  8  9 10 13 14 17 18 19 20 21 22 24
    -## 
    -## $Fold2$app
    -## [1]  1  3  4 11 12 15 16 23 25
    -## 
    -## 
    -## $Fold3
    -## $Fold3$train
    -##  [1]  1  2  3  4  5  8  9 11 12 14 15 16 19 21 22 23 25
    -## 
    -## $Fold3$app
    -## [1]  6  7 10 13 17 18 20 24
    -## 
    -## 
    -## attr(,"splitmethod")
    -## [1] "userfunction"
    -

    As stated above, the vtreat library code will try to use the user function for splitting, but will fall back to an appropriate vtreat function in corner cases that the user function may not handle (for example, too few rows, too few groups, and so on). Thus the user code can assume it is in a reasonable situation (and even safely return NULL if it can’t deal with the situation it is given).

    +

    For issues beyond stratification the user may want to supply their own splitting plan by writing a splitting function. Such a function can then be passed into any vtreat operation that takes a splitFunction argument (such as mkCrossFrameNExperiment, designTreatmentsN, and many more). For example, a user-defined function splitFn can be passed to vtreat::buildEvalSets through its splitFunction argument.

    The file outOfSample.R is full of worked examples. In particular we would suggest running the code displayed when you type any of:

    • help(oneWayHoldout)
    • @@ -208,7 +167,8 @@

      Examples

    For example from help(kWayStratifiedY) we can see that the distribution of y is much more similar in each fold when we stratify than when we don’t:

    library('vtreat')
    -if(requireNamespace("ggplot2",quietly=TRUE)) {
    +haveGGPlot2 <- requireNamespace("ggplot2",quietly=TRUE)
    +if(haveGGPlot2) {
       library('ggplot2')
     }
    set.seed(23255)
    @@ -245,7 +205,7 @@ 

    Examples

    # standard error of mean(y)
     sd(tapply(d$y,d$simpleGroup,mean))
    ## [1] 0.1019753
    -
    if(requireNamespace("ggplot2",quietly=TRUE)) {
    +
    if(haveGGPlot2) {
       # plot the distribution of y in each fold
       ggplot(data=d,aes(x=y,color=as.factor(simpleGroup))) + 
         geom_density() + ggtitle('simple (unstratified) grouping')
    @@ -258,7 +218,7 @@ 

    Examples

    # standard error of mean(y)
     sd(tapply(d$y,d$stratGroup,mean))
    ## [1] 0.01141606
    -
    if(requireNamespace("ggplot2",quietly=TRUE)) {
    +
    if(haveGGPlot2) {
       # plot the distribution of y in each fold
       ggplot(data=d,aes(x=y,color=as.factor(stratGroup))) + 
         geom_density() + ggtitle('y-stratified grouping')
    diff --git a/inst/doc/vtreatVariableTypes.html b/inst/doc/vtreatVariableTypes.html
    index e305307e..70fa2ca3 100644
    --- a/inst/doc/vtreatVariableTypes.html
    +++ b/inst/doc/vtreatVariableTypes.html
    @@ -12,7 +12,7 @@
     
     
     
    -
    +
     
     Variable Types
     
    @@ -70,7 +70,7 @@
     
     

    Variable Types

    Win-Vector LLC

    -

    2016-06-27

    +

    2016-07-11

    @@ -96,13 +96,13 @@

    When the target to predict is categorical

    z=c(1,2,3,4,NA,6),y=c(FALSE,FALSE,TRUE,FALSE,TRUE,TRUE), stringsAsFactors = FALSE) treatmentsC <- designTreatmentsC(dTrainC,colnames(dTrainC),'y',TRUE)
    -
    ## [1] "desigining treatments Mon Jun 27 08:57:39 2016"
    -## [1] "design var x Mon Jun 27 08:57:39 2016"
    -## [1] "design var z Mon Jun 27 08:57:39 2016"
    -## [1] "scoring treatments Mon Jun 27 08:57:39 2016"
    -## [1] "have treatment plan Mon Jun 27 08:57:39 2016"
    -## [1] "rescoring complex variables Mon Jun 27 08:57:39 2016"
    -## [1] "done rescoring complex variables Mon Jun 27 08:57:39 2016"
    +
    ## [1] "desigining treatments Mon Jul 11 10:06:59 2016"
    +## [1] "design var x Mon Jul 11 10:06:59 2016"
    +## [1] "design var z Mon Jul 11 10:06:59 2016"
    +## [1] "scoring treatments Mon Jul 11 10:06:59 2016"
    +## [1] "have treatment plan Mon Jul 11 10:06:59 2016"
    +## [1] "rescoring complex variables Mon Jul 11 10:06:59 2016"
    +## [1] "done rescoring complex variables Mon Jul 11 10:06:59 2016"
    print(treatmentsC$scoreFrame[,c('origName','varName','code','varMoves','sig')])
    ##   origName   varName  code varMoves        sig
     ## 1        x  x_lev_NA   lev     TRUE 0.20766228
    @@ -142,13 +142,13 @@ 

    When the target to predict is numeric

    z=c(1,2,3,4,NA,6),y=as.numeric(c(FALSE,FALSE,TRUE,FALSE,TRUE,TRUE)), stringsAsFactors = FALSE) treatmentsN <- designTreatmentsN(dTrainN,colnames(dTrainN),'y')
    -
    ## [1] "desigining treatments Mon Jun 27 08:57:39 2016"
    -## [1] "design var x Mon Jun 27 08:57:39 2016"
    -## [1] "design var z Mon Jun 27 08:57:39 2016"
    -## [1] "scoring treatments Mon Jun 27 08:57:39 2016"
    -## [1] "have treatment plan Mon Jun 27 08:57:39 2016"
    -## [1] "rescoring complex variables Mon Jun 27 08:57:39 2016"
    -## [1] "done rescoring complex variables Mon Jun 27 08:57:39 2016"
    +
    ## [1] "desigining treatments Mon Jul 11 10:06:59 2016"
    +## [1] "design var x Mon Jul 11 10:06:59 2016"
    +## [1] "design var z Mon Jul 11 10:06:59 2016"
    +## [1] "scoring treatments Mon Jul 11 10:06:59 2016"
    +## [1] "have treatment plan Mon Jul 11 10:06:59 2016"
    +## [1] "rescoring complex variables Mon Jul 11 10:06:59 2016"
    +## [1] "done rescoring complex variables Mon Jul 11 10:06:59 2016"
    print(treatmentsN$scoreFrame[,c('origName','varName','code','varMoves','sig')])
    ##   origName   varName  code varMoves       sig
     ## 1        x  x_lev_NA   lev     TRUE 0.3739010
    @@ -178,11 +178,11 @@ 

    When there is no supplied target to predict

    z=c(1,2,3,4,NA,6), stringsAsFactors = FALSE) treatmentsZ <- designTreatmentsZ(dTrainZ,colnames(dTrainZ))
    -
    ## [1] "desigining treatments Mon Jun 27 08:57:39 2016"
    -## [1] "design var x Mon Jun 27 08:57:39 2016"
    -## [1] "design var z Mon Jun 27 08:57:39 2016"
    -## [1] "scoring treatments Mon Jun 27 08:57:39 2016"
    -## [1] "have treatment plan Mon Jun 27 08:57:39 2016"
    +
    ## [1] "desigining treatments Mon Jul 11 10:06:59 2016"
    +## [1] "design var x Mon Jul 11 10:06:59 2016"
    +## [1] "design var z Mon Jul 11 10:06:59 2016"
    +## [1] "scoring treatments Mon Jul 11 10:06:59 2016"
    +## [1] "have treatment plan Mon Jul 11 10:06:59 2016"
    print(treatmentsZ$scoreFrame[,c('origName','varName','code','varMoves')])
    ##   origName varName  code varMoves
     ## 1        x  x_catP  catP     TRUE
    @@ -213,13 +213,13 @@ 

    Overall

    z=c(1,2,3,4,NA,6),y=as.numeric(c(FALSE,FALSE,TRUE,FALSE,TRUE,TRUE)), stringsAsFactors = FALSE) treatmentsN <- designTreatmentsN(dTrainN,colnames(dTrainN),'y')
    -
    ## [1] "desigining treatments Mon Jun 27 08:57:39 2016"
    -## [1] "design var x Mon Jun 27 08:57:39 2016"
    -## [1] "design var z Mon Jun 27 08:57:39 2016"
    -## [1] "scoring treatments Mon Jun 27 08:57:39 2016"
    -## [1] "have treatment plan Mon Jun 27 08:57:39 2016"
    -## [1] "rescoring complex variables Mon Jun 27 08:57:39 2016"
    -## [1] "done rescoring complex variables Mon Jun 27 08:57:39 2016"
    +
    ## [1] "desigining treatments Mon Jul 11 10:06:59 2016"
    +## [1] "design var x Mon Jul 11 10:06:59 2016"
    +## [1] "design var z Mon Jul 11 10:06:59 2016"
    +## [1] "scoring treatments Mon Jul 11 10:06:59 2016"
    +## [1] "have treatment plan Mon Jul 11 10:06:59 2016"
    +## [1] "rescoring complex variables Mon Jul 11 10:06:59 2016"
    +## [1] "done rescoring complex variables Mon Jul 11 10:06:59 2016"
    print(treatmentsN$scoreFrame[,c('origName','varName','code','varMoves','sig')])
    ##   origName   varName  code varMoves       sig
     ## 1        x  x_lev_NA   lev     TRUE 0.3739010
    diff --git a/tests/testthat/testBO.R b/tests/testthat/testBO.R
    index e8f25198..9a4ee687 100644
    --- a/tests/testthat/testBO.R
    +++ b/tests/testthat/testBO.R
    @@ -91,8 +91,6 @@ test_that("testBO: Works As Expected", {
           dTestNTreated <- prepare(treatmentsN,dTest,pruneSig=0.99,scale=scale)
           dTestNTreated$pred <- predict(modelN,newdata=dTestNTreated)
           if(verbose) {
    -        print(ggplot(data=dTestNTreated,aes(x=pred,y=yN)) + geom_point() +
    -                geom_smooth())
             print(summary(modelN))
           }
           
    @@ -120,7 +118,6 @@ test_that("testBO: Works As Expected", {
           dTestCTreated <- prepare(treatmentsC,dTest,pruneSig=0.99,scale=scale)
           dTestCTreated$pred <- predict(modelC,newdata=dTestCTreated,type='response')
           if(verbose) {
    -        print(ggplot(data=dTestCTreated) + geom_density(aes(x=pred,color=yC)))
             print(summary(modelC))
           }
         }
    diff --git a/tests/testthat/testExpmtDesign.R b/tests/testthat/testExpmtDesign.R
    index 853c2415..0a569472 100644
    --- a/tests/testthat/testExpmtDesign.R
    +++ b/tests/testthat/testExpmtDesign.R
    @@ -89,37 +89,3 @@ test_that("testExpmtDesign: makekWayCrossValidationOrderedByColumn", {
       }
     })
     
    -
    -
    -test_that("testExpmtDesign: cross frame design caret", {
    -  if(requireNamespace("caret",quietly=TRUE)) {
    -    set.seed(2325235)
    -    splitFn <- function(nRows,nSplits,dframe,y) {
    -      fullSeq <- seq_len(nRows)
    -      part <- caret::createFolds(y=y,k=nSplits)
    -      lapply(part,
    -             function(appi) { 
    -               list(train=setdiff(fullSeq,appi),app=appi)
    -             })
    -    }
    -    nrowd = 200
    -    y <- rnorm(nrowd)
    -    eSets <- buildEvalSets(nrowd,y=y,
    -                           splitFunction=splitFn)
    -    expect_true(attr(eSets,'splitmethod')=='userfunction')
    -    fullSeq <- seq_len(nrowd)
    -    expect_true(length(eSets)>0)
    -    for(ei in eSets) {
    -      expect_true(length(ei$train)>0)
    -      expect_true(length(ei$app)>0)
    -      expect_true(all(ei$train %in% fullSeq))
    -      expect_true(all(ei$app %in% fullSeq))
    -    }
    -    apps <- Reduce(c,lapply(eSets,function(ei) ei$app))
    -    expect_true(length(apps)==nrowd)
    -    expect_true(length(unique(apps))==nrowd)
    -    problem <- problemAppPlan(nrowd,3,eSets,TRUE)
    -    expect_true(is.null(problem))
    -  }
    -})
    -
    diff --git a/vignettes/vtreatGrouping.Rmd b/vignettes/vtreatGrouping.Rmd
    index 2227b3bd..bcf4afa7 100644
    --- a/vignettes/vtreatGrouping.Rmd
    +++ b/vignettes/vtreatGrouping.Rmd
    @@ -14,6 +14,9 @@ knitr::opts_chunk$set(fig.width = 7)
     ```
     
     ```{r echo=FALSE, message=FALSE, warning=FALSE}
    +library(vtreat)
    +set.seed(23255)
    +
     have_ggplot = requireNamespace("ggplot2", quietly=TRUE)
     have_dplyr = requireNamespace("dplyr", quietly=TRUE)
     if(have_ggplot) {
    @@ -22,15 +25,9 @@ if(have_ggplot) {
     if(have_dplyr) {
       library(dplyr)
     }
    -
    -
    -library(vtreat)
    -set.seed(23255)
     ```
     
    -This vignette shows an example use of _y_-stratified sampling with a grouping restriction in `vtreat`.
    -
    -```{r functions, echo=FALSE}
    +```{r echo=FALSE, message=FALSE, warning=FALSE}
     #
     # takes the frame (d) and the outcome column (d$conc)
     # from the global environment
    @@ -63,10 +60,10 @@ showGroupingBehavior = function(groupcol, title) {
         print(plt)
       }
     }
    -
    -
     ```
     
    +This vignette shows an example use of _y_-stratified sampling with a grouping restriction in `vtreat`.
    +
     For this example, we will use the `Theosph` dataset: data from an experiment on the pharmacokinetics of theophylline. We will demonstrate the desired effects of _y_-stratification while also respecting a grouping constraint. 
     
     ## The Data
    diff --git a/vignettes/vtreatSplitting.Rmd b/vignettes/vtreatSplitting.Rmd
    index 2d5dac42..8358e904 100644
    --- a/vignettes/vtreatSplitting.Rmd
    +++ b/vignettes/vtreatSplitting.Rmd
    @@ -87,30 +87,7 @@ As we can see `vtreat::oneWayHoldout` builds three split sets where in each set
     
     The function `buildEvalSets` takes one of the above splitting functions as input and returns a cross-validation plan that instantiates the desired splitting, while also guarding against corner cases. You can also explicitly specify the splitting plan when designing a vtreat variable treatment plan using `designTreatments[N\C]` or `mkCrossFrame[N\C]Experiment`.
     
    -For issues beyond stratification the user may want to supply their own splitting plan.  For example to wrap [`caret::createFolds`](http://topepo.github.io/caret/index.html) as a splitting function we would write the following function definition.
    -
    -```{r}
    -splitFn <- function(nRows,nSplits,dframe,y) {
    -  if(requireNamespace("caret",quietly=TRUE)) {
    -    fullSeq <- seq_len(nRows)
    -    part <- caret::createFolds(y=y,k=nSplits)
    -    lapply(part,
    -           function(appi) { 
    -             list(train=setdiff(fullSeq,appi),app=appi)
    -           })
    -  } else {
    -    NULL # fall back to vtreat implementation
    -  }
    -}
    -```
    -
    -This function can then be passed into any `vtreat` operation that takes a `splitFunction` argument (such as `mkCrossFrameNExperiment`, `designTreatmentsN`, and many more).  For example we can pass the user defined `splitFn` into `vtreat::buildEvalSets` as follows:
    -
    -```{r}
    -vtreat::buildEvalSets(25,y=1:25,splitFunction=splitFn)
    -```
    -
    -As stated above, the vtreat library code will try to use the user function for splitting, but will fall back to an appropriate vtreat function in corner cases that the user function may not handle (for example, too few rows, too few groups, and so on). Thus the user code can assume it is in a reasonable situation (and even safely return NULL if it can’t deal with the situation it is given).
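    +For issues beyond stratification the user may want to supply their own splitting plan, written as a function of the form `function(nRows,nSplits,dframe,y)` that returns a list of `list(train=...,app=...)` index sets. Such a function can then be passed into any `vtreat` operation that takes a `splitFunction` argument (such as `mkCrossFrameNExperiment`, `designTreatmentsN`, and many more).  For example, a user-defined function `splitFn` can be passed into `vtreat::buildEvalSets` as `vtreat::buildEvalSets(25,y=1:25,splitFunction=splitFn)`.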
     
     The file [outOfSample.R](https://github.com/WinVector/vtreat/blob/master/R/outOfSample.R) is full of worked examples.  In particular we would suggest running the code displayed when you type any of:
     
    @@ -124,10 +101,12 @@ For example from `help(kWayStratifiedY)` we can see that the distribution of `y`
     
     ```{r warning=FALSE}
     library('vtreat')
    -if(requireNamespace("ggplot2",quietly=TRUE)) {
    +haveGGPlot2 <- requireNamespace("ggplot2",quietly=TRUE)
    +if(haveGGPlot2) {
       library('ggplot2')
     }
     ```
    +
     ```{r}
     set.seed(23255)
     d <- data.frame(y=sin(1:100))
    @@ -158,7 +137,7 @@ d$simpleGroup <- vtreat::getSplitPlanAppLabels(nrow(d),pSimple)
     tapply(d$y,d$simpleGroup,mean)
     # standard error of mean(y)
     sd(tapply(d$y,d$simpleGroup,mean))
    -if(requireNamespace("ggplot2",quietly=TRUE)) {
    +if(haveGGPlot2) {
       # plot the distribution of y in each fold
       ggplot(data=d,aes(x=y,color=as.factor(simpleGroup))) + 
         geom_density() + ggtitle('simple (unstratified) grouping')
    @@ -167,7 +146,7 @@ if(requireNamespace("ggplot2",quietly=TRUE)) {
     tapply(d$y,d$stratGroup,mean)
     # standard error of mean(y)
     sd(tapply(d$y,d$stratGroup,mean))
    -if(requireNamespace("ggplot2",quietly=TRUE)) {
    +if(haveGGPlot2) {
       # plot the distribution of y in each fold
       ggplot(data=d,aes(x=y,color=as.factor(stratGroup))) + 
         geom_density() + ggtitle('y-stratified grouping')