diff --git a/DESCRIPTION b/DESCRIPTION index 0e71ccc8..10b3aec0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,8 @@ Suggests: knitr, parallel, rmarkdown, - dplyr + dplyr, + ggplot2 LazyData: true VignetteBuilder: knitr RoxygenNote: 5.0.1 diff --git a/cran-comments.md b/cran-comments.md index 88a7e2ab..c232dfda 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -7,25 +7,8 @@ * using R version 3.3.1 (2016-06-21) -- "Bug in Your Hair" * using platform: x86_64-apple-darwin14.5.0 (64-bit) -There were no ERRORs, or WARNINGSs. - -There was one NOTE: - - * checking for unstated dependencies in vignettes ... NOTE - '::' or ':::' import not declared from: ‘caret’ - 'library' or 'require' calls not declared from: - ‘caret’ ‘ggplot2’ - - -Both of the above calls are guarded by requireNamespace calls are are -there to illustrate how a user can use the additional caret or ggplot2 -packages. caret and ggplot2 are not used (even optionally) in the -vtreat package, other than being also present in some unit tests -(again optional and to test things for users using caret or -data.table). These packages are deliberately not in suggests as the -vtreat package does not use them or even optionally alter its behavior -if they are available. - +R CMD check results +0 errors | 0 warnings | 0 notes Note_to_CRAN_maintainers diff --git a/inst/doc/vtreat.html b/inst/doc/vtreat.html index 2c4d9668..cce0aa6e 100644 --- a/inst/doc/vtreat.html +++ b/inst/doc/vtreat.html @@ -12,7 +12,7 @@ - +
treatmentsC <- designTreatmentsC(dTrainC,colnames(dTrainC),'y',TRUE)
## [1] "desigining treatments Mon Jun 27 08:57:28 2016"
-## [1] "design var x Mon Jun 27 08:57:28 2016"
-## [1] "design var z Mon Jun 27 08:57:28 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:28 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:28 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:28 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:28 2016"
+## [1] "desigining treatments Mon Jul 11 10:06:51 2016"
+## [1] "design var x Mon Jul 11 10:06:51 2016"
+## [1] "design var z Mon Jul 11 10:06:51 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:51 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:51 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:51 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:51 2016"
print(treatmentsC)
## $treatments
## $treatments[[1]]
@@ -184,15 +184,18 @@ A Categorical Outcome Example
## $outcomename
## [1] "y"
##
+## $vtreatVersion
+## [1] '0.5.26'
+##
+## $splitmethod
+## [1] "oneway"
+##
## $outcomeTarget
## [1] TRUE
##
## $outcomeType
## [1] "Binary"
##
-## $vtreatVersion
-## [1] '0.5.26'
-##
## attr(,"class")
## [1] "treatmentplan"
print(treatmentsC$treatments[[1]])
treatmentsN = designTreatmentsN(dTrainN,colnames(dTrainN),'y')
## [1] "desigining treatments Mon Jun 27 08:57:28 2016"
-## [1] "design var x Mon Jun 27 08:57:28 2016"
-## [1] "design var z Mon Jun 27 08:57:28 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:28 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:28 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:28 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:28 2016"
+## [1] "desigining treatments Mon Jul 11 10:06:51 2016"
+## [1] "design var x Mon Jul 11 10:06:51 2016"
+## [1] "design var z Mon Jul 11 10:06:51 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:51 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:51 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:51 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:51 2016"
print(treatmentsN)
## $treatments
## $treatments[[1]]
@@ -310,12 +313,15 @@ A Numeric Outcome Example
## $outcomename
## [1] "y"
##
-## $outcomeType
-## [1] "Numeric"
-##
## $vtreatVersion
## [1] '0.5.26'
##
+## $splitmethod
+## [1] "oneway"
+##
+## $outcomeType
+## [1] "Numeric"
+##
## attr(,"class")
## [1] "treatmentplan"
dTrainNTreated <- prepare(treatmentsN,dTrainN,
diff --git a/inst/doc/vtreatCrossFrames.html b/inst/doc/vtreatCrossFrames.html
index 8768fdb9..215614eb 100644
--- a/inst/doc/vtreatCrossFrames.html
+++ b/inst/doc/vtreatCrossFrames.html
@@ -12,7 +12,7 @@
-
+
vtreat cross frames
@@ -70,7 +70,7 @@
vtreat cross frames
John Mount, Nina Zumel
-2016-06-27
+2016-07-11
@@ -117,16 +117,16 @@ The Wrong Way
'y',TRUE,
rareCount=0 # Note: usually want rareCount>0, setting to zero to illustrate problem
)
## [1] "desigining treatments Mon Jun 27 08:57:29 2016"
-## [1] "design var xBad1 Mon Jun 27 08:57:29 2016"
-## [1] "design var xBad2 Mon Jun 27 08:57:29 2016"
-## [1] "design var xBad3 Mon Jun 27 08:57:29 2016"
-## [1] "design var xGood1 Mon Jun 27 08:57:29 2016"
-## [1] "design var xGood2 Mon Jun 27 08:57:29 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:29 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:29 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:29 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:29 2016"
+## [1] "desigining treatments Mon Jul 11 10:06:52 2016"
+## [1] "design var xBad1 Mon Jul 11 10:06:52 2016"
+## [1] "design var xBad2 Mon Jul 11 10:06:52 2016"
+## [1] "design var xBad3 Mon Jul 11 10:06:52 2016"
+## [1] "design var xGood1 Mon Jul 11 10:06:52 2016"
+## [1] "design var xGood2 Mon Jul 11 10:06:52 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:52 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:52 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:52 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:52 2016"
dTrainTreated <- vtreat::prepare(treatments,dTrain,
pruneSig=c() # Note: usually want pruneSig to be a small fraction, setting to null to illustrate problems
)
@@ -195,16 +195,16 @@ The Right Way: A Calibration Set
'y',TRUE,
rareCount=0 # Note: usually want rareCount>0, setting to zero to illustrate problem
)
## [1] "desigining treatments Mon Jun 27 08:57:29 2016"
-## [1] "design var xBad1 Mon Jun 27 08:57:29 2016"
-## [1] "design var xBad2 Mon Jun 27 08:57:29 2016"
-## [1] "design var xBad3 Mon Jun 27 08:57:29 2016"
-## [1] "design var xGood1 Mon Jun 27 08:57:30 2016"
-## [1] "design var xGood2 Mon Jun 27 08:57:30 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:30 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:30 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:30 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:30 2016"
+## [1] "desigining treatments Mon Jul 11 10:06:52 2016"
+## [1] "design var xBad1 Mon Jul 11 10:06:52 2016"
+## [1] "design var xBad2 Mon Jul 11 10:06:52 2016"
+## [1] "design var xBad3 Mon Jul 11 10:06:52 2016"
+## [1] "design var xGood1 Mon Jul 11 10:06:53 2016"
+## [1] "design var xGood2 Mon Jul 11 10:06:53 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:53 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:53 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:53 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:53 2016"
dTrainTreated <- vtreat::prepare(treatments,dTrain,
pruneSig=pruneSig)
newvars <- setdiff(colnames(dTrainTreated),'y')
diff --git a/inst/doc/vtreatGrouping.R b/inst/doc/vtreatGrouping.R
index ad805739..30a6745a 100644
--- a/inst/doc/vtreatGrouping.R
+++ b/inst/doc/vtreatGrouping.R
@@ -2,6 +2,9 @@
knitr::opts_chunk$set(fig.width = 7)
## ----echo=FALSE, message=FALSE, warning=FALSE----------------------------
+library(vtreat)
+set.seed(23255)
+
have_ggplot = requireNamespace("ggplot2", quietly=TRUE)
have_dplyr = requireNamespace("dplyr", quietly=TRUE)
if(have_ggplot) {
@@ -11,11 +14,7 @@ if(have_dplyr) {
library(dplyr)
}
-
-library(vtreat)
-set.seed(23255)
-
-## ----functions, echo=FALSE-----------------------------------------------
+## ----echo=FALSE, message=FALSE, warning=FALSE----------------------------
#
# takes the frame (d) and the outcome column (d$conc)
# from the global environment
@@ -49,8 +48,6 @@ showGroupingBehavior = function(groupcol, title) {
}
}
-
-
## ----data----------------------------------------------------------------
# panel data for concentration in multiple subjects
d <- datasets::Theoph
diff --git a/inst/doc/vtreatGrouping.Rmd b/inst/doc/vtreatGrouping.Rmd
index 2227b3bd..bcf4afa7 100644
--- a/inst/doc/vtreatGrouping.Rmd
+++ b/inst/doc/vtreatGrouping.Rmd
@@ -14,6 +14,9 @@ knitr::opts_chunk$set(fig.width = 7)
```
```{r echo=FALSE, message=FALSE, warning=FALSE}
+library(vtreat)
+set.seed(23255)
+
have_ggplot = requireNamespace("ggplot2", quietly=TRUE)
have_dplyr = requireNamespace("dplyr", quietly=TRUE)
if(have_ggplot) {
@@ -22,15 +25,9 @@ if(have_ggplot) {
if(have_dplyr) {
library(dplyr)
}
-
-
-library(vtreat)
-set.seed(23255)
```
-This vignette shows an example use of _y_-stratified sampling with a grouping restriction in `vtreat`.
-
-```{r functions, echo=FALSE}
+```{r echo=FALSE, message=FALSE, warning=FALSE}
#
# takes the frame (d) and the outcome column (d$conc)
# from the global environment
@@ -63,10 +60,10 @@ showGroupingBehavior = function(groupcol, title) {
print(plt)
}
}
-
-
```
+This vignette shows an example use of _y_-stratified sampling with a grouping restriction in `vtreat`.
+
For this example, we will use the `Theosph` dataset: data from an experiment on the pharmacokinetics of theophylline. We will demonstrate the desired effects of _y_-stratification while also respecting a grouping constraint.
## The Data
diff --git a/inst/doc/vtreatGrouping.html b/inst/doc/vtreatGrouping.html
index 0c48056d..6a3b4d82 100644
--- a/inst/doc/vtreatGrouping.html
+++ b/inst/doc/vtreatGrouping.html
@@ -12,7 +12,7 @@
-
+
Grouping Example
@@ -70,7 +70,7 @@
Grouping Example
Nina Zumel, Nate Sutton
-2016-06-27
+2016-07-11
@@ -150,18 +150,18 @@ Partitioning the Data for Modeling
print(table(Subject=d$Subject, groupid=d$stratSplit))
## groupid
## Subject 1 2 3
-## 6 4 4 3
-## 7 4 6 1
-## 8 3 4 4
-## 11 1 5 5
-## 3 3 2 6
-## 2 5 2 4
-## 4 3 3 5
-## 9 5 4 2
-## 12 3 4 4
-## 10 6 2 3
-## 1 2 5 4
-## 5 5 3 3
+## 6 6 3 2
+## 7 6 2 3
+## 8 3 6 2
+## 11 2 3 6
+## 3 4 4 3
+## 2 3 4 4
+## 4 3 4 4
+## 9 3 5 3
+## 12 3 2 6
+## 10 5 3 3
+## 1 3 4 4
+## 5 3 4 4
We can see this partition didn’t preserve the Subject
grouping.
Finally, we can try vtreat
’s group-preserving split, which also tries to y-stratify as much as possible (by stratifying on the mean y observation from each group).
# stratify by patient and outcome
@@ -175,15 +175,15 @@ Partitioning the Data for Modeling
print(table(Subject=d$Subject, groupid=d$subjectSplit))
## groupid
## Subject 1 2 3
-## 6 11 0 0
-## 7 0 0 11
-## 8 0 11 0
+## 6 0 11 0
+## 7 11 0 0
+## 8 0 0 11
## 11 0 11 0
-## 3 11 0 0
-## 2 0 0 11
-## 4 0 11 0
-## 9 11 0 0
-## 12 0 0 11
+## 3 0 0 11
+## 2 11 0 0
+## 4 11 0 0
+## 9 0 0 11
+## 12 0 11 0
## 10 11 0 0
## 1 0 11 0
## 5 0 0 11
@@ -203,9 +203,9 @@ ## [1] "Group by patient, stratify on y"
## [1] "Group means:"
## 1 2 3
-## 4.859091 5.040455 4.981818
-## [1] "Standard deviation of group means: 0.0925499641694611"
-## [1] "desigining treatments Mon Jun 27 08:57:33 2016"
-## [1] "design var x Mon Jun 27 08:57:33 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:33 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:33 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:33 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:33 2016"
+## [1] "desigining treatments Mon Jul 11 10:06:56 2016"
+## [1] "design var x Mon Jul 11 10:06:56 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:56 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:56 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:56 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:56 2016"
dTrainTreated <- vtreat::prepare(treatments,dTrain,
pruneSig=c() # Note: usually want pruneSig to be a small fraction, setting to null to illustrate problem
)
@@ -193,12 +193,12 @@ Correct Practice: Use different data to treat and train
rareCount=0, # Note set this to something larger, like 5
rareSig=c() # Note set this to something like 0.3
)
## [1] "desigining treatments Mon Jun 27 08:57:33 2016"
-## [1] "design var x Mon Jun 27 08:57:33 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:34 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:34 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:34 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:34 2016"
+## [1] "desigining treatments Mon Jul 11 10:06:56 2016"
+## [1] "design var x Mon Jul 11 10:06:56 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:56 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:56 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:56 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:56 2016"
dTrainTreated <- vtreat::prepare(treatments,dTrain,
pruneSig=c() # Note: set this to filter, like 0.05 or 1/nvars
)
diff --git a/inst/doc/vtreatScaleMode.html b/inst/doc/vtreatScaleMode.html
index e3207bb9..b7439c33 100644
--- a/inst/doc/vtreatScaleMode.html
+++ b/inst/doc/vtreatScaleMode.html
@@ -12,7 +12,7 @@
-
+
vtreat scale mode
@@ -70,7 +70,7 @@
vtreat scale mode
Win-Vector LLC
-2016-06-27
+2016-07-11
diff --git a/inst/doc/vtreatSignificance.html b/inst/doc/vtreatSignificance.html
index 6d59e22a..26c8d57c 100644
--- a/inst/doc/vtreatSignificance.html
+++ b/inst/doc/vtreatSignificance.html
@@ -12,7 +12,7 @@
-
+
vtreat significance
@@ -70,7 +70,7 @@
vtreat significance
John Mount, Nina Zumel
-2016-06-27
+2016-07-11
@@ -119,13 +119,13 @@ 2016-06-27
## 2 FALSE lev002 lev002F
## 252 FALSE lev002 lev002F
treatmentsC <- vtreat::designTreatmentsC(d,c('catVarNoise','catVarPerfect'),'y',TRUE)
## [1] "desigining treatments Mon Jun 27 08:57:35 2016"
-## [1] "design var catVarNoise Mon Jun 27 08:57:35 2016"
-## [1] "design var catVarPerfect Mon Jun 27 08:57:35 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:35 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:35 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:35 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:35 2016"
+## [1] "desigining treatments Mon Jul 11 10:06:57 2016"
+## [1] "design var catVarNoise Mon Jul 11 10:06:57 2016"
+## [1] "design var catVarPerfect Mon Jul 11 10:06:57 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:57 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:57 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:57 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:58 2016"
# Estimate effect significance (not coeficient significance).
estSigGLM <- function(xVar,yVar,numberOfHiddenDegrees=0) {
d <- data.frame(x=xVar,y=yVar,stringsAsFactors = FALSE)
diff --git a/inst/doc/vtreatSplitting.R b/inst/doc/vtreatSplitting.R
index cb7e71c5..dfe241e1 100644
--- a/inst/doc/vtreatSplitting.R
+++ b/inst/doc/vtreatSplitting.R
@@ -5,26 +5,10 @@ knitr::opts_chunk$set(fig.width = 7)
## ------------------------------------------------------------------------
vtreat::oneWayHoldout(3,NULL,NULL,NULL)
-## ------------------------------------------------------------------------
-splitFn <- function(nRows,nSplits,dframe,y) {
- if(requireNamespace("caret",quietly=TRUE)) {
- fullSeq <- seq_len(nRows)
- part <- caret::createFolds(y=y,k=nSplits)
- lapply(part,
- function(appi) {
- list(train=setdiff(fullSeq,appi),app=appi)
- })
- } else {
- NULL # fall back to vtreat implementation
- }
-}
-
-## ------------------------------------------------------------------------
-vtreat::buildEvalSets(25,y=1:25,splitFunction=splitFn)
-
## ----warning=FALSE-------------------------------------------------------
library('vtreat')
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+haveGGPlot2 <- requireNamespace("ggplot2",quietly=TRUE)
+if(haveGGPlot2) {
library('ggplot2')
}
@@ -58,7 +42,7 @@ d$simpleGroup <- vtreat::getSplitPlanAppLabels(nrow(d),pSimple)
tapply(d$y,d$simpleGroup,mean)
# standard error of mean(y)
sd(tapply(d$y,d$simpleGroup,mean))
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+if(haveGGPlot2) {
# plot the distribution of y in each fold
ggplot(data=d,aes(x=y,color=as.factor(simpleGroup))) +
geom_density() + ggtitle('simple (unstratified) grouping')
@@ -67,7 +51,7 @@ if(requireNamespace("ggplot2",quietly=TRUE)) {
tapply(d$y,d$stratGroup,mean)
# standard error of mean(y)
sd(tapply(d$y,d$stratGroup,mean))
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+if(haveGGPlot2) {
# plot the distribution of y in each fold
ggplot(data=d,aes(x=y,color=as.factor(stratGroup))) +
geom_density() + ggtitle('y-stratified grouping')
diff --git a/inst/doc/vtreatSplitting.Rmd b/inst/doc/vtreatSplitting.Rmd
index 2d5dac42..8358e904 100644
--- a/inst/doc/vtreatSplitting.Rmd
+++ b/inst/doc/vtreatSplitting.Rmd
@@ -87,30 +87,7 @@ As we can see `vtreat::oneWayHoldout` builds three split sets where in each set
The function `buildEvalSets` takes one of the above splitting functions as input and returns a cross-validation plan that instantiates the desired splitting, while also guarding against corner cases. You can also explicitly specify the splitting plan when designing a vtreat variable treatment plan using `designTreatments[N\C]` or `mkCrossFrame[N\C]Experiment`.
-For issues beyond stratification the user may want to supply their own splitting plan. For example to wrap [`caret::createFolds`](http://topepo.github.io/caret/index.html) as a splitting function we would write the following function definition.
-
-```{r}
-splitFn <- function(nRows,nSplits,dframe,y) {
- if(requireNamespace("caret",quietly=TRUE)) {
- fullSeq <- seq_len(nRows)
- part <- caret::createFolds(y=y,k=nSplits)
- lapply(part,
- function(appi) {
- list(train=setdiff(fullSeq,appi),app=appi)
- })
- } else {
- NULL # fall back to vtreat implementation
- }
-}
-```
-
-This function can then be passed into any `vtreat` operation that takes a `splitFunction` argument (such as `mkCrossFrameNExperiment`, `designTreatmentsN`, and many more). For example we can pass the user defined `splitFn` into `vtreat::buildEvalSets` as follows:
-
-```{r}
-vtreat::buildEvalSets(25,y=1:25,splitFunction=splitFn)
-```
-
-As stated above, the vtreat library code will try to use the user function for splitting, but will fall back to an appropriate vtreat function in corner cases that the user function may not handle (for example, too few rows, too few groups, and so on). Thus the user code can assume it is in a reasonable situation (and even safely return NULL if it can’t deal with the situation it is given).
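+For issues beyond stratification the user may want to supply their own splitting plan, written as a function of the form `function(nRows,nSplits,dframe,y)` that returns a list of splits, each a list with `train` and `app` elements holding row indices. Such a function can then be passed into any `vtreat` operation that takes a `splitFunction` argument (such as `mkCrossFrameNExperiment`, `designTreatmentsN`, and many more). For example, a user-defined function named `splitFn` can be passed into `vtreat::buildEvalSets` by calling `vtreat::buildEvalSets(25,y=1:25,splitFunction=splitFn)`.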
The file [outOfSample.R](https://github.com/WinVector/vtreat/blob/master/R/outOfSample.R) is full of worked examples. In particular we would suggest running the code displayed when you type any of:
@@ -124,10 +101,12 @@ For example from `help(kWayStratifiedY)` we can see that the distribution of `y`
```{r warning=FALSE}
library('vtreat')
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+haveGGPlot2 <- requireNamespace("ggplot2",quietly=TRUE)
+if(haveGGPlot2) {
library('ggplot2')
}
```
+
```{r}
set.seed(23255)
d <- data.frame(y=sin(1:100))
@@ -158,7 +137,7 @@ d$simpleGroup <- vtreat::getSplitPlanAppLabels(nrow(d),pSimple)
tapply(d$y,d$simpleGroup,mean)
# standard error of mean(y)
sd(tapply(d$y,d$simpleGroup,mean))
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+if(haveGGPlot2) {
# plot the distribution of y in each fold
ggplot(data=d,aes(x=y,color=as.factor(simpleGroup))) +
geom_density() + ggtitle('simple (unstratified) grouping')
@@ -167,7 +146,7 @@ if(requireNamespace("ggplot2",quietly=TRUE)) {
tapply(d$y,d$stratGroup,mean)
# standard error of mean(y)
sd(tapply(d$y,d$stratGroup,mean))
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+if(haveGGPlot2) {
# plot the distribution of y in each fold
ggplot(data=d,aes(x=y,color=as.factor(stratGroup))) +
geom_density() + ggtitle('y-stratified grouping')
diff --git a/inst/doc/vtreatSplitting.html b/inst/doc/vtreatSplitting.html
index 4eaa5bba..1f1d10af 100644
--- a/inst/doc/vtreatSplitting.html
+++ b/inst/doc/vtreatSplitting.html
@@ -12,7 +12,7 @@
-
+
vtreat splitting
@@ -70,7 +70,7 @@
vtreat splitting
John Mount, Nina Zumel
-2016-06-27
+2016-07-11
@@ -157,48 +157,7 @@ Examples
makekWayCrossValidationGroupedByColumn
: k-way y-stratified cross-validation that preserves grouping (for example, all rows corresponding to a single customer or patient, etc). This is a complex splitting plan, and only recommended when absolutely needed.
The function buildEvalSets
takes one of the above splitting functions as input and returns a cross-validation plan that instantiates the desired splitting, while also guarding against corner cases. You can also explicitly specify the splitting plan when designing a vtreat variable treatment plan using designTreatments[N\C]
or mkCrossFrame[N\C]Experiment
.
-For issues beyond stratification the user may want to supply their own splitting plan. For example to wrap caret::createFolds
as a splitting function we would write the following function definition.
-splitFn <- function(nRows,nSplits,dframe,y) {
- if(requireNamespace("caret",quietly=TRUE)) {
- fullSeq <- seq_len(nRows)
- part <- caret::createFolds(y=y,k=nSplits)
- lapply(part,
- function(appi) {
- list(train=setdiff(fullSeq,appi),app=appi)
- })
- } else {
- NULL # fall back to vtreat implementation
- }
-}
-This function can then be passed into any vtreat
operation that takes a splitFunction
argument (such as mkCrossFrameNExperiment
, designTreatmentsN
, and many more). For example we can pass the user defined splitFn
into vtreat::buildEvalSets
as follows:
-vtreat::buildEvalSets(25,y=1:25,splitFunction=splitFn)
-## $Fold1
-## $Fold1$train
-## [1] 1 3 4 6 7 10 11 12 13 15 16 17 18 20 23 24 25
-##
-## $Fold1$app
-## [1] 2 5 8 9 14 19 21 22
-##
-##
-## $Fold2
-## $Fold2$train
-## [1] 2 5 6 7 8 9 10 13 14 17 18 19 20 21 22 24
-##
-## $Fold2$app
-## [1] 1 3 4 11 12 15 16 23 25
-##
-##
-## $Fold3
-## $Fold3$train
-## [1] 1 2 3 4 5 8 9 11 12 14 15 16 19 21 22 23 25
-##
-## $Fold3$app
-## [1] 6 7 10 13 17 18 20 24
-##
-##
-## attr(,"splitmethod")
-## [1] "userfunction"
-As stated above, the vtreat library code will try to use the user function for splitting, but will fall back to an appropriate vtreat function in corner cases that the user function may not handle (for example, too few rows, too few groups, and so on). Thus the user code can assume it is in a reasonable situation (and even safely return NULL if it can’t deal with the situation it is given).
+For issues beyond stratification the user may want to supply their own splitting plan. Such a function can then be passed into any vtreat
operation that takes a splitFunction
argument (such as mkCrossFrameNExperiment
, designTreatmentsN
, and many more). For example we can pass the user defined splitFn
into vtreat::buildEvalSets
as follows:
The file outOfSample.R is full of worked examples. In particular we would suggest running the code displayed when you type any of:
help(oneWayHoldout)
@@ -208,7 +167,8 @@ Examples
For example from help(kWayStratifiedY)
we can see that the distribution of y
is much more similar in each fold when we stratify than when we don’t:
library('vtreat')
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+haveGGPlot2 <- requireNamespace("ggplot2",quietly=TRUE)
+if(haveGGPlot2) {
library('ggplot2')
}
set.seed(23255)
@@ -245,7 +205,7 @@ Examples
# standard error of mean(y)
sd(tapply(d$y,d$simpleGroup,mean))
## [1] 0.1019753
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+if(haveGGPlot2) {
# plot the distribution of y in each fold
ggplot(data=d,aes(x=y,color=as.factor(simpleGroup))) +
geom_density() + ggtitle('simple (unstratified) grouping')
@@ -258,7 +218,7 @@ Examples
# standard error of mean(y)
sd(tapply(d$y,d$stratGroup,mean))
## [1] 0.01141606
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+if(haveGGPlot2) {
# plot the distribution of y in each fold
ggplot(data=d,aes(x=y,color=as.factor(stratGroup))) +
geom_density() + ggtitle('y-stratified grouping')
diff --git a/inst/doc/vtreatVariableTypes.html b/inst/doc/vtreatVariableTypes.html
index e305307e..70fa2ca3 100644
--- a/inst/doc/vtreatVariableTypes.html
+++ b/inst/doc/vtreatVariableTypes.html
@@ -12,7 +12,7 @@
-
+
Variable Types
@@ -70,7 +70,7 @@
Variable Types
Win-Vector LLC
-2016-06-27
+2016-07-11
@@ -96,13 +96,13 @@ When the target to predict is categorical
z=c(1,2,3,4,NA,6),y=c(FALSE,FALSE,TRUE,FALSE,TRUE,TRUE),
stringsAsFactors = FALSE)
treatmentsC <- designTreatmentsC(dTrainC,colnames(dTrainC),'y',TRUE)
-## [1] "desigining treatments Mon Jun 27 08:57:39 2016"
-## [1] "design var x Mon Jun 27 08:57:39 2016"
-## [1] "design var z Mon Jun 27 08:57:39 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:39 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:39 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:39 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:39 2016"
+## [1] "desigining treatments Mon Jul 11 10:06:59 2016"
+## [1] "design var x Mon Jul 11 10:06:59 2016"
+## [1] "design var z Mon Jul 11 10:06:59 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:59 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:59 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:59 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:59 2016"
print(treatmentsC$scoreFrame[,c('origName','varName','code','varMoves','sig')])
## origName varName code varMoves sig
## 1 x x_lev_NA lev TRUE 0.20766228
@@ -142,13 +142,13 @@ When the target to predict is numeric
z=c(1,2,3,4,NA,6),y=as.numeric(c(FALSE,FALSE,TRUE,FALSE,TRUE,TRUE)),
stringsAsFactors = FALSE)
treatmentsN <- designTreatmentsN(dTrainN,colnames(dTrainN),'y')
-## [1] "desigining treatments Mon Jun 27 08:57:39 2016"
-## [1] "design var x Mon Jun 27 08:57:39 2016"
-## [1] "design var z Mon Jun 27 08:57:39 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:39 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:39 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:39 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:39 2016"
+## [1] "desigining treatments Mon Jul 11 10:06:59 2016"
+## [1] "design var x Mon Jul 11 10:06:59 2016"
+## [1] "design var z Mon Jul 11 10:06:59 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:59 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:59 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:59 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:59 2016"
print(treatmentsN$scoreFrame[,c('origName','varName','code','varMoves','sig')])
## origName varName code varMoves sig
## 1 x x_lev_NA lev TRUE 0.3739010
@@ -178,11 +178,11 @@ When there is no supplied target to predict
z=c(1,2,3,4,NA,6),
stringsAsFactors = FALSE)
treatmentsZ <- designTreatmentsZ(dTrainZ,colnames(dTrainZ))
-## [1] "desigining treatments Mon Jun 27 08:57:39 2016"
-## [1] "design var x Mon Jun 27 08:57:39 2016"
-## [1] "design var z Mon Jun 27 08:57:39 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:39 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:39 2016"
+## [1] "desigining treatments Mon Jul 11 10:06:59 2016"
+## [1] "design var x Mon Jul 11 10:06:59 2016"
+## [1] "design var z Mon Jul 11 10:06:59 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:59 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:59 2016"
print(treatmentsZ$scoreFrame[,c('origName','varName','code','varMoves')])
## origName varName code varMoves
## 1 x x_catP catP TRUE
@@ -213,13 +213,13 @@ Overall
z=c(1,2,3,4,NA,6),y=as.numeric(c(FALSE,FALSE,TRUE,FALSE,TRUE,TRUE)),
stringsAsFactors = FALSE)
treatmentsN <- designTreatmentsN(dTrainN,colnames(dTrainN),'y')
-## [1] "desigining treatments Mon Jun 27 08:57:39 2016"
-## [1] "design var x Mon Jun 27 08:57:39 2016"
-## [1] "design var z Mon Jun 27 08:57:39 2016"
-## [1] "scoring treatments Mon Jun 27 08:57:39 2016"
-## [1] "have treatment plan Mon Jun 27 08:57:39 2016"
-## [1] "rescoring complex variables Mon Jun 27 08:57:39 2016"
-## [1] "done rescoring complex variables Mon Jun 27 08:57:39 2016"
+## [1] "desigining treatments Mon Jul 11 10:06:59 2016"
+## [1] "design var x Mon Jul 11 10:06:59 2016"
+## [1] "design var z Mon Jul 11 10:06:59 2016"
+## [1] "scoring treatments Mon Jul 11 10:06:59 2016"
+## [1] "have treatment plan Mon Jul 11 10:06:59 2016"
+## [1] "rescoring complex variables Mon Jul 11 10:06:59 2016"
+## [1] "done rescoring complex variables Mon Jul 11 10:06:59 2016"
print(treatmentsN$scoreFrame[,c('origName','varName','code','varMoves','sig')])
## origName varName code varMoves sig
## 1 x x_lev_NA lev TRUE 0.3739010
diff --git a/tests/testthat/testBO.R b/tests/testthat/testBO.R
index e8f25198..9a4ee687 100644
--- a/tests/testthat/testBO.R
+++ b/tests/testthat/testBO.R
@@ -91,8 +91,6 @@ test_that("testBO: Works As Expected", {
dTestNTreated <- prepare(treatmentsN,dTest,pruneSig=0.99,scale=scale)
dTestNTreated$pred <- predict(modelN,newdata=dTestNTreated)
if(verbose) {
- print(ggplot(data=dTestNTreated,aes(x=pred,y=yN)) + geom_point() +
- geom_smooth())
print(summary(modelN))
}
@@ -120,7 +118,6 @@ test_that("testBO: Works As Expected", {
dTestCTreated <- prepare(treatmentsC,dTest,pruneSig=0.99,scale=scale)
dTestCTreated$pred <- predict(modelC,newdata=dTestCTreated,type='response')
if(verbose) {
- print(ggplot(data=dTestCTreated) + geom_density(aes(x=pred,color=yC)))
print(summary(modelC))
}
}
diff --git a/tests/testthat/testExpmtDesign.R b/tests/testthat/testExpmtDesign.R
index 853c2415..0a569472 100644
--- a/tests/testthat/testExpmtDesign.R
+++ b/tests/testthat/testExpmtDesign.R
@@ -89,37 +89,3 @@ test_that("testExpmtDesign: makekWayCrossValidationOrderedByColumn", {
}
})
-
-
-test_that("testExpmtDesign: cross frame design caret", {
- if(requireNamespace("caret",quietly=TRUE)) {
- set.seed(2325235)
- splitFn <- function(nRows,nSplits,dframe,y) {
- fullSeq <- seq_len(nRows)
- part <- caret::createFolds(y=y,k=nSplits)
- lapply(part,
- function(appi) {
- list(train=setdiff(fullSeq,appi),app=appi)
- })
- }
- nrowd = 200
- y <- rnorm(nrowd)
- eSets <- buildEvalSets(nrowd,y=y,
- splitFunction=splitFn)
- expect_true(attr(eSets,'splitmethod')=='userfunction')
- fullSeq <- seq_len(nrowd)
- expect_true(length(eSets)>0)
- for(ei in eSets) {
- expect_true(length(ei$train)>0)
- expect_true(length(ei$app)>0)
- expect_true(all(ei$train %in% fullSeq))
- expect_true(all(ei$app %in% fullSeq))
- }
- apps <- Reduce(c,lapply(eSets,function(ei) ei$app))
- expect_true(length(apps)==nrowd)
- expect_true(length(unique(apps))==nrowd)
- problem <- problemAppPlan(nrowd,3,eSets,TRUE)
- expect_true(is.null(problem))
- }
-})
-
diff --git a/vignettes/vtreatGrouping.Rmd b/vignettes/vtreatGrouping.Rmd
index 2227b3bd..bcf4afa7 100644
--- a/vignettes/vtreatGrouping.Rmd
+++ b/vignettes/vtreatGrouping.Rmd
@@ -14,6 +14,9 @@ knitr::opts_chunk$set(fig.width = 7)
```
```{r echo=FALSE, message=FALSE, warning=FALSE}
+library(vtreat)
+set.seed(23255)
+
have_ggplot = requireNamespace("ggplot2", quietly=TRUE)
have_dplyr = requireNamespace("dplyr", quietly=TRUE)
if(have_ggplot) {
@@ -22,15 +25,9 @@ if(have_ggplot) {
if(have_dplyr) {
library(dplyr)
}
-
-
-library(vtreat)
-set.seed(23255)
```
-This vignette shows an example use of _y_-stratified sampling with a grouping restriction in `vtreat`.
-
-```{r functions, echo=FALSE}
+```{r echo=FALSE, message=FALSE, warning=FALSE}
#
# takes the frame (d) and the outcome column (d$conc)
# from the global environment
@@ -63,10 +60,10 @@ showGroupingBehavior = function(groupcol, title) {
print(plt)
}
}
-
-
```
+This vignette shows an example use of _y_-stratified sampling with a grouping restriction in `vtreat`.
+
For this example, we will use the `Theosph` dataset: data from an experiment on the pharmacokinetics of theophylline. We will demonstrate the desired effects of _y_-stratification while also respecting a grouping constraint.
## The Data
diff --git a/vignettes/vtreatSplitting.Rmd b/vignettes/vtreatSplitting.Rmd
index 2d5dac42..8358e904 100644
--- a/vignettes/vtreatSplitting.Rmd
+++ b/vignettes/vtreatSplitting.Rmd
@@ -87,30 +87,7 @@ As we can see `vtreat::oneWayHoldout` builds three split sets where in each set
The function `buildEvalSets` takes one of the above splitting functions as input and returns a cross-validation plan that instantiates the desired splitting, while also guarding against corner cases. You can also explicitly specify the splitting plan when designing a vtreat variable treatment plan using `designTreatments[N\C]` or `mkCrossFrame[N\C]Experiment`.
-For issues beyond stratification the user may want to supply their own splitting plan. For example to wrap [`caret::createFolds`](http://topepo.github.io/caret/index.html) as a splitting function we would write the following function definition.
-
-```{r}
-splitFn <- function(nRows,nSplits,dframe,y) {
- if(requireNamespace("caret",quietly=TRUE)) {
- fullSeq <- seq_len(nRows)
- part <- caret::createFolds(y=y,k=nSplits)
- lapply(part,
- function(appi) {
- list(train=setdiff(fullSeq,appi),app=appi)
- })
- } else {
- NULL # fall back to vtreat implementation
- }
-}
-```
-
-This function can then be passed into any `vtreat` operation that takes a `splitFunction` argument (such as `mkCrossFrameNExperiment`, `designTreatmentsN`, and many more). For example we can pass the user defined `splitFn` into `vtreat::buildEvalSets` as follows:
-
-```{r}
-vtreat::buildEvalSets(25,y=1:25,splitFunction=splitFn)
-```
-
-As stated above, the vtreat library code will try to use the user function for splitting, but will fall back to an appropriate vtreat function in corner cases that the user function may not handle (for example, too few rows, too few groups, and so on). Thus the user code can assume it is in a reasonable situation (and even safely return NULL if it can’t deal with the situation it is given).
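+For issues beyond stratification the user may want to supply their own splitting plan, written as a function of the form `function(nRows,nSplits,dframe,y)` that returns a list of splits, each a list with `train` and `app` elements holding row indices. Such a function can then be passed into any `vtreat` operation that takes a `splitFunction` argument (such as `mkCrossFrameNExperiment`, `designTreatmentsN`, and many more). For example, a user-defined function named `splitFn` can be passed into `vtreat::buildEvalSets` by calling `vtreat::buildEvalSets(25,y=1:25,splitFunction=splitFn)`.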
The file [outOfSample.R](https://github.com/WinVector/vtreat/blob/master/R/outOfSample.R) is full of worked examples. In particular we would suggest running the code displayed when you type any of:
@@ -124,10 +101,12 @@ For example from `help(kWayStratifiedY)` we can see that the distribution of `y`
```{r warning=FALSE}
library('vtreat')
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+haveGGPlot2 <- requireNamespace("ggplot2",quietly=TRUE)
+if(haveGGPlot2) {
library('ggplot2')
}
```
+
```{r}
set.seed(23255)
d <- data.frame(y=sin(1:100))
@@ -158,7 +137,7 @@ d$simpleGroup <- vtreat::getSplitPlanAppLabels(nrow(d),pSimple)
tapply(d$y,d$simpleGroup,mean)
# standard error of mean(y)
sd(tapply(d$y,d$simpleGroup,mean))
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+if(haveGGPlot2) {
# plot the distribution of y in each fold
ggplot(data=d,aes(x=y,color=as.factor(simpleGroup))) +
geom_density() + ggtitle('simple (unstratified) grouping')
@@ -167,7 +146,7 @@ if(requireNamespace("ggplot2",quietly=TRUE)) {
tapply(d$y,d$stratGroup,mean)
# standard error of mean(y)
sd(tapply(d$y,d$stratGroup,mean))
-if(requireNamespace("ggplot2",quietly=TRUE)) {
+if(haveGGPlot2) {
# plot the distribution of y in each fold
ggplot(data=d,aes(x=y,color=as.factor(stratGroup))) +
geom_density() + ggtitle('y-stratified grouping')