diff --git a/CRAN_Release.cmd b/CRAN_Release.cmd index 598dcd9ab..ecc5f79d4 100644 --- a/CRAN_Release.cmd +++ b/CRAN_Release.cmd @@ -218,7 +218,7 @@ sudo apt-get -y build-dep r-cran-cairodevice sudo apt-get -y build-dep r-cran-tkrplot sudo apt-get -y install libcurl4-openssl-dev # needed by devtools sudo apt-get -y install xorg-dev x11-common libgdal-dev libproj-dev mysql-client libcairo2-dev libglpk-dev -sudo apt-get -y install texlive texlive-latex-extra texlive-bibtex-extra texinfo fonts-inconsolata latex-xcolor +sudo apt-get -y install texlive texlive-latex-extra texlive-bibtex-extra texlive-science texinfo fonts-inconsolata latex-xcolor sudo apt-get -y install libv8-dev sudo apt-get -y install gsl-bin libgsl0-dev sudo apt-get -y install libgtk2.0-dev netcdf-bin @@ -234,23 +234,24 @@ sudo apt-get -y install libfftw3-dev # for package fftwtools sudo apt-get -y install libgsl-dev sudo apt-get -y install octave liboctave-dev sudo apt-get -y install jags +sudo apt-get -y install libmpfr-dev +sudo apt-get -y install bwidget sudo R CMD javareconf # ENDIF -sudo R -update.packages(ask=FALSE) -q('no') - cd ~/build/revdeplib/ export R_LIBS=~/build/revdeplib/ -R CMD INSTALL ~/data.table_1.9.9.tar.gz # ** ensure latest version installed into revdeplib ** +export R_LIBS_SITE=none R +.libPaths() # should be just 2 items: revdeplib and the base R package library +update.packages(ask=FALSE) # Follow: https://bioconductor.org/install/#troubleshoot-biocinstaller # Ensure no library() call in .Rprofile, such as library(bit64) source("http://bioconductor.org/biocLite.R") biocLite() -biocLite("BiocUpgrade") +# biocLite("BiocUpgrade") +# This error means it's up to date: "Bioconductor version 3.4 cannot be upgraded with R version 3.3.2" avail = available.packages(repos=c(getOption("repos"), BiocInstaller::biocinstallRepos()[["BioCsoft"]])) deps = tools::package_dependencies("data.table", db=avail, which="most", reverse=TRUE, recursive=FALSE)[[1]] @@ -263,7 +264,7 @@ for (p in deps) { system(paste0("rm -f ", p, "*.tar.gz")) # Remove previous *.tar.gz. -f to be silent if not there (i.e. first time seeing this package) system(paste0("rm -rf ", p, ".Rcheck")) # Remove last check (of previous version) if (!length(grep("bioc",avail[p,"Repository"]))) { - install.packages(p) # To install its dependencies really. + install.packages(p, dependencies=TRUE) # To install its dependencies really. } else { biocLite(p,suppressUpdates=TRUE) # To install its dependencies really. } @@ -282,15 +283,22 @@ table(avail[deps,"Repository"]) # To identify and remove the tar.gz no longer needed : for (p in deps) { f = paste0(p, "_", avail[p,"Version"], ".tar.gz") - # system(paste0("mv ",f," ",f,"_TEMP")) + system(paste0("mv ",f," ",f,"_TEMP")) +} +system("rm *.tar.gz") +for (p in deps) { + f = paste0(p, "_", avail[p,"Version"], ".tar.gz") system(paste0("mv ",f,"_TEMP ",f)) } -R CMD INSTALL ~/data.table_1.9.9.tar.gz # ** ensure latest version installed into revdeplib ** +cd ~/build/revdeplib/ +export R_LIBS=~/build/revdeplib/ +export R_LIBS_SITE=none export _R_CHECK_FORCE_SUGGESTS_=false # in my profile so always set +R CMD INSTALL ~/data.table_1.10.1.tar.gz # ** ensure latest version installed into revdeplib ** rm -rf *.Rcheck # delete all previous .Rcheck directories ls -1 *.tar.gz | wc -l # check this equals length(deps) above -time ls -1 *.tar.gz | parallel R CMD check # apx 2 hrs for 291 packages on my 4 cpu laptop with 8 threads +time ls -1 *.tar.gz | parallel R CMD check # apx 2.5 hrs for 313 packages on my 4 cpu laptop with 8 threads status = function(which="both") { if (which=="both") { @@ -339,6 +347,8 @@ R CMD INSTALL ~/data.table_1.9.6.tar.gz # CRAN version to establish if fails a R CMD check .tar.gz ls -1 *.tar.gz | grep -E 'Chicago|dada2|flowWorkspace|LymphoSeq' | parallel R CMD check +# Warning: replacing previous import robustbase::sigma by stats::sigma when loading VIM +# Reinstalling robustbase fixed this warning. Even though it was up to date, reinstalling made a difference. diff --git a/NEWS.md b/NEWS.md index 08bf39b4c..84a4add07 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,13 +3,31 @@ #### NEW FEATURES -1. `indices()` function gain new argument `vectors` default `FALSE`, when `TRUE` provided then list of vectors is returned, single vector refers to single index. Closes #1589. +1. `indices()` function gains new argument `vectors` default `FALSE`, when `TRUE` provided then list of vectors is returned, single vector refers to single index. Closes #1589. + +2. When `j` is a symbol prefixed with `..`, it will be looked up in calling scope and its value taken to be column names or numbers. +```R + myCols = c("colA","colB") + DT[, myCols, with=FALSE] + DT[, ..myCols] # same +``` +When you see the `..` prefix think _one-level-up_ like the directory `..` in all operating systems mean the parent directory, `..` means the parent scope. In future the `..` prefix could be made to work on all symbols apearing anywhere inside `DT[...]`. It is intended to be a convenient way to protect your code from accidentally picking up a column name. Similar to how `x.` and `i.` prefixes can already be used to disambiguate the same column name in `x` and `i`; analogous to SQL table aliases. A symbol prefix rather than a `..()` _function_ will be easier for us to optimize internally and will be more convenient if you have many variables in calling scope that you wish to use in your expressions safely. This feature was first raised in 2012 and long wished for, [#633](https://github.com/Rdatatable/data.table/issues/633). It is experimental. #### BUG FIXES #### NOTES -1. `fwrite()`'s `..turbo` option has been removed as the warning message warned. We don't think there are any problems with `..turbo=TRUE` so there is no need to fall back to `FALSE`. If you've found a problem, please [report it](https://github.com/Rdatatable/data.table/issues). +1. `fwrite()`'s `..turbo` option has been removed as the warning message warned. We aren't aware of any problems with `..turbo=TRUE` so there is no need to fall back to `FALSE`. If you've found a problem, please [report it](https://github.com/Rdatatable/data.table/issues). + +2. No known issues have arisen due to `DT[,1]`, `DT[,c("colA","colB")]` etc now returning columns as in introduced in v1.9.8. However, as we've moved forward by setting `options('datatable.WhenJisSymbolThenCallingScope'=TRUE)` introduced then too, it has become clear a better solution is needed. All 313 CRAN and Bioconductor packages that use data.table have been checked with this option on. 331 lines would need to be changed in 59 packages. Their usage is elegant, correct and recommended, though. Examples are `DT[1, encoding]` in quanteda and `DT[winner=="first", freq]` in xgboost. These are looking up the columns `encoding` and `freq` respectively and returning them as vectors. But if, for some reason, those columns are removed from `DT` but `encoding` or `freq` are still in calling scope, their values in calling scope would be returned. Which cannot be what was intended and could lead to silent bugs. That was the risk we are trying to avoid. + +The new strategy needs no code changes and has no breakage. It was proposed and discussed in point 2 here [here](https://github.com/Rdatatable/data.table/issues/1188#issuecomment-127824969), as follows. + +options(`datatable.WhenJisSymbolThenCallingScope`) is now removed. A migration timeline is no longer needed. + +When `j` is a symbol (as in the quanteda and xgboost examples above) it will continue to be looked up as a column name and returned as a vector, as has always been the case. If it's not a column name however, it is now a helpful error explaining that data.table is different to data.frame and what to do instead (use `..` prefix or `with=FALSE`). The old behaviour of returning the symbol's value in calling scope can never have been useful to anybody and therefore not depended on. Just as the `DT[,1]` change could be made in v1.9.8, this change can be made now. This change increases robustness with no downside. Rerunning all 313 CRAN and Bioconductor package checks reveal 2 packages now throwing this error: partool and simcausal. Their maintainers have been informed that there is a likely bug on those lines due to data.table's weakness. This is exactly what we wanted to reveal and improve. + +3. As before, and as we can see is in common use in CRAN and Bioconductor packages using data.table, `DT[,myCols,with=FALSE]` continues to lookup `myCols` in calling scope and take its value as column names or numbers. You can move to the new experimental convenience feature `DT[, ..myCols]` if you wish at leisure. ### Changes in v1.10.0 (on CRAN 3 Dec 2016) diff --git a/R/data.table.R b/R/data.table.R index 97da35eff..d436622d6 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -399,34 +399,33 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) { ( !length(all.vars(jsub)) && root %chin% c("","c","paste","paste0","-","!") && missing(by) )) { # test 763. TODO: likely that !missing(by) iff with==TRUE (so, with can be removed) - # When no variable names occur in j, scope doesn't matter because there are no symbols to find. - # Auto set with=FALSE in this case so that DT[,1], DT[,2:3], DT[,"someCol"] and DT[,c("colB","colD")] + # When no variable names (i.e. symbols) occur in j, scope doesn't matter because there are no symbols to find. + # Automatically set with=FALSE in this case so that DT[,1], DT[,2:3], DT[,"someCol"] and DT[,c("colB","colD")] # work as expected. As before, a vector will never be returned, but a single column data.table # for type consistency with >1 cases. To return a single vector use DT[["someCol"]] or DT[[3]]. - # The root==":" is to allow DT[,colC:colH] even though that contains two variable names - # root either "-" or "!" is for tests 1504.11 and 1504.13 (a : with a ! or - modifier root) + # The root==":" is to allow DT[,colC:colH] even though that contains two variable names. + # root == "-" or "!" is for tests 1504.11 and 1504.13 (a : with a ! or - modifier root) # This change won't break anything because it didn't do anything anyway; i.e. used to return the - # j value straight back: DT[,1] == 1 which isn't possibly useful. It was that was for consistency - # of learning, since it was simpler to state that j always gets eval'd within the scope of DT. + # j value straight back: DT[,1] == 1 which isn't possibly useful. It was that for consistency + # of learning since it was simpler to state that j always gets eval'd within the scope of DT. # We don't want to evaluate j at all in making this decision because i) evaluating itself could # increment some variable and not intended to be evaluated a 2nd time later on and ii) we don't # want decisions like this to depend on the data or vector lengths since that can introduce # inconistency reminiscent of drop in [.data.table that we seek to avoid. - #if (verbose) cat("Auto with=FALSE because j with=FALSE - } else if (is.name(jsub) && isTRUE(getOption("datatable.WhenJisSymbolThenCallingScope"))) { - # Allow future behaviour to be turned on. Current default is FALSE. - # Use DT[["someCol"]] or DT$someCol to fetch that column as vector, regardless of this option. - if (!missingwith && isTRUE(with)) { - # unusual edge case only when future option has been turned on - stop('j is a single symbol, WhenJisSymbol is turned on but with=TRUE has been passed explicitly. Please instead use DT[,"someVar"], DT[,.(someVar)] or DT[["someVar"]]') - } else { - with=FALSE - } + } else if (is.name(jsub)) { jsubChar = as.character(jsub) - if (!exists(jsubChar, where=parent.frame()) && jsubChar %chin% names(x)) { - # Would fail anyway with 'object 'a' not found' but give a more helpful error. Thanks to Jan's suggestion. - stop("The option 'datatable.WhenJisSymbolThenCallingScope' is TRUE so looking for the variable '", jsubChar, "' in calling scope but it is not found there. It is a column name though. So, most likely, please wrap with quotes (i.e. DT[,'", jsubChar, "']) to return a 1-column data.table or if you need the column as a plain vector then DT[['",jsubChar,"']] or DT$",jsubChar) + if (substring(jsubChar,1,2) == "..") { + if (nchar(jsubChar)==2) stop("The symbol .. is invalid. The .. prefix must be followed by at least one character.") + if (!exists(jsubChar, where=parent.frame())) { + # We have recommended manual ".." prefix in the past so that needs to keep working and take precedence + jsub = as.name(jsubChar<-substring(jsubChar,3)) + } + with = FALSE + } + if (!with && !exists(jsubChar, where=parent.frame())) { + # Would fail anyway with 'object 'a' not found' but give a more helpful error. Thanks to Jan's suggestion. + stop("Variable '",jsubChar,"' is not found in calling scope. Looking in calling scope because either you used the .. prefix or set with=FALSE") } } if (root=="{") { @@ -1434,10 +1433,14 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) { # Since .SD is inside SDenv, alongside its columns as variables, R finds .SD symbol more quickly, if used. # There isn't a copy of the columns here, the xvar symbols point to the SD columns (copy-on-write). - # Temp fix for #921 - check address and copy *after* evaluating 'jval' + if (is.name(jsub) && is.null(lhs) && !exists(jsubChar<-as.character(jsub), SDenv, inherits=FALSE)) { + stop("j (the 2nd argument inside [...]) is a single symbol but column name '",jsubChar,"' is not found. Perhaps you intended DT[,..",jsubChar,"] or DT[,",jsubChar,",with=FALSE]. This difference to data.frame is deliberate and explained in FAQ 1.1.") + } + jval = eval(jsub, SDenv, parent.frame()) # copy 'jval' when required # More speedup - only check + copy if irows is NULL + # Temp fix for #921 - check address and copy *after* evaluating 'jval' if (is.null(irows)) { if (!is.list(jval)) { # performance improvement when i-arg is S4, but not list, #1438, Thanks @DCEmilberg. jcpy = address(jval) %in% sapply(SDenv$.SD, address) # %chin% errors when RHS is list() diff --git a/R/onAttach.R b/R/onAttach.R index ae2d4816b..8ac877df5 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -17,10 +17,11 @@ if (dev && (Sys.Date() - as.Date(d))>28) packageStartupMessage("**********\nThis development version of data.table was built more than 4 weeks ago. Please update.\n**********") if (!.Call(ChasOpenMP)) - packageStartupMessage("**********\nThis installation of data.table has not detected OpenMP support. It will still work but in single-threaded mode.\n**********") + packageStartupMessage("**********\nThis installation of data.table has not detected OpenMP support. It will still work but in single-threaded mode. If this a Mac and you obtained CRAN's Mac binary, CRAN's Mac does not yet support OpenMP. In the meantime please follow our Mac installation instructions on the data.table homepage. If it works and you observe benefits from multiple threads, please convince Simon Ubanek by sending him evidence and ask him to turn on OpenMP support in CRAN's Mac binary. Alternatives are to install Ubuntu on your Mac (which I have done and works well) or use Windows where OpenMP is supported and works well.\n**********") packageStartupMessage(' The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way') packageStartupMessage(' Documentation: ?data.table, example(data.table) and browseVignettes("data.table")') packageStartupMessage(' Release notes, videos and slides: http://r-datatable.com') } } + diff --git a/R/onLoad.R b/R/onLoad.R index 3f7dc0882..e67a4e78f 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -45,9 +45,8 @@ "datatable.fread.datatable"="TRUE", "datatable.fread.dec.experiment"="TRUE", # temp. will remove once stable "datatable.fread.dec.locale"=if (.Platform$OS.type=="unix") "'fr_FR.utf8'" else "'French_France.1252'", - "datatable.prettyprint.char" = NULL, # FR #1091 - "datatable.old.unique.by.key" = "FALSE", # TODO: warn 1 year, remove after 2 years - "datatable.WhenJisSymbolThenCallingScope" = "FALSE" # TODO: warn (asking user to change to DT[,"someCol"] or DT[["someCol"]], then change default, then remove. + "datatable.prettyprint.char" = NULL, # FR #1091 + "datatable.old.unique.by.key" = "FALSE" # TODO: warn 1 year, remove after 2 years ) for (i in setdiff(names(opts),names(options()))) { eval(parse(text=paste("options(",i,"=",opts[i],")",sep=""))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 623288426..481d02bd1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -38,7 +38,6 @@ started.at = Sys.time() # Test default values in case user set global option. These are restored # at the end of this file. oldalloccol = options(datatable.alloccol=1024L) -oldWhenJsymbol = options(datatable.WhenJisSymbolThenCallingScope=FALSE) if (!.devtesting) { test = data.table:::test @@ -235,12 +234,10 @@ test(76, TESTDT[,2:3], TESTDT[,2:3,with=FALSE]) test(77, TESTDT[,2:3,with=FALSE], data.table(b=c("e","e","f","f","i","i","b"),v=1:7)) test(78, TESTDT[,c("b","v")], data.table(b=c("e","e","f","f","i","i","b"),v=1:7)) colsVar = c("b","v") -test(79.1, TESTDT[,colsVar], colsVar) -test(79.2, TESTDT[,colsVar,with=FALSE], data.table(b=c("e","e","f","f","i","i","b"),v=1:7)) -options(datatable.WhenJisSymbolThenCallingScope=TRUE) -test(80.1, TESTDT[,colsVar], data.table(b=c("e","e","f","f","i","i","b"),v=1:7)) -test(80.2, TESTDT[,colsVar,with=TRUE], error="is turned on.*Please") -options(datatable.WhenJisSymbolThenCallingScope=FALSE) +test(79.1, TESTDT[,colsVar], error="column name 'colsVar' is not found") +test(79.2, TESTDT[,colsVar,with=FALSE], ans<-data.table(b=c("e","e","f","f","i","i","b"),v=1:7)) +test(79.3, TESTDT[, ..colsVar], ans) + # works in test.data.table, but not eval(body(test.data.table)) when in R CMD check ... test(81, TESTDT[1:2,c(a,b)], factor(c("a","c","e","e"))) # It is expected the above to be common source of confusion. c(a,b) is evaluated within @@ -9280,13 +9277,14 @@ test(1712, DT[!is.na(a) & d == "3"], DT[2]) test(1713, DT[d==3], DT[2]) # Test new helpful error message suggested by Jan -test(1714, !exists("aColumnOfDT")) # use a long column name to be sure not used before in this test script -DT = data.table(a=1:3, aColumnOfDT=4:6) -options(datatable.WhenJisSymbolThenCallingScope=FALSE) -test(1715, DT[,aColumnOfDT], 4:6) # old behaviour for sure tested before but here for context -options(datatable.WhenJisSymbolThenCallingScope=TRUE) -test(1716, DT[,aColumnOfDT], error=".*datatable.WhenJisSymbolThenCallingScope.*TRUE.*aColumnOfDT.*wrap with quotes.*aColumnOfDT.*aColumnOfDT.*aColumnOfDT") # The .* are to check the salient points of the error (e.g. that it names the variable correctly 4 times), without getting too picky over the exact wording and punctuation of the full error -options(datatable.WhenJisSymbolThenCallingScope=FALSE) +notAColName = 1 +test(1714.1, exists("notAColName")) # use a long column name to be sure it exists and unique +test(1714.1, !exists("notInCallingScope")) # use a long column name to be sure it exists and unique +DT = data.table(a=1:3, b=4:6) +test(1715, DT[,b], 4:6) # old behaviour for sure tested before but here for context +test(1716.1, DT[,notAColName], error="column name 'notAColName' is not found") # ensure it doesn't find it calling scope either +test(1716.2, DT[, ..notInCallingScope], error="Variable 'notInCallingScope' is not found in calling scope") +test(1716.3, DT[, notInCallingScope, with=FALSE], error="Variable 'notInCallingScope' is not found in calling scope") # Test out-of-bounds error on numeric j DT = data.table(a=1:3, b=4:6, c=7:9) @@ -9822,7 +9820,7 @@ test(1749.2, indices(DT, vectors = TRUE), list(c("A","B"),"A","B")) options(warn=0) setDTthreads(0) options(oldalloccol) # set at top of this file -options(oldWhenJsymbol) + plat = paste("endian==",.Platform$endian,", sizeof(long double)==",.Machine$sizeof.longdouble, ", sizeof(pointer)==",.Machine$sizeof.pointer, sep="") if (nfail > 0) {