From 1bc9178f56832e23978c352042602f60a6af9e19 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 22 Aug 2021 10:42:44 -0600 Subject: [PATCH] ignore datatable.nomatch option with warning (#5108) --- NEWS.md | 8 ++++++++ R/data.table.R | 2 +- R/foverlaps.R | 2 +- R/onLoad.R | 9 ++++----- man/data.table.Rd | 4 ++-- man/foverlaps.Rd | 9 ++++----- vignettes/datatable-importing.Rmd | 2 +- 7 files changed, 21 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index f6696b71e..451532fca 100644 --- a/NEWS.md +++ b/NEWS.md @@ -307,6 +307,14 @@ > ... the basename of the DLL needs to be both a valid file name and valid as part of a C entry point (e.g. it cannot contain ‘.’): for portable code it is best to confine DLL names to be ASCII alphanumeric plus underscore. If entry point R_init_lib is not found it is also looked for with ‘.’ replaced by ‘_’. +14. For nearly two years, since v1.12.4 (Oct 2019) (note 11 below in this NEWS file), using `options(datatable.nomatch=0)` has produced the following message : + + ``` + The option 'datatable.nomatch' is being used and is not set to the default NA. This option is still honored for now but will be deprecated in future. Please see NEWS for 1.12.4 for detailed information and motivation. To specify inner join, please specify `nomatch=NULL` explicitly in your calls rather than changing the default using this option. + ``` + + The message is now upgraded to warning that the option is now ignored. 
+ # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) diff --git a/R/data.table.R b/R/data.table.R index 504eb49cc..f13a1c7ea 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -138,7 +138,7 @@ replace_dot_alias = function(e) { } } -"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL) +"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL) { # ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could # test explicitly if the caller is [.data.table (even stronger test. TO DO.) diff --git a/R/foverlaps.R b/R/foverlaps.R index 58c7a7555..9a0cd5580 100644 --- a/R/foverlaps.R +++ b/R/foverlaps.R @@ -1,4 +1,4 @@ -foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=key(y), maxgap=0L, minoverlap=1L, type=c("any", "within", "start", "end", "equal"), mult=c("all", "first", "last"), nomatch=getOption("datatable.nomatch", NA), which=FALSE, verbose=getOption("datatable.verbose")) { +foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=key(y), maxgap=0L, minoverlap=1L, type=c("any", "within", "start", "end", "equal"), mult=c("all", "first", "last"), nomatch=NA, which=FALSE, verbose=getOption("datatable.verbose")) { if (!is.data.table(y) || !is.data.table(x)) stopf("y and x must both be data.tables. 
Use `setDT()` to convert list/data.frames to data.tables by reference or as.data.table() to convert to data.tables by copying.") maxgap = as.integer(maxgap); minoverlap = as.integer(minoverlap) diff --git a/R/onLoad.R b/R/onLoad.R index 9ad7051ff..1ee328e99 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -1,16 +1,15 @@ # nocov start -# used to raise message (write to STDERR but not raise warning) once per session only -# in future this will be upgraded to warning, then error, until eventually removed after several years .pkg.store = new.env() .pkg.store$.unsafe.done = FALSE .unsafe.opt = function() { if (.pkg.store$.unsafe.done) return(invisible()) val = getOption("datatable.nomatch") - if (is.null(val)) return(invisible()) # not set is ideal (it's no longer set in .onLoad) - if (identical(val, NA) || identical(val, NA_integer_)) return(invisible()) # set to default NA is ok for now; in future possible message/warning asking to remove - messagef("The option 'datatable.nomatch' is being used and is not set to the default NA. This option is still honored for now but will be deprecated in future. Please see NEWS for 1.12.4 for detailed information and motivation. To specify inner join, please specify `nomatch=NULL` explicitly in your calls rather than changing the default using this option.") + if (is.null(val)) return(invisible()) # not defined (it hasn't been defined in .onLoad since v1.12.4) + warningf("Option 'datatable.nomatch' is defined but is now ignored. 
Please see note 11 in v1.12.4 NEWS (Oct 2019), and note 14 in v1.14.2.") + # leave this as warning for a long time .pkg.store$.unsafe.done = TRUE + invisible() } .Last.updated = vector("integer", 1L) # exported variable; number of rows updated by the last := or set(), #1885 diff --git a/man/data.table.Rd b/man/data.table.Rd index fd2bbd450..7ec8cec3a 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -26,7 +26,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE) \method{[}{data.table}(x, i, j, by, keyby, with = TRUE, - nomatch = getOption("datatable.nomatch", NA), + nomatch = NA, mult = "all", roll = FALSE, rollends = if (roll=="nearest") c(TRUE,TRUE) @@ -121,7 +121,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac When \code{j} is a character vector of column names, a numeric vector of column positions to select or of the form \code{startcol:endcol}, and the value returned is always a \code{data.table}. \code{with=FALSE} is not necessary anymore to select columns dynamically. Note that \code{x[, cols]} is equivalent to \code{x[, ..cols]} and to \code{x[, cols, with=FALSE]} and to \code{x[, .SD, .SDcols=cols]}.} - \item{nomatch}{ When a row in \code{i} has no match to \code{x}, \code{nomatch=NA} (default) means \code{NA} is returned. \code{NULL} (or \code{0} for backward compatibility) means no rows will be returned for that row of \code{i}. Use \code{options(datatable.nomatch=NULL)} to change the default value (used when \code{nomatch} is not supplied).} + \item{nomatch}{ When a row in \code{i} has no match to \code{x}, \code{nomatch=NA} (default) means \code{NA} is returned. \code{NULL} (or \code{0} for backward compatibility) means no rows will be returned for that row of \code{i}. 
} \item{mult}{ When \code{i} is a \code{list} (or \code{data.frame} or \code{data.table}) and \emph{multiple} rows in \code{x} match to the row in \code{i}, \code{mult} controls which are returned: \code{"all"} (default), \code{"first"} or \code{"last"}.} diff --git a/man/foverlaps.Rd b/man/foverlaps.Rd index e90d25133..c8f72117c 100644 --- a/man/foverlaps.Rd +++ b/man/foverlaps.Rd @@ -20,7 +20,7 @@ foverlaps(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by.y = key(y), maxgap = 0L, minoverlap = 1L, type = c("any", "within", "start", "end", "equal"), mult = c("all", "first", "last"), - nomatch = getOption("datatable.nomatch", NA), + nomatch = NA, which = FALSE, verbose = getOption("datatable.verbose")) } \arguments{ @@ -66,11 +66,10 @@ of the overlap. This will be updated once \code{maxgap} is implemented.} match in \code{y}, \code{nomatch=NA} (default) means \code{NA} is returned for \code{y}'s non-\code{by.y} columns for that row of \code{x}. \code{nomatch=NULL} (or \code{0} for backward compatibility) means no rows will be returned for that -row of \code{x}. Use \code{options(datatable.nomatch=NULL)} to change the default -value (used when \code{nomatch} is not supplied).} +row of \code{x}. } \item{which}{ When \code{TRUE}, if \code{mult="all"} returns a two column \code{data.table} with the first column corresponding to \code{x}'s row number -and the second corresponding to \code{y}'s. when \code{nomatch=NA}, no matches +and the second corresponding to \code{y}'s. When \code{nomatch=NA}, no matches return \code{NA} for \code{y}, and if \code{nomatch=NULL}, those rows where no match is found will be skipped; if \code{mult="first" or "last"}, a vector of length equal to the number of rows in \code{x} is returned, with no-match entries @@ -116,7 +115,7 @@ NB: When \code{which=TRUE}: \code{a)} \code{mult="first" or "last"} returns a containing row numbers of \code{x} and the second column with corresponding row numbers of \code{y}. 
-\code{nomatch=NA or 0} also influences whether non-matching rows are returned +\code{nomatch=NA|NULL} also influences whether non-matching rows are returned or not, as explained above. } diff --git a/vignettes/datatable-importing.Rmd b/vignettes/datatable-importing.Rmd index 689e68903..41a3d629a 100644 --- a/vignettes/datatable-importing.Rmd +++ b/vignettes/datatable-importing.Rmd @@ -132,7 +132,7 @@ If you don't mind having `id` and `grp` registered as variables globally in your Common practice by R packages is to provide customization options set by `options(name=val)` and fetched using `getOption("name", default)`. Function arguments often specify a call to `getOption()` so that the user knows (from `?fun` or `args(fun)`) the name of the option controlling the default for that parameter; e.g. `fun(..., verbose=getOption("datatable.verbose", FALSE))`. All `data.table` options start with `datatable.` so as to not conflict with options in other packages. A user simply calls `options(datatable.verbose=TRUE)` to turn on verbosity. This affects all calls to `fun()` other the ones which have been provided `verbose=` explicity; e.g. `fun(..., verbose=FALSE)`. -The option mechanism in R is _global_. Meaning that if a user sets a `data.table` option for their own use, that setting also affects code inside any package that is using `data.table` too. For an option like `datatable.verbose`, this is exactly the desired behavior since the desire is to trace and log all `data.table` operations from wherever they originate; turning on verbosity does not affect the results. Another unique-to-R and excellent-for-production option is R's `options(warn=2)` which turns all warnings into errors. Again, the desire is to affect any warning in any package so as to not missing any warnings in production. There are 6 `datatable.print.*` options and 3 optimization options which do not affect the result of operations, either. 
However, there is one `data.table` option that does and is now a concern: `datatable.nomatch`. This option changes the default join from outer to inner. [Aside, the default join is outer because outer is safer; it doesn't drop missing data silently; moreover it is consistent to base R way of matching by names and indices.] Some users prefer inner join to be the default and we provided this option for them. However, a user setting this option can unintentionally change the behavior of joins inside packages that use `data.table`. Accordingly, in v1.12.4, we have started the process to deprecate the `datatable.nomatch` option. It is the only `data.table` option with this concern. +The option mechanism in R is _global_. Meaning that if a user sets a `data.table` option for their own use, that setting also affects code inside any package that is using `data.table` too. For an option like `datatable.verbose`, this is exactly the desired behavior since the desire is to trace and log all `data.table` operations from wherever they originate; turning on verbosity does not affect the results. Another unique-to-R and excellent-for-production option is R's `options(warn=2)` which turns all warnings into errors. Again, the desire is to affect any warning in any package so as to not miss any warnings in production. There are 6 `datatable.print.*` options and 3 optimization options which do not affect the result of operations. However, there is one `data.table` option that does and is now a concern: `datatable.nomatch`. This option changes the default join from outer to inner. [Aside, the default join is outer because outer is safer; it doesn't drop missing data silently; moreover it is consistent to base R way of matching by names and indices.] Some users prefer inner join to be the default and we provided this option for them. However, a user setting this option can unintentionally change the behavior of joins inside packages that use `data.table`. 
Accordingly, in v1.12.4 (Oct 2019) a message was printed when the `datatable.nomatch` option was used, and from v1.14.2 it is now ignored with a warning. It was the only `data.table` option with this concern. ## Troubleshooting