diff --git a/NEWS.md b/NEWS.md index 64161c728..f06fde969 100644 --- a/NEWS.md +++ b/NEWS.md @@ -73,6 +73,8 @@ 7. New variable `.Last.updated` (similar to R's `.Last.value`) contains the number of rows affected by the most recent `:=` or `set()`, [#1885](https://github.com/Rdatatable/data.table/issues/1885). For details see `?.Last.updated`. +8. `between()` and `%between%` are faster for `POSIXct`, [#3519](https://github.com/Rdatatable/data.table/issues/3519), and now support the `.()` alias, [#2315](https://github.com/Rdatatable/data.table/issues/2315). Thanks to @Henrik-P for the reports. There is now also support for `bit64`'s `integer64` class and more robust coercion of types, [#3517](https://github.com/Rdatatable/data.table/issues/3517). + #### BUG FIXES 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. diff --git a/R/between.R b/R/between.R index 759ab6dff..a32117f4f 100644 --- a/R/between.R +++ b/R/between.R @@ -3,23 +3,67 @@ between <- function(x,lower,upper,incbounds=TRUE) { if (is.logical(x)) stop("between has been x of type logical") if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type) if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type) - is_strictly_numeric <- function(x) is.numeric(x) && !inherits(x, "integer64") - if (is_strictly_numeric(x) && is_strictly_numeric(lower) && is_strictly_numeric(upper)) { + is.px = function(x) inherits(x, "POSIXct") + is.i64 = function(x) inherits(x, "integer64") + # POSIX special handling to auto coerce character + if (is.px(x) && !is.null(tz<-attr(x, "tzone", TRUE)) && nzchar(tz) && + (is.character(lower) || is.character(upper))) { + try_posix_cast <- function(x, tz) {tryCatch( + list(status=0L, value=as.POSIXct(x, tz = tz)), + error = function(e) list(status=1L, value=NULL, message=e[["message"]]) + )} + if (is.character(lower)) { + ans = try_posix_cast(lower, tz) + if (ans$status==0L) lower = ans$value + else stop("'between' function the 'x' argument is a POSIX class while 'lower' was not, coercion to POSIX failed with: ", ans$message) + } + if (is.character(upper)) { + ans = try_posix_cast(upper, tz) + if (ans$status==0L) upper = ans$value + else stop("'between' function the 'x' argument is a POSIX class while 'upper' was not, coercion to POSIX failed with: ", ans$message) + } + stopifnot(is.px(x), is.px(lower), is.px(upper)) # nocov # internal + } + # POSIX check timezone match + if (is.px(x) && is.px(lower) && is.px(upper)) { + tz_match = function(x, y, z) { # NULL match "", else all identical + ((is.null(x) || !nzchar(x)) && (is.null(y) || !nzchar(y)) && (is.null(z) || !nzchar(z))) || + (identical(x, y) && identical(x, z)) + } + if (!tz_match(attr(x, "tzone", TRUE), attr(lower, "tzone", TRUE), attr(upper, "tzone", TRUE))) { + stop("'between' function arguments have mismatched timezone attribute, align all arguments to same timezone") + } + } + # int64 + if (is.i64(x)) { + if (!requireNamespace("bit64", quietly=TRUE)) stop("trying to use integer64 class when 'bit64' package is not installed") # nocov + if (!is.i64(lower) && is.numeric(lower)) lower = bit64::as.integer64(lower) + if (!is.i64(upper) && is.numeric(upper)) upper = bit64::as.integer64(upper) + } else if (is.i64(lower) || is.i64(upper)) { + stop("'lower' and/or 'upper' are integer64 class while 'x' argument is not, align classes before passing to 'between'") + } + is.supported = function(x) is.numeric(x) || is.px(x) + if (is.supported(x) && is.supported(lower) && is.supported(upper)) { # faster parallelised version for int/double. # Cbetween supports length(lower)==1 (recycled) and (from v1.12.0) length(lower)==length(x). # length(upper) can be 1 or length(x) independently of lower .Call(Cbetween, x, lower, upper, incbounds) } else { + if (isTRUE(getOption("datatable.verbose"))) cat("optimised between not available for this data type, fallback to slow R routine\n") # now just for character input. TODO: support character between in Cbetween and remove this branch - if(incbounds) x>=lower & x<=upper + if (incbounds) x>=lower & x<=upper else x>lower & x=,<= and >,< - verbose = getOption("datatable.verbose") + verbose = isTRUE(getOption("datatable.verbose")) if (verbose) {last.started.at=proc.time();cat("forderv(query) took ... ");flush.console()} xo = forderv(query) if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 13925f097..3521b397f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14479,6 +14479,95 @@ test(2037.2, names(DT), 'a') # _selrefok() verbose message was duplicated test(2037.3, unname(table(unlist(strsplit(capture.output(foo(DT)), '\n|\\s+')))['ptr']), 1L) +# `between` invalid args, and verbose #3516 +test(2038.01, between(1:5, 2, 4, incbounds=423), error="incbounds must be logical") +test(2038.02, between(1:5, 2, 4, incbounds=NA), error="incbounds must be logical") +old = options(datatable.verbose=TRUE) +test(2038.03, between(1:5, 2L, 4L), output="between parallel processing of integer with recycling took") +test(2038.04, between(1:5, rep(2L,5L), rep(4L, 5L)), output="between parallel processing of integer took") +test(2038.05, between(as.double(1:5), 2, 4, incbounds=FALSE), output="between parallel processing of double using open bounds with recycling took") +test(2038.06, between(as.double(1:5), 2, 4), output="between parallel processing of double using closed bounds with recycling took") +test(2038.07, between(as.double(1:5), rep(2, 5L), rep(4, 5L)), output="between parallel processing of double took") +test(2038.08, between(c("foo","bar","paz"), "bag", "fog"), output="optimised between not available for this data type, fallback to slow R routine") + +# `between` handle POSIXct type +x = as.POSIXct("2016-09-18 07:00:00") + 0:10*60*15 +dn = as.POSIXct('2016-09-18 08:00:00') +up = as.POSIXct('2016-09-18 09:00:00') +test(2038.09, between(x, dn, up), output="between parallel processing of double using closed bounds with recycling took") +test(2038.10, between(x, dn, up, incbounds=FALSE), output="between parallel processing of double using open bounds with recycling took") + +# also handling of string lower/upper bounds _only when x has a time zone_ +x = as.POSIXct("2016-09-18 07:00:00") + 0:10*60*15 +dn = '2016-09-18 08:00:00' +up = '2016-09-18 09:00:00' +test(2038.11, between(x, dn, up), output = 'optimised between not available') +attr(x, 'tzone') = 'UTC' +test(2038.12, between(x, dn, up), output = 'between parallel processing of double') + +# additional flexibility -- cast when one bound is already POSIXct +up = as.POSIXct(up, tz="UTC") +test(2038.13, between(x, dn, up), output = 'between parallel processing of double') +options(old) + +# exceptions in char to POSIX coercion +dn = 'aa2016-09-18 08:00:00' +test(2038.14, between(x, dn, up), error="coercion to POSIX failed") +dn = '2016-09-18 08:00:00' +up = 'bb2016-09-18 09:00:00' +test(2038.15, between(x, dn, up), error="coercion to POSIX failed") + +# exceptions due to timezone mismatch +x = as.POSIXct("2016-09-18 07:00:00", tz="UTC") + 0:10*60*15 +dn = as.POSIXct('2016-09-18 08:00:00') +up = '2016-09-18 09:00:00' +test(2038.16, between(x, dn, up), error="mismatched timezone attribute") + +# `between` support `.` in RHS #2315 +X = data.table(a = 1:5, b = 6:10, c = c(5:1)) +test(2038.17, X[c %between% list(a, b)], X[c %between% .(a, b)]) + +# between num to int coercion #3517 +old = options("datatable.verbose"=TRUE) +test(2038.18, between(1:5, 2, 4), output="between parallel processing of integer with recycling") +test(2038.19, between(1:5, 2L, 4), output="between parallel processing of integer with recycling") +test(2038.20, between(1:5, 2, 4L), output="between parallel processing of integer with recycling") +options(old) + +# between int64 support +if (test_bit64) { + as.i64 = bit64::as.integer64 + test(2039.01, between(1:10, as.i64(3), as.i64(6)), error="are integer64 class while 'x' argument is not") + test(2039.02, between(1:10, 3, as.i64(6)), error="are integer64 class while 'x' argument is not") + old = options("datatable.verbose"=TRUE) + x = as.i64(1:10) + ans36 = c(FALSE,FALSE,TRUE,TRUE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE) + ans36open = c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE) + test(2039.03, between(x, 3, 6), ans36, output="between parallel processing of integer64 with recycling") + test(2039.04, between(x, 3, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2039.05, between(x, 3L, 6), ans36, output="between parallel processing of integer64 with recycling") + test(2039.06, between(x, 3L, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2039.07, between(x, 3, 6L), ans36, output="between parallel processing of integer64 with recycling") + test(2039.08, between(x, 3, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2039.09, between(x, 3L, 6L), ans36, output="between parallel processing of integer64 with recycling") + test(2039.10, between(x, 3L, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2039.11, between(x, rep(3, 10L), rep(6, 10L)), ans36, output="between parallel processing of integer64 took") + test(2039.12, between(x, rep(3, 10L), rep(6, 10L), incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + maxint = 2147483647 + test(2039.13, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 with recycling") + test(2039.14, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + x[5] = NA + ans36[5] = NA + ans36open[5] = NA + test(2039.15, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 with recycling") + test(2039.16, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2039.17, between(x+maxint, NA, 6+maxint), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 with recycling") + test(2039.18, between(x+maxint, 3+maxint, NA, incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 with recycling") + test(2039.19, between(x+maxint, rep(NA, 10L), rep(6+maxint, 10L)), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 took") + test(2039.20, between(x+maxint, rep(3+maxint, 10L), rep(NA, 10L), incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 took") + options(old) +} + ################################### # Add new tests above this line # diff --git a/src/between.c b/src/between.c index c5b97aecc..361bf70a0 100644 --- a/src/between.c +++ b/src/between.c @@ -1,8 +1,21 @@ #include "data.table.h" +bool isRealReallyInt(SEXP x) { + if (!isReal(x)) return(false); + R_xlen_t n=xlength(x), i=0; + double *dx = REAL(x); + while (iupper or lower==INT_MAX @@ -41,16 +71,19 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { if (!recycleX && recycleLow && recycleUpp) { const int l = lp[0] + open; // +open so we can always use >= and <=. NA_INTEGER+1 == -INT_MAX == INT_MIN+1 (so NA limit handled by this too) const int u = up[0]==NA_INTEGER ? INT_MAX : up[0] - open; + if (verbose) tic = omp_get_wtime(); #pragma omp parallel for num_threads(getDTthreads()) for (int i=0; i(b))?(a):(b)) +// for use with bit64::integer64 +#define NA_INTEGER64 INT64_MIN +#define MAX_INTEGER64 INT64_MAX + // Backport macros added to R in 2017 so we don't need to update dependency from R 3.0.0 #ifndef MAYBE_SHARED # define MAYBE_SHARED(x) (NAMED(x) > 1) @@ -79,9 +83,11 @@ SEXP sym_BY; SEXP sym_starts, char_starts; SEXP sym_maxgrpn; SEXP sym_colClassesAs; +SEXP sym_verbose; bool INHERITS(SEXP x, SEXP char_); long long DtoLL(double x); double LLtoD(long long x); +bool GetVerbose(); double NA_INT64_D; long long NA_INT64_LL; @@ -183,3 +189,6 @@ SEXP colnamesInt(SEXP x, SEXP cols); void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, ans_t *ans, bool verbose); void nafillInteger(int32_t *x, uint_fast64_t nx, unsigned int type, int32_t fill, ans_t *ans, bool verbose); SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbose); + +// between.c +SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds); diff --git a/src/forder.c b/src/forder.c index 5b8030e19..c507aacc2 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1275,9 +1275,10 @@ SEXP isReallyReal(SEXP x) { // which needed a repeat of the argument. Hence simpler and more robust to return 0 when not type double. if (isReal(x)) { int n=length(x), i=0; + double *dx = REAL(x); while (i