From 0145503c7a079c2f5c3c0229b2b04f3baf021631 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Fri, 29 Sep 2023 17:09:37 -0500 Subject: [PATCH] Extend parse_query_condition to use new 'IN' and 'NOT_IN' queries (#598) * Extend parse_query_condition to use new 'IN' and 'NOT_IN' queries * Three more removal of temporarily added array info to parse calls * Additional tests * Update NEWS and roll micro version [ci skip] * Refine one check for 2.17.0 --- DESCRIPTION | 2 +- NEWS.md | 2 + R/QueryCondition.R | 49 ++++++++++++++++++++----- inst/tinytest/test_querycondition.R | 57 ++++++++++++++++++++++++++++- man/parse_query_condition.Rd | 11 +++++- 5 files changed, 108 insertions(+), 13 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d8e1bf555e..960aa78f87 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tiledb Type: Package -Version: 0.21.1.2 +Version: 0.21.1.3 Title: Universal Storage Engine for Sparse and Dense Multidimensional Arrays Authors@R: c(person("TileDB, Inc.", role = c("aut", "cph")), person("Dirk", "Eddelbuettel", email = "dirk@tiledb.com", role = "cre")) diff --git a/NEWS.md b/NEWS.md index ac9f1d0ecc..ab772e6101 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ * Set conditions are supported in query condition expressions (#597) +* Query conditions expression parsing via `parse_query_conditions` was extended simmilarly (#598) + ## Bug Fixes * The DESCRIPTION file now correctly refers to macOS 10.14 (#596) diff --git a/R/QueryCondition.R b/R/QueryCondition.R index 514d2c9b2b..967389cc7b 100644 --- a/R/QueryCondition.R +++ b/R/QueryCondition.R @@ -102,8 +102,14 @@ tiledb_query_condition_combine <- function(lhs, rhs, op) { #' Create a 'tiledb_query_condition' object from an expression #' -#' The grammar for query conditions is at present constraint to six operators -#' and three boolean types. +#' The grammar for query conditions is at present constraint to eight operators (\code{">"}, +#' \code{">="}, \code{"<"}, \code{"<="}, \code{"=="}, \code{"!="}, \code{"%in%"}, \code{"%nin%"}), +#' and three boolean operators (\code{"&&"}, also as \code{"&"}, (\code{"||"}, also as \code{"|"}, +#' and \code{"!"} for negation. Note that we locally define \code{"%nin%"} as \code{Negate()} call +#' around \code{%in%)} which extends R a little for this use case. +#' +#' Expressions are parsed locally by this function. The \code{debug=TRUE} option may help in an issue +#' has to be diagnosed. #' #' @param expr An expression that is understood by the TileDB grammar for query conditions. #' @param ta An optional tiledb_array object that the query condition is applied to @@ -127,14 +133,32 @@ tiledb_query_condition_combine <- function(lhs, rhs, op) { parse_query_condition <- function(expr, ta=NULL, debug=FALSE, strict=TRUE, use_int64=FALSE) { .hasArray <- !is.null(ta) && is(ta, "tiledb_array") if (.hasArray && length(ta@sil) == 0) ta@sil <- .fill_schema_info_list(ta@uri) - .isComparisonOperator <- function(x) tolower(as.character(x)) %in% c(">", ">=", "<", "<=", "==", "!=", "%in%") + `%!in%` <- Negate(`%in%`) + .isComparisonOperator <- function(x) tolower(as.character(x)) %in% c(">", ">=", "<", "<=", "==", "!=", "%in%", "%nin%") .isBooleanOperator <- function(x) as.character(x) %in% c("&&", "||", "!", "&", "|") .isAscii <- function(x) grepl("^[[:alnum:]_]+$", x) .isInteger <- function(x) grepl("^[[:digit:]]+$", as.character(x)) .isDouble <- function(x) grepl("^[[:digit:]\\.]+$", as.character(x)) && length(grepRaw(".", as.character(x), fixed = TRUE, all = TRUE)) == 1 - .isInOperator <- function(x) tolower(as.character(x)) == "%in%" + .isInOperator <- function(x) tolower(as.character(x)) %in% c("%in%", "%nin%") .errorFunction <- if (strict) stop else warning - .getType <- function(x, use_int64=FALSE) { + .getInd <- function(attr, ta) { + ind <- match(attr, ta@sil$names) + if (!is.finite(ind)) { + .errorFunction("No attribute '", attr, "' present.", call. = FALSE) + return(NULL) + } + if (ta@sil$status[ind] != 2) { + .errorFunction("Argument '", attr, "' is not an attribute.", call. = FALSE) + return(NULL) + } + ind + } + .getType <- function(x, tp, use_int64=FALSE) { + if (.hasArray) { + ind <- .getInd(tp, ta) + dtype <- ta@sil$types[ind] + return(dtype) + } if (isTRUE(.isInteger(x))) { if (use_int64) "INT64" else "INT32" } else if (isTRUE(.isDouble(x))) "FLOAT64" else "ASCII" @@ -174,14 +198,21 @@ parse_query_condition <- function(expr, ta=NULL, debug=FALSE, strict=TRUE, use_i " ", as.character(x[1]), " [", as.character(x[3]), "]\n", sep="") attr <- as.character(x[2]) + op <- tolower(as.character(x[1])) + tdbop <- if (op == "%in%") "IN" else "NOT_IN" + ind <- .getInd(attr, ta) + dtype <- ta@sil$types[ind] + is_enum <- ta@sil$enum[ind] vals <- eval(parse(text=as.character(x[3]))) - eqconds <- Map(.neweqcond, vals, attr) - orcond <- Reduce(.neworcond, eqconds) + if (dtype == "INT32" && !is_enum) vals <- if (use_int64) bit64::as.integer64(vals) else as.integer(vals) + return(tiledb_query_condition_create(attr, vals, tdbop)) + #eqconds <- Map(.neweqcond, vals, attr) + #orcond <- Reduce(.neworcond, eqconds) } else if (.isComparisonOperator(x[1])) { op <- as.character(x[1]) attr <- as.character(x[2]) ch <- as.character(x[3]) - dtype <- .getType(ch, use_int64) + dtype <- .getType(ch, attr, use_int64) is_enum <- FALSE # default is no if (.hasArray) { ind <- match(attr, ta@sil$names) @@ -262,7 +293,7 @@ tiledb_query_condition_create <- function(name, values, op = "IN", ctx = tiledb_ "Argument 'op' must be one of 'IN' or 'NOT_IN'" = op %in% c("IN", "NOT_IN"), "The 'ctx' argument must be a context object" = is(ctx, "tiledb_ctx"), "This function needs TileDB 2.17.0 or later" = tiledb_version(TRUE) >= "2.17.0") - ptr <- tiledb:::libtiledb_query_condition_create(ctx@ptr, name, values, op) + ptr <- libtiledb_query_condition_create(ctx@ptr, name, values, op) qc <- new("tiledb_query_condition", ptr = ptr, init = TRUE) invisible(qc) } diff --git a/inst/tinytest/test_querycondition.R b/inst/tinytest/test_querycondition.R index b51e4c9133..5cdfaff8ff 100644 --- a/inst/tinytest/test_querycondition.R +++ b/inst/tinytest/test_querycondition.R @@ -166,6 +166,7 @@ expect_true(all(res$year == 2009)) unlink(uri, recursive=TRUE) +## n=15 ## parse query condition support uri <- tempfile() fromDataFrame(penguins, uri, sparse=TRUE) @@ -183,7 +184,9 @@ expect_equal(NROW(res), 34L) expect_true(all(res$bill_length_mm < 40)) expect_true(all(res$year == 2009)) -if (tiledb_version(TRUE) >= "2.10.0") { # the OR operator is more recent than query conditions overall +## the OR operator is more recent than query conditions overall +## and this translates to the new-in-2.17.0 set version +if (tiledb_version(TRUE) >= "2.17.0") { qc3 <- parse_query_condition(island %in% c("Dream", "Biscoe"), arr) arrwithqc3 <- tiledb_array(uri, return_as="data.frame", strings_as_factors=TRUE, query_condition=qc3) res <- arrwithqc3[] @@ -483,3 +486,55 @@ expect_true(all(res$val >= as.integer64(6))) qc <- tiledb_query_condition_create("val", as.integer64(6:10), "NOT_IN") res <- tiledb_array(uri, return_as="data.frame", query_condition=qc)[] expect_true(all(res$val <= as.integer64(5))) + +## new parse tests +uri <- tempfile() +fromDataFrame(penguins, uri) +## %in% and %nin% +arr <- tiledb_array(uri) # to get the types correct we need array info +res <- tiledb_array(uri, return_as="data.frame", + query_condition=parse_query_condition(year %in% c(2007, 2009), arr))[] +expect_true(all(res$year != "2008")) + +res <- tiledb_array(uri, return_as="data.frame", + query_condition=parse_query_condition(year %nin% c(2007, 2009), arr))[] +expect_true(all(res$year == "2008")) + +## double +res <- tiledb_array(uri, return_as="data.frame", + query_condition=parse_query_condition(bill_length_mm %in% c(32.1,33.1,33.5),arr))[] +expect_true(all(res$bill_length_mm <= 33.5)) +expect_equal(nrow(res), 3) + +## Character (automagically converted from factor) +res <- tiledb_array(uri, return_as="data.frame", + query_condition=parse_query_condition(island %in% c("Biscoe", "Dream"), arr))[] +tt <- table(res$island) +expect_equal(tt[["Biscoe"]], 168) +expect_equal(tt[["Dream"]], 124) + +## Character (automagically converted from factor) +res <- tiledb_array(uri, return_as="data.frame", + query_condition=parse_query_condition(island %nin% c("Biscoe", "Dream"), arr))[] +tt <- table(res$island) +expect_equal(tt[["Torgersen"]], 52) + +## Combo +qc <- parse_query_condition(year %in% c(2007, 2009) && island %nin% c("Biscoe", "Dream"), arr) +res <- tiledb_array(uri, return_as="data.frame", query_condition=qc)[] +expect_true(all(res$year != "2008")) +expect_true(all(res$island == "Torgersen")) + + +## int64 +df <- data.frame(ind=1:10, val=as.integer64(1:10)) +uri <- tempfile() +fromDataFrame(df, uri) +arr <- tiledb_array(uri) +qc <- parse_query_condition(val %in% as.integer64(6:10), arr) +res <- tiledb_array(uri, return_as="data.frame", query_condition=qc)[] +expect_true(all(res$val >= as.integer64(6))) + +qc <- parse_query_condition(val %nin% as.integer64(6:10), arr) +res <- tiledb_array(uri, return_as="data.frame", query_condition=qc)[] +expect_true(all(res$val <= as.integer64(5))) diff --git a/man/parse_query_condition.Rd b/man/parse_query_condition.Rd index 5757fd6190..6a50296aa9 100644 --- a/man/parse_query_condition.Rd +++ b/man/parse_query_condition.Rd @@ -30,8 +30,15 @@ default is false to remain as a default four-byte \code{int}} A \code{tiledb_query_condition} object } \description{ -The grammar for query conditions is at present constraint to six operators -and three boolean types. +The grammar for query conditions is at present constraint to eight operators (\code{">"}, +\code{">="}, \code{"<"}, \code{"<="}, \code{"=="}, \code{"!="}, \code{"\%in\%"}, \code{"\%nin\%"}), +and three boolean operators (\code{"&&"}, also as \code{"&"}, (\code{"||"}, also as \code{"|"}, +and \code{"!"} for negation. Note that we locally define \code{"\%nin\%"} as \code{Negate()} call +around \code{\%in\%)} which extends R a little for this use case. +} +\details{ +Expressions are parsed locally by this function. The \code{debug=TRUE} option may help in an issue +has to be diagnosed. } \examples{ \dontshow{ctx <- tiledb_ctx(limitTileDBCores())}