Skip to content

Commit

Permalink
Extend parse_query_condition to use new 'IN' and 'NOT_IN' queries (#598)
Browse files Browse the repository at this point in the history
* Extend parse_query_condition to use new 'IN' and 'NOT_IN' queries

* Three more removal of temporarily added array info to parse calls

* Additional tests

* Update NEWS and roll micro version [ci skip]

* Refine one check for 2.17.0
  • Loading branch information
eddelbuettel committed Sep 29, 2023
1 parent ea8b912 commit 0145503
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 13 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tiledb
Type: Package
Version: 0.21.1.2
Version: 0.21.1.3
Title: Universal Storage Engine for Sparse and Dense Multidimensional Arrays
Authors@R: c(person("TileDB, Inc.", role = c("aut", "cph")),
person("Dirk", "Eddelbuettel", email = "dirk@tiledb.com", role = "cre"))
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

* Set conditions are supported in query condition expressions (#597)

* Query conditions expression parsing via `parse_query_conditions` was extended simmilarly (#598)

## Bug Fixes

* The DESCRIPTION file now correctly refers to macOS 10.14 (#596)
Expand Down
49 changes: 40 additions & 9 deletions R/QueryCondition.R
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,14 @@ tiledb_query_condition_combine <- function(lhs, rhs, op) {

#' Create a 'tiledb_query_condition' object from an expression
#'
#' The grammar for query conditions is at present constraint to six operators
#' and three boolean types.
#' The grammar for query conditions is at present constraint to eight operators (\code{">"},
#' \code{">="}, \code{"<"}, \code{"<="}, \code{"=="}, \code{"!="}, \code{"%in%"}, \code{"%nin%"}),
#' and three boolean operators (\code{"&&"}, also as \code{"&"}, (\code{"||"}, also as \code{"|"},
#' and \code{"!"} for negation. Note that we locally define \code{"%nin%"} as \code{Negate()} call
#' around \code{%in%)} which extends R a little for this use case.
#'
#' Expressions are parsed locally by this function. The \code{debug=TRUE} option may help in an issue
#' has to be diagnosed.
#'
#' @param expr An expression that is understood by the TileDB grammar for query conditions.
#' @param ta An optional tiledb_array object that the query condition is applied to
Expand All @@ -127,14 +133,32 @@ tiledb_query_condition_combine <- function(lhs, rhs, op) {
parse_query_condition <- function(expr, ta=NULL, debug=FALSE, strict=TRUE, use_int64=FALSE) {
.hasArray <- !is.null(ta) && is(ta, "tiledb_array")
if (.hasArray && length(ta@sil) == 0) ta@sil <- .fill_schema_info_list(ta@uri)
.isComparisonOperator <- function(x) tolower(as.character(x)) %in% c(">", ">=", "<", "<=", "==", "!=", "%in%")
`%!in%` <- Negate(`%in%`)
.isComparisonOperator <- function(x) tolower(as.character(x)) %in% c(">", ">=", "<", "<=", "==", "!=", "%in%", "%nin%")
.isBooleanOperator <- function(x) as.character(x) %in% c("&&", "||", "!", "&", "|")
.isAscii <- function(x) grepl("^[[:alnum:]_]+$", x)
.isInteger <- function(x) grepl("^[[:digit:]]+$", as.character(x))
.isDouble <- function(x) grepl("^[[:digit:]\\.]+$", as.character(x)) && length(grepRaw(".", as.character(x), fixed = TRUE, all = TRUE)) == 1
.isInOperator <- function(x) tolower(as.character(x)) == "%in%"
.isInOperator <- function(x) tolower(as.character(x)) %in% c("%in%", "%nin%")
.errorFunction <- if (strict) stop else warning
.getType <- function(x, use_int64=FALSE) {
.getInd <- function(attr, ta) {
ind <- match(attr, ta@sil$names)
if (!is.finite(ind)) {
.errorFunction("No attribute '", attr, "' present.", call. = FALSE)
return(NULL)
}
if (ta@sil$status[ind] != 2) {
.errorFunction("Argument '", attr, "' is not an attribute.", call. = FALSE)
return(NULL)
}
ind
}
.getType <- function(x, tp, use_int64=FALSE) {
if (.hasArray) {
ind <- .getInd(tp, ta)
dtype <- ta@sil$types[ind]
return(dtype)
}
if (isTRUE(.isInteger(x))) { if (use_int64) "INT64" else "INT32" }
else if (isTRUE(.isDouble(x))) "FLOAT64"
else "ASCII"
Expand Down Expand Up @@ -174,14 +198,21 @@ parse_query_condition <- function(expr, ta=NULL, debug=FALSE, strict=TRUE, use_i
" ", as.character(x[1]),
" [", as.character(x[3]), "]\n", sep="")
attr <- as.character(x[2])
op <- tolower(as.character(x[1]))
tdbop <- if (op == "%in%") "IN" else "NOT_IN"
ind <- .getInd(attr, ta)
dtype <- ta@sil$types[ind]
is_enum <- ta@sil$enum[ind]
vals <- eval(parse(text=as.character(x[3])))
eqconds <- Map(.neweqcond, vals, attr)
orcond <- Reduce(.neworcond, eqconds)
if (dtype == "INT32" && !is_enum) vals <- if (use_int64) bit64::as.integer64(vals) else as.integer(vals)
return(tiledb_query_condition_create(attr, vals, tdbop))
#eqconds <- Map(.neweqcond, vals, attr)
#orcond <- Reduce(.neworcond, eqconds)
} else if (.isComparisonOperator(x[1])) {
op <- as.character(x[1])
attr <- as.character(x[2])
ch <- as.character(x[3])
dtype <- .getType(ch, use_int64)
dtype <- .getType(ch, attr, use_int64)
is_enum <- FALSE # default is no
if (.hasArray) {
ind <- match(attr, ta@sil$names)
Expand Down Expand Up @@ -262,7 +293,7 @@ tiledb_query_condition_create <- function(name, values, op = "IN", ctx = tiledb_
"Argument 'op' must be one of 'IN' or 'NOT_IN'" = op %in% c("IN", "NOT_IN"),
"The 'ctx' argument must be a context object" = is(ctx, "tiledb_ctx"),
"This function needs TileDB 2.17.0 or later" = tiledb_version(TRUE) >= "2.17.0")
ptr <- tiledb:::libtiledb_query_condition_create(ctx@ptr, name, values, op)
ptr <- libtiledb_query_condition_create(ctx@ptr, name, values, op)
qc <- new("tiledb_query_condition", ptr = ptr, init = TRUE)
invisible(qc)
}
57 changes: 56 additions & 1 deletion inst/tinytest/test_querycondition.R
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ expect_true(all(res$year == 2009))

unlink(uri, recursive=TRUE)

## n=15
## parse query condition support
uri <- tempfile()
fromDataFrame(penguins, uri, sparse=TRUE)
Expand All @@ -183,7 +184,9 @@ expect_equal(NROW(res), 34L)
expect_true(all(res$bill_length_mm < 40))
expect_true(all(res$year == 2009))

if (tiledb_version(TRUE) >= "2.10.0") { # the OR operator is more recent than query conditions overall
## the OR operator is more recent than query conditions overall
## and this translates to the new-in-2.17.0 set version
if (tiledb_version(TRUE) >= "2.17.0") {
qc3 <- parse_query_condition(island %in% c("Dream", "Biscoe"), arr)
arrwithqc3 <- tiledb_array(uri, return_as="data.frame", strings_as_factors=TRUE, query_condition=qc3)
res <- arrwithqc3[]
Expand Down Expand Up @@ -483,3 +486,55 @@ expect_true(all(res$val >= as.integer64(6)))
qc <- tiledb_query_condition_create("val", as.integer64(6:10), "NOT_IN")
res <- tiledb_array(uri, return_as="data.frame", query_condition=qc)[]
expect_true(all(res$val <= as.integer64(5)))

## new parse tests
uri <- tempfile()
fromDataFrame(penguins, uri)
## %in% and %nin%
arr <- tiledb_array(uri) # to get the types correct we need array info
res <- tiledb_array(uri, return_as="data.frame",
query_condition=parse_query_condition(year %in% c(2007, 2009), arr))[]
expect_true(all(res$year != "2008"))

res <- tiledb_array(uri, return_as="data.frame",
query_condition=parse_query_condition(year %nin% c(2007, 2009), arr))[]
expect_true(all(res$year == "2008"))

## double
res <- tiledb_array(uri, return_as="data.frame",
query_condition=parse_query_condition(bill_length_mm %in% c(32.1,33.1,33.5),arr))[]
expect_true(all(res$bill_length_mm <= 33.5))
expect_equal(nrow(res), 3)

## Character (automagically converted from factor)
res <- tiledb_array(uri, return_as="data.frame",
query_condition=parse_query_condition(island %in% c("Biscoe", "Dream"), arr))[]
tt <- table(res$island)
expect_equal(tt[["Biscoe"]], 168)
expect_equal(tt[["Dream"]], 124)

## Character (automagically converted from factor)
res <- tiledb_array(uri, return_as="data.frame",
query_condition=parse_query_condition(island %nin% c("Biscoe", "Dream"), arr))[]
tt <- table(res$island)
expect_equal(tt[["Torgersen"]], 52)

## Combo
qc <- parse_query_condition(year %in% c(2007, 2009) && island %nin% c("Biscoe", "Dream"), arr)
res <- tiledb_array(uri, return_as="data.frame", query_condition=qc)[]
expect_true(all(res$year != "2008"))
expect_true(all(res$island == "Torgersen"))


## int64
df <- data.frame(ind=1:10, val=as.integer64(1:10))
uri <- tempfile()
fromDataFrame(df, uri)
arr <- tiledb_array(uri)
qc <- parse_query_condition(val %in% as.integer64(6:10), arr)
res <- tiledb_array(uri, return_as="data.frame", query_condition=qc)[]
expect_true(all(res$val >= as.integer64(6)))

qc <- parse_query_condition(val %nin% as.integer64(6:10), arr)
res <- tiledb_array(uri, return_as="data.frame", query_condition=qc)[]
expect_true(all(res$val <= as.integer64(5)))
11 changes: 9 additions & 2 deletions man/parse_query_condition.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 0145503

Please sign in to comment.