Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for enumerated types #562

Merged
merged 27 commits into from Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
453621b
Support for factor columns via (experimental) Dictionary support
eddelbuettel Mar 30, 2023
12dfb43
Disable CI as we 'cannot do this' without pending Dictionary support
eddelbuettel Mar 30, 2023
fe00fab
More efficient factor creation
eddelbuettel Mar 31, 2023
0d72a26
Snapshot with updated enumeration class
eddelbuettel Jun 16, 2023
7b1742c
Updated snapshot with working read of enum'ed array
eddelbuettel Jun 20, 2023
9fb64d0
Updated snapshot passing all tests
eddelbuettel Jun 21, 2023
4095327
Snapshot writing with factor
eddelbuettel Jun 22, 2023
71e86c3
Snapshot displaying schema with enumeration
eddelbuettel Jun 22, 2023
45568f8
On index columns, factors need as character
eddelbuettel Jun 23, 2023
2fb6d28
Recover factor from shmem buffer too
eddelbuettel Jun 23, 2023
463e7ca
With unit test support
eddelbuettel Jun 23, 2023
7edecba
With updated documentation to pass R CMD check
eddelbuettel Jun 23, 2023
175d561
Snapshot enumeration to column buffer
eddelbuettel Jun 28, 2023
0eba654
Further snapshot enumeration to column buffer
eddelbuettel Jun 29, 2023
976fe51
Snapshot with Dictionary re-creation at R level
eddelbuettel Jun 30, 2023
b7b5056
Snapshot with working factor creation on the c++ side
eddelbuettel Jul 2, 2023
583fd18
Dictionaries are small strings using int32 offsets
eddelbuettel Jul 3, 2023
473a71f
Cleanup snapshot commenting out some unused code
eddelbuettel Jul 3, 2023
9cdf65e
Snapshot and refactor, focus on arrowio not column_buffer
eddelbuettel Jul 3, 2023
1728b87
Snapshot with small tweaks and one new helper
eddelbuettel Jul 8, 2023
1f36172
Schema from uri now accesses array to allow attribute enumerations
eddelbuettel Jul 10, 2023
2bee495
Condition enumeration code on TileDB 2.17.0 or later
eddelbuettel Jul 21, 2023
8e2ffc8
Adjust tests for enumerations
eddelbuettel Jul 21, 2023
fd80ad6
Correction for array open given key
eddelbuettel Jul 21, 2023
e333414
Re-enable continuous integration
eddelbuettel Jul 21, 2023
2496e40
Add support for query conditions on enumerations
eddelbuettel Jul 31, 2023
7e29877
Roll micro version and update NEWS [ci skip]
eddelbuettel Sep 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: tiledb
Type: Package
Version: 0.20.3.2
Version: 0.20.3.3
Title: Universal Storage Engine for Sparse and Dense Multidimensional Arrays
Authors@R: c(person("TileDB, Inc.", role = c("aut", "cph")),
person("Dirk", "Eddelbuettel", email = "dirk@tiledb.com", role = "cre"))
Expand Down
6 changes: 6 additions & 0 deletions NAMESPACE
Expand Up @@ -88,6 +88,7 @@ export(tiledb_array_create)
export(tiledb_array_delete_fragments)
export(tiledb_array_get_non_empty_domain_from_index)
export(tiledb_array_get_non_empty_domain_from_name)
export(tiledb_array_has_enumeration)
export(tiledb_array_is_heterogeneous)
export(tiledb_array_is_homogeneous)
export(tiledb_array_is_open)
Expand All @@ -114,10 +115,14 @@ export(tiledb_arrow_schema_ptr)
export(tiledb_attr)
export(tiledb_attribute_get_cell_size)
export(tiledb_attribute_get_cell_val_num)
export(tiledb_attribute_get_enumeration)
export(tiledb_attribute_get_enumeration_ptr)
export(tiledb_attribute_get_fill_value)
export(tiledb_attribute_get_nullable)
export(tiledb_attribute_has_enumeration)
export(tiledb_attribute_is_variable_sized)
export(tiledb_attribute_set_cell_val_num)
export(tiledb_attribute_set_enumeration_name)
export(tiledb_attribute_set_fill_value)
export(tiledb_attribute_set_nullable)
export(tiledb_config)
Expand Down Expand Up @@ -215,6 +220,7 @@ export(tiledb_query_buffer_alloc_ptr)
export(tiledb_query_condition)
export(tiledb_query_condition_combine)
export(tiledb_query_condition_init)
export(tiledb_query_condition_set_use_enumeration)
export(tiledb_query_create_buffer_ptr)
export(tiledb_query_create_buffer_ptr_char)
export(tiledb_query_ctx)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Expand Up @@ -6,6 +6,8 @@

* Built-time configuration of TileDB Embedded can now be accessed as a JSON string (#584)

* Enumeration types (i.e. what R calls factors) are now supported (#562)


# tiledb 0.20.3

Expand Down
17 changes: 16 additions & 1 deletion R/Array.R
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2017-2021 TileDB Inc.
# Copyright (c) 2017-2023 TileDB Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -161,3 +161,18 @@ tiledb_array_delete_fragments <- function(arr, ts_start, ts_end) {
libtiledb_array_delete_fragments(arr@ptr, ts_start, ts_end)
invisible(TRUE)
}

##' Check for Enumeration (aka Factor aka Dictionary)
##'
##' @param arr A TileDB Array object
##' @return A boolean indicating if the array has homogeneous domains
##' @export
tiledb_array_has_enumeration <- function(arr) {
stopifnot("The 'arr' argument must be a tiledb_array object" = .isArray(arr))
ctx <- tiledb_get_context()
if (!tiledb_array_is_open(arr)) {
arr <- tiledb_array_open(arr, "READ")
on.exit(tiledb_array_close(arr))
}
return(libtiledb_array_has_enumeration_vector(ctx@ptr, arr@ptr))
}
42 changes: 30 additions & 12 deletions R/ArraySchema.R
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2017-2022 TileDB Inc.
# Copyright (c) 2017-2023 TileDB Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand All @@ -23,13 +23,16 @@
#' An S4 class for the TileDB array schema
#'
#' @slot ptr An external pointer to the underlying implementation
#' @slot arrptr An optional external pointer to the underlying array, or NULL if missing
#' @exportClass tiledb_array_schema
setClass("tiledb_array_schema",
slots = list(ptr = "externalptr"))
slots = list(ptr = "externalptr",
arrptr = "ANY"))

tiledb_array_schema.from_ptr <- function(ptr) {
stopifnot(`The 'ptr' argument must be a non NULL externalptr to a tiledb_array_schema instance` = !missing(ptr) && is(ptr, "externalptr") && !is.null(ptr) )
new("tiledb_array_schema", ptr = ptr)
tiledb_array_schema.from_ptr <- function(ptr, arrptr=NULL) {
stopifnot("The 'ptr' argument must be an external pointer to a tiledb_array_schema instance"
= !missing(ptr) && is(ptr, "externalptr") && !is.null(ptr))
new("tiledb_array_schema", ptr = ptr, arrptr = arrptr)
}

#' Constructs a `tiledb_array_schema` object
Expand All @@ -43,7 +46,8 @@ tiledb_array_schema.from_ptr <- function(ptr) {
#' @param offsets_filter_list (optional)
#' @param validity_filter_list (optional)
#' @param capacity (optional)
#' @param allows_dups (optional, requires \sQuote{spars} to be TRUE)
#' @param allows_dups (optional, requires \sQuote{sparse} to be TRUE)
#' @param enumerations (optional) named list of enumerations
#' @param ctx tiledb_ctx object (optional)
#' @examples
#' \dontshow{ctx <- tiledb_ctx(limitTileDBCores())}
Expand All @@ -68,6 +72,7 @@ tiledb_array_schema <- function(domain,
validity_filter_list = NULL,
capacity = 10000L,
allows_dups = FALSE,
enumerations = NULL,
ctx = tiledb_get_context()) {
if (!missing(attrs) && length(attrs) != 0) {
is_attr <- function(obj) is(obj, "tiledb_attr")
Expand Down Expand Up @@ -97,7 +102,7 @@ tiledb_array_schema <- function(domain,

ptr <- libtiledb_array_schema(ctx@ptr, domain@ptr, attr_ptr_list, cell_order, tile_order,
coords_filter_list_ptr, offsets_filter_list_ptr,
validity_filter_list_ptr, sparse)
validity_filter_list_ptr, sparse, enumerations)
libtiledb_array_schema_set_capacity(ptr, capacity)
if (allows_dups) libtiledb_array_schema_set_allows_dups(ptr, TRUE)
invisible(new("tiledb_array_schema", ptr = ptr))
Expand Down Expand Up @@ -145,7 +150,7 @@ setMethod("show", signature(object = "tiledb_array_schema"),
nfo <- nfilters(fl$offsets)
nfv <- nfilters(fl$validity)
cat("tiledb_array_schema(\n domain=", .as_text_domain(domain(object)), ",\n",
" attrs=c(\n ", paste(sapply(attrs(object), .as_text_attribute), collapse=",\n "), "\n ),\n",
" attrs=c(\n ", paste(sapply(attrs(object), .as_text_attribute, arrptr=object@arrptr), collapse=",\n "), "\n ),\n",
" cell_order=\"", cell_order(object), "\", ",
"tile_order=\"", tile_order(object), "\", ",
"capacity=", capacity(object), ", ",
Expand All @@ -155,11 +160,8 @@ setMethod("show", signature(object = "tiledb_array_schema"),
sep="")
if (nfc > 0) cat(" coords_filter_list=", .as_text_filter_list(fl$coords), if (nfo + nfv > 0) "," else "", "\n", sep="")
if (nfo > 0) cat(" offsets_filter_list=", .as_text_filter_list(fl$offsets), if (nfv > 0) ",\n" else "", sep="")
if (nfv > 0)
cat(" validity_filter_list=", .as_text_filter_list(fl$validity), "\n", sep="")
if (nfv > 0) cat(" validity_filter_list=", .as_text_filter_list(fl$validity), "\n", sep="")
cat(")\n", sep="")
#cat("tiledb_array_create(uri=tempfile(), schema=sch)) # or assign your URI here\n")

})

#' @rdname generics
Expand Down Expand Up @@ -541,6 +543,22 @@ tiledb_schema_get_dim_attr_status <- function(sch) {
return(c(rep(1L, length(dims)), rep(2L, length(attrs))))
}

##' Get Dimension or Attribute Status
##'
##' Note that this function is an unexported internal function.
##'
##' @param sch A TileDB Schema object
##' @return An integer vector where each element corresponds to a schema entry,
##' and a value of one signals dimension and a value of two an attribute.
tiledb_schema_get_enumeration_status <- function(sch) {
stopifnot("The 'sch' argument must be a schema" = is(sch, "tiledb_array_schema"))
dom <- tiledb::domain(sch)
dims <- tiledb::dimensions(dom)
attrs <- tiledb::attrs(sch)
return(c(rep(FALSE, length(dims)),
sapply(attrs, tiledb_attribute_has_enumeration)))
}


# -- get and set tile capacity

Expand Down
68 changes: 65 additions & 3 deletions R/Attribute.R
Expand Up @@ -42,6 +42,7 @@ tiledb_attr.from_ptr <- function(ptr) {
#' @param ncells (default 1) The number of cells, use \code{NA} to signal variable length
#' @param nullable (default FALSE) A logical switch whether the attribute can have missing
#' values
#' @param enumeration (default NULL) A character vector of dictionary values
#' @param ctx tiledb_ctx object (optional)
#' @return `tiledb_dim` object
#' @examples
Expand All @@ -58,6 +59,7 @@ tiledb_attr <- function(name,
filter_list = tiledb_filter_list(),
ncells = 1,
nullable = FALSE,
enumeration = NULL,
ctx = tiledb_get_context()
) {
if (missing(name)) name <- ""
Expand All @@ -66,7 +68,10 @@ tiledb_attr <- function(name,
`The 'name' argument must be a scalar string` = is.scalar(name, "character"),
`The 'filter_list' argument must be a tiledb_filter_list instance` = is(filter_list, "tiledb_filter_list"))
ptr <- libtiledb_attribute(ctx@ptr, name, type, filter_list@ptr, ncells, nullable)
new("tiledb_attr", ptr = ptr)
attr <- new("tiledb_attr", ptr = ptr)
if (!is.null(enumeration))
attr <- tiledb_attribute_set_enumeration_name(attr, name, ctx)
invisible(attr)
}

#' Raw display of an attribute object
Expand All @@ -81,13 +86,23 @@ setMethod("raw_dump",
definition = function(object) libtiledb_attribute_dump(object@ptr))

# internal function returning text use here and in other higher-level show() methods
.as_text_attribute <- function(object) {
.as_text_attribute <- function(object, arrptr=NULL) {
fl <- filter_list(object)
ndct <- 0 # default
dct <- character() # default
if (!is.null(arrptr)) {
if (!libtiledb_array_is_open_for_reading(arrptr)) arrptr <- libtiledb_array_open_with_ptr(arrptr, "READ")
if (tiledb_attribute_has_enumeration(object)) {
dct <- tiledb_attribute_get_enumeration_ptr(object, arrptr)
ndct <- length(dct)
}
}
txt <- paste0("tiledb_attr(name=\"", name(object), "\", ",
"type=\"", datatype(object), "\", ",
"ncells=", cell_val_num(object), ", ",
"nullable=", tiledb_attribute_get_nullable(object),
if (nfilters(fl) > 0) paste0(", filter_list=", .as_text_filter_list(fl)))
if (nfilters(fl) > 0) paste0(", filter_list=", .as_text_filter_list(fl)),
if (ndct > 0) paste0(", dictionary=c(\"", paste(dct[seq(1, min(5, ndct))], collapse="\",\""), if (ndct > 5) "\",...", "\")"))
txt <- paste0(txt, ")")
txt
}
Expand Down Expand Up @@ -313,3 +328,50 @@ tiledb_attribute_get_nullable <- function(attr) {
stopifnot(`The argument must be an attribute` = is(attr, "tiledb_attr"))
libtiledb_attribute_get_nullable(attr@ptr)
}

#' Test if TileDB Attribute has an Enumeration
#'
#' @param attr A TileDB Attribute object
#' @param ctx A Tiledb Context object (optional)
#' @return A logical value indicating if the attribute has an enumeration
#' @export
tiledb_attribute_has_enumeration <- function(attr, ctx = tiledb_get_context()) {
stopifnot("The 'attr' argument must be an attribute" = is(attr, "tiledb_attr"))
libtiledb_attribute_has_enumeration(ctx@ptr, attr@ptr)
}

#' Get the TileDB Attribute Enumeration
#'
#' @param attr A TileDB Attribute object
#' @param arr A Tiledb Array object
#' @param ctx A Tiledb Context object (optional)
#' @return A character vector with the enumeration (of length zero if none)
#' @export
tiledb_attribute_get_enumeration <- function(attr, arr, ctx = tiledb_get_context()) {
stopifnot("The 'attr' argument must be an attribute" = is(attr, "tiledb_attr"),
"The 'arr' argument must be an array" = is(arr, "tiledb_array"))
libtiledb_attribute_get_enumeration(ctx@ptr, attr@ptr, arr@ptr)
}

#' @rdname tiledb_attribute_get_enumeration
#' @param arrptr A Tiledb Array object pointer
#' @export
tiledb_attribute_get_enumeration_ptr <- function(attr, arrptr, ctx = tiledb_get_context()) {
stopifnot("The 'attr' argument must be an attribute" = is(attr, "tiledb_attr"),
"The 'arr' argument must be an external pointer" = is(arrptr, "externalptr"))
libtiledb_attribute_get_enumeration(ctx@ptr, attr@ptr, arrptr)
}

#' Set a TileDB Attribute Enumeration Name
#'
#' @param attr A TileDB Attribute object
#' @param enum_name A character value with the enumeration value
#' @param ctx A Tiledb Context object (optional)
#' @return The modified TileDB Attribute object
#' @export
tiledb_attribute_set_enumeration_name <- function(attr, enum_name, ctx = tiledb_get_context()) {
stopifnot("The 'attr' argument must be an attribute" = is(attr, "tiledb_attr"),
"The 'enum_name' argument must be character" = is.character(enum_name))
attr@ptr <- libtiledb_attribute_set_enumeration(ctx@ptr, attr@ptr, enum_name)
attr
}
49 changes: 38 additions & 11 deletions R/DataFrame.R
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2017-2022 TileDB Inc.
# Copyright (c) 2017-2023 TileDB Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -97,9 +97,12 @@ fromDataFrame <- function(obj, uri, col_index=NULL, sparse=TRUE, allows_dups=spa
if (class(obj)[1] != "data.frame") obj <- as.data.frame(obj)

## turn factor columns in char columns
factcols <- grep("factor", sapply(obj, class))
if (length(factcols) > 0) {
for (i in factcols) obj[,i] <- as.character(obj[,i])
## TODO: add an option
if (tiledb_version(TRUE) < "2.17.0") {
factcols <- grep("factor", sapply(obj, class))
if (length(factcols) > 0) {
for (i in factcols) obj[,i] <- as.character(obj[,i])
}
}

## Create default filter_list from filter vector, 'NONE' and 'ZSTD' is default
Expand Down Expand Up @@ -127,6 +130,7 @@ fromDataFrame <- function(obj, uri, col_index=NULL, sparse=TRUE, allows_dups=spa
makeDim <- function(ind) {
idxcol <- dimobj[,ind]
idxnam <- colnames(dimobj)[ind]
if (inherits(idxcol, "factor")) idxcol <- as.character(idxcol)
col_domain <- if (is.null(tile_domain)) { # default case
c(min(idxcol), max(idxcol)) # use range
} else if (is.list(tile_domain)) { # but if list
Expand Down Expand Up @@ -190,9 +194,14 @@ fromDataFrame <- function(obj, uri, col_index=NULL, sparse=TRUE, allows_dups=spa
dom <- tiledb_domain(dims = dimensions)
}

## the simple helper function used create attribute_i given index i
## we now make it a little bit more powerful yet clumsy but returning a
## three element list at each element where the list contains the attribute
## along with the optional factor levels vector (and the corresponding column name)
makeAttr <- function(ind) {
col <- obj[,ind]
colname <- colnames(obj)[ind]
lvls <- NULL # by default no factor levels
if (inherits(col, "AsIs")) {
## we just look at the first list column, others have to have same type and length
cl <- class(obj[,ind][[1]])
Expand All @@ -217,8 +226,15 @@ fromDataFrame <- function(obj, uri, col_index=NULL, sparse=TRUE, allows_dups=spa
tp <- "INT64"
else if (cl == "logical")
tp <- if (tiledb_version(TRUE) >= "2.10.0") "BOOL" else "INT32"
else if (cl == "factor") {
lvls <- levels(col) # extract factor levels
if (length(lvls) > .Machine$integer.max)
stop("Cannot represent this many levels for ", colname, call. = FALSE)
tp <- "INT32"
}
else
stop("Currently unsupported type: ", cl)

filters <- if (colname %in% names(filter_list)) {
tiledb_filter_list(sapply(filter_list[[colname]], tiledb_filter))
} else {
Expand All @@ -227,15 +243,24 @@ fromDataFrame <- function(obj, uri, col_index=NULL, sparse=TRUE, allows_dups=spa
if (debug) {
cat(sprintf("Setting attribute name %s type %s\n", colname, tp))
}
tiledb_attr(colname,
type = tp,
ncells = if (tp %in% c("CHAR","ASCII")) NA_integer_ else nc,
filter_list = filters,
nullable = any(is.na(col)))
attr <- tiledb_attr(colname,
type = tp,
ncells = if (tp %in% c("CHAR","ASCII")) NA_integer_ else nc,
filter_list = filters,
nullable = any(is.na(col)),
enumeration = lvls)
list(attr=attr, lvls=lvls, name=colname)
}
cols <- seq_len(dims[2])
if (!is.null(col_index)) cols <- cols[-col_index]
attributes <- if (length(cols) > 0) sapply(cols, makeAttr) else list()
attributes <- enumerations <- list() # fallback
if (length(cols) > 0) {
a_e <- lapply(cols, makeAttr)
attributes <- lapply(a_e, "[[", 1)
enumerations <- lapply(a_e, "[[", 2)
colnames <- lapply(a_e, "[[", 3)
names(enumerations) <- colnames
}
schema <- tiledb_array_schema(dom,
attrs = attributes,
cell_order = cell_order,
Expand All @@ -244,8 +269,10 @@ fromDataFrame <- function(obj, uri, col_index=NULL, sparse=TRUE, allows_dups=spa
coords_filter_list = tiledb_filter_list(sapply(coords_filters, tiledb_filter)),
offsets_filter_list = tiledb_filter_list(sapply(offsets_filters, tiledb_filter)),
validity_filter_list = tiledb_filter_list(sapply(validity_filters, tiledb_filter)),
capacity=capacity)
capacity = capacity,
enumerations = if (length(enumerations) > 0) enumerations else NULL)
allows_dups(schema) <- allows_dups

if (mode != "append")
tiledb_array_create(uri, schema)

Expand Down