From f449a6f38bf8e1e1ebda663e4953f682555bd07e Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 4 Jan 2022 09:05:14 -0600 Subject: [PATCH 1/4] update heuristic for variable size character data allocation --- R/TileDBArray.R | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/R/TileDBArray.R b/R/TileDBArray.R index 52ea3d44a5..dbf91385e0 100644 --- a/R/TileDBArray.R +++ b/R/TileDBArray.R @@ -517,6 +517,13 @@ setMethod("[", "tiledb_array", allnullable <- attrnullable } + cfg <- tiledb_config() + ## use an 'informed' guess for a memory budget 'per column' by scaling the + ## configuration limit by the number of columns (and to be a little conservative + ## we use 'memory_budget' and not 'memory_budget_var'. This heuristic could be + ## refined further as needed. + memory_budget <- as.numeric(unname(cfg["sm.memory_budget"])) / length(allnames) + if (length(enckey) > 0) { if (length(tstamp) > 0) { arrptr <- libtiledb_array_open_at_with_key(ctx@ptr, uri, "READ", enckey, tstamp) @@ -647,7 +654,6 @@ setMethod("[", "tiledb_array", ## get results (shmem variant) getResultShmem <- function(buf, name, varnum) { #, resrv, qryptr) { if (is.na(varnum)) { - ##vec <- libtiledb_query_result_buffer_elements_vec(qryptr, name) vec <- length_from_vlcbuf(buf) libtiledb_query_get_buffer_var_char(buf, vec[1], vec[2])[,1] } else { @@ -684,7 +690,9 @@ setMethod("[", "tiledb_array", getBuffer <- function(name, type, varnum, nullable, resrv, qryptr, arrptr) { if (is.na(varnum)) { if (type %in% c("CHAR", "ASCII", "UTF8")) { - buf <- libtiledb_query_buffer_var_char_alloc_direct(resrv, resrv*8, nullable) + memsz <- max(resrv*8, memory_budget/resrv) # larger of old value or budget/row + #cat("Running with resrv =", resrv, "and memsz =", memsz, "\n") + buf <- libtiledb_query_buffer_var_char_alloc_direct(resrv, memsz, nullable) qryptr <- libtiledb_query_set_buffer_var_char(qryptr, name, buf) buf } else { From 69f3bec62325730902cd7d6b0085f6af9d0bca49 Mon Sep 17 
00:00:00 2001 From: Dirk Eddelbuettel Date: Mon, 10 Jan 2022 09:53:45 -0600 Subject: [PATCH 2/4] support a user-configurable memory budget size with fallback --- NAMESPACE | 4 ++ R/TileDBArray.R | 14 ++-- R/Utils.R | 88 +++++++++++++++++++++++++- R/zzz.R | 3 + inst/tinytest/test_dimsubset.R | 1 + man/save_allocation_size_preference.Rd | 51 +++++++++++++++ 6 files changed, 151 insertions(+), 10 deletions(-) create mode 100644 man/save_allocation_size_preference.Rd diff --git a/NAMESPACE b/NAMESPACE index 111de2a472..988cf30602 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -45,12 +45,14 @@ export(filter_list) export(fromDataFrame) export(fromMatrix) export(fromSparseMatrix) +export(get_allocation_size_preference) export(get_return_as_preference) export(has_attribute) export(is.anonymous) export(is.integral) export(is.sparse) export(limitTileDBCores) +export(load_allocation_size_preference) export(load_return_as_preference) export(max_chunk_size) export(name) @@ -64,9 +66,11 @@ export(return.array) export(return.data.frame) export(return.matrix) export(return_as) +export(save_allocation_size_preference) export(save_return_as_preference) export(schema) export(selected_ranges) +export(set_allocation_size_preference) export(set_max_chunk_size) export(set_return_as_preference) export(tdb_collect) diff --git a/R/TileDBArray.R b/R/TileDBArray.R index dbf91385e0..0fd4b76e69 100644 --- a/R/TileDBArray.R +++ b/R/TileDBArray.R @@ -517,12 +517,10 @@ setMethod("[", "tiledb_array", allnullable <- attrnullable } - cfg <- tiledb_config() - ## use an 'informed' guess for a memory budget 'per column' by scaling the - ## configuration limit by the number of columns (and to be a little conservative - ## we use 'memory_budget' and not 'memory_budget_var'. This heuristic could be - ## refined further as needed. 
- memory_budget <- as.numeric(unname(cfg["sm.memory_budget"])) / length(allnames) + ## A preference can be set in a local per-user configuration file; if not value + ## is set a fallback from the TileDB config object is used. As that value is fairly + ## large we scale by the number of columns not use this amount per buffer + memory_budget <- get_allocation_size_preference() / length(allnames) if (length(enckey) > 0) { if (length(tstamp) > 0) { @@ -690,9 +688,7 @@ setMethod("[", "tiledb_array", getBuffer <- function(name, type, varnum, nullable, resrv, qryptr, arrptr) { if (is.na(varnum)) { if (type %in% c("CHAR", "ASCII", "UTF8")) { - memsz <- max(resrv*8, memory_budget/resrv) # larger of old value or budget/row - #cat("Running with resrv =", resrv, "and memsz =", memsz, "\n") - buf <- libtiledb_query_buffer_var_char_alloc_direct(resrv, memsz, nullable) + buf <- libtiledb_query_buffer_var_char_alloc_direct(resrv, memory_budget, nullable) qryptr <- libtiledb_query_set_buffer_var_char(qryptr, name, buf) buf } else { diff --git a/R/Utils.R b/R/Utils.R index 94d5da9484..447077aab3 100644 --- a/R/Utils.R +++ b/R/Utils.R @@ -79,11 +79,97 @@ get_return_as_preference <- function() .pkgenv[["return_as"]] ##' @rdname save_return_as_preference ##' @export set_return_as_preference <- function(value = c("asis", "array", "matrix", "data.frame", - "data.table", "tibble")) { + "data.table", "tibble")) { value <- match.arg(value) .pkgenv[["return_as"]] <- value } + +##' Save (or load) allocation size default preference in an optional +##' config file +##' +##' When retrieving data from sparse arrays, allocation sizes cannot +##' be determined \emph{ex ante} as the degree of sparsity is unknown. +##' A configuration value can aide in providing an allocation size +##' value. These functions let the user store such a value for +##' retrieval by the package code. 
The preference will be encoded in +##' a configuration file as R (version 4.0.0 or later) allows user- +##' and package-specific configuration files. These helper functions +##' set and retrieve the value, respectively, or retrieve the cached +##' value from the package environment where it is set at package +##' load. +##' +##' The value will be stored as a character value and reparsed so +##' \sQuote{1e6} and \sQuote{1000000} are equivalent, and the fixed +##' (but adjustable) number of digits for numerical precision +##' \emph{use for formatting} will impact the writing. This should +##' have no effect on standard allocation sizes. +##' +##' @note This function requires R version 4.0.0 or later to utilise the per-user +##' config directory accessor function. For older R versions, a fallback from the +##' TileDB configuration object is used. +##' @title Store allocation size preference +##' @param value A numeric value with the desired allocation size (in bytes). +##' @return For the setter, \code{TRUE} is returned invisibly but the function is invoked for the +##' side effect of storing the value. For the getters, the value as a numeric. 
+##' @export +save_allocation_size_preference <- function(value) { + stopifnot(`This function relies on R version 4.0.0 or later.` = R.version.string >= "4.0.0", + `The 'value' has to be numeric` = is.numeric(value)) + + cfgdir <- tools::R_user_dir(packageName()) + if (!dir.exists(cfgdir)) dir.create(cfgdir) + fname <- file.path(cfgdir, "config.dcf") + con <- file(fname, "w+") + cat("allocation_size:", value, "\n", file=con) + close(con) + set_allocation_size_preference(value) + invisible(TRUE) +} + +##' @rdname save_allocation_size_preference +##' @export +load_allocation_size_preference <- function() { + value <- NA_integer_ # flag as unset + ## we cannot set this from the TileDB config at package load time as we + ## cannot yet call a package function (to access TileDB configuration) + ## while the package is loaded + cfgfile <- .defaultConfigFile() + if (cfgfile != "" && file.exists(cfgfile)) { + cfg <- read.dcf(cfgfile) + if ("allocation_size" %in% colnames(cfg)) + value <- as.numeric(cfg[[1, "allocation_size"]]) + } + set_allocation_size_preference(value) + value +} + +##' @rdname save_allocation_size_preference +##' @export +get_allocation_size_preference <- function() { + val <- .pkgenv[["allocation_size"]] + if (is.na(val)) { # no value was stored + ## we cannot set this from the TileDB config at package load time as we + ## cannot yet call a package function (to access TileDB configuration) + ## while the package is loaded + cfg <- tiledb_config() + val <- as.numeric(cfg["sm.memory_budget"]) + if (is.na(val)) val <- 1e5 + set_allocation_size_preference(val) + } + val +} + +##' @rdname save_allocation_size_preference +##' @export +set_allocation_size_preference <- function(value) { + stopifnot(`The 'value' has to be numeric` = is.numeric(value)) + .pkgenv[["allocation_size"]] <- value +} + + + + is.scalar <- function(x, typestr) { (typeof(x) == typestr) && is.atomic(x) && length(x) == 1L } diff --git a/R/zzz.R b/R/zzz.R index be005bd646..deaa39e1d7 100644 
--- a/R/zzz.R +++ b/R/zzz.R @@ -49,6 +49,9 @@ ## set a preference for data.frame conversion for tiledb_array and [] access .pkgenv[["return_as"]] <- load_return_as_preference() + + ## set a preference for allocation size defaults + .pkgenv[["allocation_size"]] <- load_allocation_size_preference() } .onAttach <- function(libname, pkgName) { diff --git a/inst/tinytest/test_dimsubset.R b/inst/tinytest/test_dimsubset.R index 7155800f1e..49502fed57 100644 --- a/inst/tinytest/test_dimsubset.R +++ b/inst/tinytest/test_dimsubset.R @@ -17,6 +17,7 @@ dir.create(tmp <- tempfile()) library(nycflights13) +set_allocation_size_preference(1e8) dom <- tiledb_domain(dims = c(tiledb_dim("carrier", NULL, NULL, "ASCII"), tiledb_dim("origin", NULL, NULL, "ASCII"), tiledb_dim("dest", NULL, NULL, "ASCII"), diff --git a/man/save_allocation_size_preference.Rd b/man/save_allocation_size_preference.Rd new file mode 100644 index 0000000000..5e6612e77e --- /dev/null +++ b/man/save_allocation_size_preference.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Utils.R +\name{save_allocation_size_preference} +\alias{save_allocation_size_preference} +\alias{load_allocation_size_preference} +\alias{get_allocation_size_preference} +\alias{set_allocation_size_preference} +\title{Store allocation size preference} +\usage{ +save_allocation_size_preference(value) + +load_allocation_size_preference() + +get_allocation_size_preference() + +set_allocation_size_preference(value) +} +\arguments{ +\item{value}{A numeric value with the desired allocation size (in bytes).} +} +\value{ +For the setter, \code{TRUE} is returned invisibly but the function is invoked for the +side effect of storing the value. For the getters, the value as a numeric. 
+} +\description{ +Save (or load) allocation size default preference in an optional +config file +} +\details{ +When retrieving data from sparse arrays, allocation sizes cannot +be determined \emph{ex ante} as the degree of sparsity is unknown. +A configuration value can aide in providing an allocation size +value. These functions let the user store such a value for +retrieval by the package code. The preference will be encoded in +a configuration file as R (version 4.0.0 or later) allows user- +and package-specific configuration files. These helper functions +set and retrieve the value, respectively, or retrieve the cached +value from the package environment where it is set at package +load. + +The value will be stored as a character value and reparsed so +\sQuote{1e6} and \sQuote{1000000} are equivalent, and the fixed +(but adjustable) number of digits for numerical precision +\emph{use for formatting} will impact the writing. This should +have no effect on standard allocation sizes. +} +\note{ +This function requires R version 4.0.0 or later to utilise the per-user +config directory accessor function. For older R versions, a fallback from the +TileDB configuration object is used. +} From 3cd454fcdd667daebacc92281b7d4dd220817914 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Thu, 13 Jan 2022 15:52:27 -0600 Subject: [PATCH 3/4] clarify memory budget is per column --- R/TileDBArray.R | 10 +++++----- R/Utils.R | 5 ++++- R/zzz.R | 2 +- man/save_allocation_size_preference.Rd | 3 +++ 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/R/TileDBArray.R b/R/TileDBArray.R index 0fd4b76e69..05c9bd28cb 100644 --- a/R/TileDBArray.R +++ b/R/TileDBArray.R @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2017-2021 TileDB Inc. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -517,10 +517,10 @@ setMethod("[", "tiledb_array", allnullable <- attrnullable } - ## A preference can be set in a local per-user configuration file; if not value - ## is set a fallback from the TileDB config object is used. As that value is fairly - ## large we scale by the number of columns not use this amount per buffer - memory_budget <- get_allocation_size_preference() / length(allnames) + ## A preference can be set in a local per-user configuration file; if no value + ## is set a fallback from the TileDB config object is used. Note that this memory + ## budget (currently, at least) applies only to character columns. + memory_budget <- get_allocation_size_preference() if (length(enckey) > 0) { if (length(tstamp) > 0) { diff --git a/R/Utils.R b/R/Utils.R index 447077aab3..a501b97f21 100644 --- a/R/Utils.R +++ b/R/Utils.R @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2017-2021 TileDB Inc. +# Copyright (c) 2017-2022 TileDB Inc. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -105,6 +105,9 @@ set_return_as_preference <- function(value = c("asis", "array", "matrix", "data. ##' \emph{use for formatting} will impact the writing. This should ##' have no effect on standard allocation sizes. ##' +##' Note that this memory budget (currently, at least) applies only to +##' character columns. +##' ##' @note This function requires R version 4.0.0 or later to utilise the per-user ##' config directory accessor function. For older R versions, a fallback from the ##' TileDB configuration object is used. diff --git a/R/zzz.R b/R/zzz.R index deaa39e1d7..3352ed8748 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2017-2021 TileDB Inc. 
+# Copyright (c) 2017-2022 TileDB Inc. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/man/save_allocation_size_preference.Rd b/man/save_allocation_size_preference.Rd index 5e6612e77e..7af0b96311 100644 --- a/man/save_allocation_size_preference.Rd +++ b/man/save_allocation_size_preference.Rd @@ -43,6 +43,9 @@ The value will be stored as a character value and reparsed so (but adjustable) number of digits for numerical precision \emph{use for formatting} will impact the writing. This should have no effect on standard allocation sizes. + +Note that this memory budget (currently, at least) applies only to +character columns. } \note{ This function requires R version 4.0.0 or later to utilise the per-user From 5345d72fc39fb7210af59857d4c5436d2b707949 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Fri, 14 Jan 2022 14:11:05 -0600 Subject: [PATCH 4/4] set a fallback on load if no other value is set --- R/ArraySchema.R | 2 +- R/Utils.R | 24 +++++------------------- man/describe.Rd | 2 +- man/save_allocation_size_preference.Rd | 3 ++- 4 files changed, 9 insertions(+), 22 deletions(-) diff --git a/R/ArraySchema.R b/R/ArraySchema.R index a5f0045b8a..17e0e9e023 100644 --- a/R/ArraySchema.R +++ b/R/ArraySchema.R @@ -783,7 +783,7 @@ tiledb_schema_object <- function(array) { #' Describe a TileDB array schema via code to create it #' -#' @param array A TileDB Array object +#' @param arr A TileDB Array object #' @return Nothing is returned as the function is invoked for the side effect #' of printing the schema via a sequence of R instructions to re-create it. #' @export diff --git a/R/Utils.R b/R/Utils.R index a501b97f21..efd2fd3fec 100644 --- a/R/Utils.R +++ b/R/Utils.R @@ -106,7 +106,8 @@ set_return_as_preference <- function(value = c("asis", "array", "matrix", "data. ##' have no effect on standard allocation sizes. 
##' ##' Note that this memory budget (currently, at least) applies only to -##' character columns. +##' character columns. A fallback value of 50mb is used if no user +##' value is set. ##' ##' @note This function requires R version 4.0.0 or later to utilise the per-user ##' config directory accessor function. For older R versions, a fallback from the @@ -133,11 +134,8 @@ save_allocation_size_preference <- function(value) { ##' @rdname save_allocation_size_preference ##' @export load_allocation_size_preference <- function() { - value <- NA_integer_ # flag as unset - ## we cannot set this from the TileDB config at package load time as we - ## cannot yet call a package function (to access TileDB configuration) - ## while the package is loaded - cfgfile <- .defaultConfigFile() + value <- 50 * 1024 * 1024 # fallback value is 50mb + cfgfile <- .defaultConfigFile() # but check config file if (cfgfile != "" && file.exists(cfgfile)) { cfg <- read.dcf(cfgfile) if ("allocation_size" %in% colnames(cfg)) @@ -149,19 +147,7 @@ load_allocation_size_preference <- function() { ##' @rdname save_allocation_size_preference ##' @export -get_allocation_size_preference <- function() { - val <- .pkgenv[["allocation_size"]] - if (is.na(val)) { # no value was stored - ## we cannot set this from the TileDB config at package load time as we - ## cannot yet call a package function (to access TileDB configuration) - ## while the package is loaded - cfg <- tiledb_config() - val <- as.numeric(cfg["sm.memory_budget"]) - if (is.na(val)) val <- 1e5 - set_allocation_size_preference(val) - } - val -} +get_allocation_size_preference <- function() .pkgenv[["allocation_size"]] ##' @rdname save_allocation_size_preference ##' @export diff --git a/man/describe.Rd b/man/describe.Rd index 4e5a01c2a0..6454b81a65 100644 --- a/man/describe.Rd +++ b/man/describe.Rd @@ -7,7 +7,7 @@ describe(arr) } \arguments{ -\item{array}{A TileDB Array object} +\item{arr}{A TileDB Array object} } \value{ Nothing is returned as 
the function is invoked for the side effect diff --git a/man/save_allocation_size_preference.Rd b/man/save_allocation_size_preference.Rd index 7af0b96311..9a8eaa5aaa 100644 --- a/man/save_allocation_size_preference.Rd +++ b/man/save_allocation_size_preference.Rd @@ -45,7 +45,8 @@ The value will be stored as a character value and reparsed so have no effect on standard allocation sizes. Note that this memory budget (currently, at least) applies only to -character columns. +character columns. A fallback value of 50mb is used if no user +value is set. } \note{ This function requires R version 4.0.0 or later to utilise the per-user