Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use CZI default parameters for X/obs/var #50

Merged
merged 12 commits into from
May 13, 2022
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Description: Store and retrieve single cell data using TileDB and the on-disk
format proposed in the Unified Single Cell Data Model and API. Users can
import from and export to in-memory formats used by popular toolchains like
Seurat and Bioconductor SingleCellExperiment.
Version: 0.1.0.9008
Version: 0.1.0.9009
Authors@R: c(
person(given = "Aaron",
family = "Wolen",
Expand Down
6 changes: 5 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@ See [TileDB 2.8 release notes](https://github.com/TileDB-Inc/TileDB/releases/tag
- `get_array()` has been replaced with `get_member()` which adds a `type` argument to filter by object type
- gain the following methods: `count_members()`, `list_members()`, `list_member_uris()`, and `add_member()`

### Other
### SCGroup

- the `scgroup_uris` argument has been dropped from `SCDataset`'s initialize method (`add_member()` should now be used instead to add additional `SCGroup`s)

### SCDataset

- `SCDataset`'s `scgroups` field is now an active binding that filters `members` for `SCGroup` objects

## Other changes
Expand All @@ -30,3 +33,4 @@ See [TileDB 2.8 release notes](https://github.com/TileDB-Inc/TileDB/releases/tag
* Internally group members are now added with names
* New internal `TileDBURI` class for handling various URI formats
* The `uri` field for all TileDB(Array|Group)-based classes is now an active binding that retrieves the URI from the private `tiledb_uri` field
* Several default parameters have been changed to store the `X`, `obs`, and `var` arrays more efficiently on disk (#50)
38 changes: 25 additions & 13 deletions R/AnnotationArray.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,7 @@ AnnotationArray <- R6::R6Class(
"'x' must contain column names matching the supplied index column(s)"
= all(index_cols %in% colnames(x))
)

if (self$verbose) {
msg <- sprintf(
"Creating new %s array with index [%s] at '%s'",
self$class(),
paste0(index_cols, collapse = ","),
self$uri
)
message(msg)
}
private$log_array_creation(index_cols)

tiledb::fromDataFrame(
obj = x,
Expand All @@ -61,12 +52,33 @@ AnnotationArray <- R6::R6Class(
# @description Ingest assay/annotation data into the TileDB array
# @param x A [`data.frame`] whose columns are written as array attributes
ingest_data = function(x) {
  private$log_array_ingestion()
  arr <- tiledb::tiledb_array(self$uri, query_type = "WRITE")
  arr[] <- x
  # release the write handle once the data has been submitted
  tiledb::tiledb_array_close(arr)
},

# @description Emit a creation message when verbose mode is enabled.
# @param index_cols Character vector of index column names included in the
#   message for context
log_array_creation = function(index_cols) {
  # quiet mode: nothing to report
  if (!self$verbose) return(invisible(NULL))
  message(sprintf(
    "Creating new %s array with index [%s] at '%s'",
    self$class(),
    paste0(index_cols, collapse = ","),
    self$uri
  ))
},

# @description Emit an ingestion message when verbose mode is enabled.
log_array_ingestion = function() {
  # quiet mode: nothing to report
  if (!self$verbose) return(invisible(NULL))
  message(sprintf(
    "Ingesting %s data into: %s",
    self$class(),
    self$uri
  ))
}
)
)
8 changes: 7 additions & 1 deletion R/AnnotationDataframe.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,13 @@ AnnotationDataframe <- R6::R6Class(
# convert rownames to a column
x[[index_col]] <- rownames(x)
if (!self$array_exists()) {
private$create_empty_array(x, index_col)
# TODO: Replace with configurable SOMAOptions class
capacity <- switch(basename(self$uri),
obs = 256L,
var = 2048L,
10000L
)
private$create_empty_array(x, index_col, capacity = capacity)
} else {
message(sprintf("Updating existing %s at '%s'", self$class(), self$uri))
}
Expand Down
92 changes: 83 additions & 9 deletions R/AssayMatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ AssayMatrix <- R6::R6Class(
#' @description Ingest assay data from a COO-formatted data frame
#' @param x a [`data.frame`]
#' @param index_cols A column index, either numeric with a column index, or
#' character with a column name, designating one or more index columns. All
#' character with a column name, identifying the 2 index columns. All
#' other columns are ingested as attributes.
from_dataframe = function(x, index_cols) {
stopifnot(
Expand Down Expand Up @@ -104,21 +104,95 @@ AssayMatrix <- R6::R6Class(

private = list(

# @description Create an empty TileDB array with a schema optimized for 2D
# COO-formatted data.
# @param x A [`data.frame`] used to infer the TileDB type of each
#   non-index (attribute) column
# @param index_cols Character vector naming exactly 2 columns of `x` that
#   become the array's string dimensions
# @param cell_order,tile_order Layout of cells within a tile and of tiles
#   within the array
# @param capacity Maximum number of cells per tile
create_empty_array = function(
  x,
  index_cols = c("obs_id", "var_id"),
  cell_order = "ROW_MAJOR",
  tile_order = "ROW_MAJOR",
  capacity = 100000) {

  # the dimension filters below index tdb_dims[[1]]/[[2]] directly, so fail
  # fast with a clear message instead of an opaque subscript error
  stopifnot(
    "'index_cols' must specify exactly 2 index columns" =
      length(index_cols) == 2
  )

  # determine appropriate type for each attribute
  value_cols <- setdiff(colnames(x), index_cols)
  stopifnot(
    "'x' must contain >=1 non-indexing columns" = length(value_cols) >= 1
  )
  value_types <- vapply_char(x[value_cols], tiledb::r_to_tiledb_type)

  # array dimensions: variable-length ASCII dims (domain/tile are NULL)
  tdb_dims <- mapply(
    FUN = tiledb::tiledb_dim,
    name = index_cols,
    MoreArgs = list(
      type = "ASCII",
      domain = NULL,
      tile = NULL
    ),
    SIMPLIFY = FALSE
  )

  # first index column (obs_id by default) is run-length encoded
  tiledb::filter_list(tdb_dims[[1]]) <- tiledb::tiledb_filter_list(
    tiledb::tiledb_filter("RLE")
  )

  # TODO: Make zstd compression level configurable, currently using same
  # default as core: https://github.com/TileDB-Inc/TileDB/blob/56644c1e94fcba26d07a608112fdcdf3fd120ba8/tiledb/sm/filter/compression_filter.h#L154
  tiledb::filter_list(tdb_dims[[2]]) <- tiledb::tiledb_filter_list(
    tiledb::tiledb_filter_set_option(
      object = tiledb::tiledb_filter("ZSTD"),
      option = "COMPRESSION_LEVEL",
      value = 3L
    )
  )

  # array attributes: every value column is zstd-compressed at the same level
  tdb_attr_filter <- tiledb::tiledb_filter_set_option(
    object = tiledb::tiledb_filter("ZSTD"),
    option = "COMPRESSION_LEVEL",
    value = 3L
  )

  tdb_attrs <- mapply(
    FUN = tiledb::tiledb_attr,
    name = value_cols,
    type = value_types,
    MoreArgs = list(
      filter_list = tiledb::tiledb_filter_list(tdb_attr_filter),
      ctx = self$ctx
    ),
    SIMPLIFY = FALSE
  )

  # array schema: sparse array with compressed variable-length offsets
  tdb_schema <- tiledb::tiledb_array_schema(
    domain = tiledb::tiledb_domain(tdb_dims),
    attrs = tdb_attrs,
    cell_order = cell_order,
    tile_order = tile_order,
    sparse = TRUE,
    capacity = capacity,
    offsets_filter_list = tiledb::tiledb_filter_list(c(
      tiledb::tiledb_filter("DOUBLE_DELTA"),
      tiledb::tiledb_filter("BIT_WIDTH_REDUCTION"),
      tiledb::tiledb_filter("ZSTD")
    ))
  )

  private$log_array_creation(index_cols)
  tiledb::tiledb_array_create(uri = self$uri, schema = tdb_schema)
},

# @description Ingest assay data into the TileDB array.
# @param x A [`data.frame`] containing the assay data.
# @param index_cols Character vector with column names to use as index
#   (not referenced in the body; the dimensions were fixed when the array
#   schema was created)
ingest_data = function(x, index_cols) {
  stopifnot(
    "Assay data must be a data.frame" = is.data.frame(x)
  )

  private$log_array_ingestion()
  tdb_array <- tiledb::tiledb_array(self$uri, query_type = "WRITE")
  tdb_array[] <- x
  # close the write handle so it isn't left open after ingestion
  # (consistent with AnnotationArray's ingest_data)
  tiledb::tiledb_array_close(tdb_array)
}
)
)
134 changes: 134 additions & 0 deletions inst/bench/run-benchmarks.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# Benchmark script: ingest SeuratData datasets into tiledbsc SCGroups and
# record, per dataset, the in-memory object size, the on-disk RDS size, the
# on-disk TileDB size, and the ingestion time. Results are written to a
# timestamped CSV after every dataset so partial runs are not lost.
library(tiledbsc)
library(lobstr)
library(tiledb)
library(SeuratObject)

# devtools::install_github('satijalab/seurat-data')
library(SeuratData)

# parameters -------------------------------------------------------------------

# where the new arrays will be stored
array_dir <- "dev/data/arrays"

# where the benchmark results will be stored
output_dir <- "dev/data/benchmarks"

# should the new arrays be deleted after the benchmark?
clean <- FALSE

# should datasets be installed if they are not already?
install_missing <- TRUE


# outputs ----------------------------------------------------------------------

# create the output file
# encode run timestamp plus tiledbsc/tiledb versions so result files from
# different package versions can be compared side by side
output_metadata <- sprintf(
  "%s_tiledbsc%s_tiledb%s",
  format(Sys.time(),"%Y%m%d-%H%M"),
  packageVersion("tiledbsc"),
  packageVersion("tiledb")
)

output_name <- paste0("ingestion-benchmarks_", output_metadata, ".csv")

output_file <- file.path(output_dir, output_name)

# create parent directory using file metadata
array_dir <- file.path(array_dir, output_metadata)
dir.create(array_dir, showWarnings = FALSE, recursive = TRUE)
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

# setup datasets ---------------------------------------------------------------
datasets <- AvailableData()

# remove datasets with Versions < 2.0.0 to avoid the following error:
# Not a validObject(): no slot of name "images" for this object of class "Seurat"
datasets <- subset(datasets, Version >= package_version("2.0.0"))

# skip datasets that cause errors
blocklist <- c(
  # Not a validObject(): no slot of name "images" for this object of class "Seurat"
  # I believe this error is specific to datasets with Versions < 2.0.0
  "humancortexref.SeuratData",
  "kidneyref.SeuratData",

  # Datasets that are already installed but LoadData() errors with:
  # Error: Could not find dataset '<>', please check manifest and try again
  "bonemarrowref.SeuratData",

  # Failed to download
  "lungref.SeuratData"
)

# Temporarily subset
datasets <- datasets[1:min(10L, nrow(datasets)),]

# setup results ----------------------------------------------------------------

# initialize results dataframe
# rows are added one at a time, keyed by dataset name
benchmarks <- data.frame(
  size_memory = numeric(),
  size_rds = numeric(),
  size_tiledb = numeric(),
  ingest_time = numeric()
)

# main -------------------------------------------------------------------------
for (ds_name in rownames(datasets)) {
  message(sprintf("Dataset: %s", ds_name))
  if (ds_name %in% blocklist) {
    message(sprintf("..skipping blocklisted dataset: %s", ds_name))
    next
  }

  # install missing datasets: prompt when interactive, otherwise fall back
  # to the install_missing flag
  if (!datasets[ds_name, "Installed"]) {
    skip <- FALSE
    if (interactive()) {
      if (menu(c("yes", "no"), sprintf("Install %s?", ds_name)) == 2) skip <- TRUE
    } else {
      if (!install_missing) skip <- TRUE
    }

    if (skip) {
      message(sprintf("..skipping dataset not installed: %s", ds_name))
      next
    }

    message(sprintf("..installing dataset '%s'", ds_name))
    # wrap in try() so one failed download doesn't abort the whole run
    install_worked <- try(InstallData(ds_name), silent = TRUE)
    if (inherits(install_worked, "try-error")) {
      message(sprintf("..failed to install dataset '%s'", ds_name))
      next
    }
  }

  message("..loading dataset")
  ds <- LoadData(ds_name)

  uri <- file.path(array_dir, ds_name)
  message(sprintf("..ingesting data into '%s'", uri))

  # time only the SCGroup creation/ingestion step
  ingest_start <- Sys.time()
  scgroup <- SCGroup$new(uri = uri)
  scgroup$from_seurat_assay(
    object = ds[[DefaultAssay(ds)]],
    obs = ds[[]]
  )

  # on-disk sizes are measured via the TileDB VFS for both the source RDS
  # data and the newly written arrays
  benchmarks[ds_name, ] <- list(
    size_memory = as.numeric(obj_size(ds)),
    size_rds = tiledb_vfs_dir_size(system.file("data", package = ds_name)),
    size_tiledb = tiledb_vfs_dir_size(uri),
    ingest_time = as.numeric(difftime(Sys.time(), ingest_start, units = "secs"))
  )

  # write each time to avoid losing data on failures
  write.csv(benchmarks, file = output_file, quote = FALSE)

  if (clean) {
    message(sprintf("..cleaning dataset '%s'", ds_name))
    tiledb_vfs_remove_dir(uri)
  }
}
2 changes: 1 addition & 1 deletion man/AssayMatrix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.