Skip to content

Commit

Permalink
Merge pull request #50 from TileDB-Inc/aaronwolen/sc-17557/adopt-czi-…
Browse files Browse the repository at this point in the history
…default-layout-options

Use CZI default parameters for X/obs/var
  • Loading branch information
aaronwolen committed May 13, 2022
2 parents d70b6fc + 5eedbd7 commit d577e50
Show file tree
Hide file tree
Showing 7 changed files with 256 additions and 26 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Description: Store and retrieve single cell data using TileDB and the on-disk
format proposed in the Unified Single Cell Data Model and API. Users can
import from and export to in-memory formats used by popular toolchains like
Seurat and Bioconductor SingleCellExperiment.
Version: 0.1.0.9008
Version: 0.1.0.9009
Authors@R: c(
person(given = "Aaron",
family = "Wolen",
Expand Down
6 changes: 5 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@ See [TileDB 2.8 release notes](https://github.com/TileDB-Inc/TileDB/releases/tag
- `get_array()` has been replaced with `get_member()` which adds a `type` argument to filter by object type
- gain the following methods: `count_members()`, `list_members()`, `list_member_uris()`, and `add_member()`

### Other
### SCGroup

- the `scgroup_uris` argument has been dropped from `SCDataset`'s initialize method (`add_member()` should now be used instead to add additional `SCGroup`s)

### SCDataset

- `SCDataset`'s `scgroups` field is now an active binding that filters `members` for `SCGroup` objects

## Other changes
Expand All @@ -30,3 +33,4 @@ See [TileDB 2.8 release notes](https://github.com/TileDB-Inc/TileDB/releases/tag
* Internally group members are now added with names
* New internal `TileDBURI` class for handling various URI formats
* The `uri` field for all TileDB(Array|Group)-based classes is now an active binding that retrieves the URI from the private `tiledb_uri` field
* Several default parameters have been changed to store the `X`, `obs`, and `var` arrays more efficiently on disk (#50)
38 changes: 25 additions & 13 deletions R/AnnotationArray.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,7 @@ AnnotationArray <- R6::R6Class(
"'x' must contain column names matching the supplied index column(s)"
= all(index_cols %in% colnames(x))
)

if (self$verbose) {
msg <- sprintf(
"Creating new %s array with index [%s] at '%s'",
self$class(),
paste0(index_cols, collapse = ","),
self$uri
)
message(msg)
}
private$log_array_creation(index_cols)

tiledb::fromDataFrame(
obj = x,
Expand All @@ -61,12 +52,33 @@ AnnotationArray <- R6::R6Class(
# @description Ingest assay/annotation data into the TileDB array
# @param x A [`data.frame`]
ingest_data = function(x) {
if (self$verbose) {
message(glue::glue("Ingesting {self$class()} data into: {self$uri}"))
}
private$log_array_ingestion()
tdb_array <- tiledb::tiledb_array(self$uri, query_type = "WRITE")
tdb_array[] <- x
tiledb::tiledb_array_close(tdb_array)
},

# Emit a progress message describing the array about to be created.
# No-op unless `self$verbose` is TRUE.
# @param index_cols Character vector of index column names, joined with
#   commas in the message.
log_array_creation = function(index_cols) {
  if (!self$verbose) return(invisible(NULL))
  message(sprintf(
    "Creating new %s array with index [%s] at '%s'",
    self$class(),
    paste0(index_cols, collapse = ","),
    self$uri
  ))
},

# Emit a progress message naming the ingestion target URI.
# No-op unless `self$verbose` is TRUE.
log_array_ingestion = function() {
  if (self$verbose) {
    message(sprintf("Ingesting %s data into: %s", self$class(), self$uri))
  }
}
)
)
8 changes: 7 additions & 1 deletion R/AnnotationDataframe.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,13 @@ AnnotationDataframe <- R6::R6Class(
# convert rownames to a column
x[[index_col]] <- rownames(x)
if (!self$array_exists()) {
private$create_empty_array(x, index_col)
# TODO: Replace with configurable SOMAOptions class
capacity <- switch(basename(self$uri),
obs = 256L,
var = 2048L,
10000L
)
private$create_empty_array(x, index_col, capacity = capacity)
} else {
message(sprintf("Updating existing %s at '%s'", self$class(), self$uri))
}
Expand Down
92 changes: 83 additions & 9 deletions R/AssayMatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ AssayMatrix <- R6::R6Class(
#' @description Ingest assay data from a COO-formatted data frame
#' @param x a [`data.frame`]
#' @param index_cols A vector of two column identifiers, either numeric
#' (column positions) or character (column names), identifying the two
#' index columns. All other columns are ingested as attributes.
from_dataframe = function(x, index_cols) {
stopifnot(
Expand Down Expand Up @@ -104,21 +104,95 @@ AssayMatrix <- R6::R6Class(

private = list(

# @description Create an empty TileDB array with a schema optimized for 2D
# COO-formatted data.
# @param x A [`data.frame`] with the index columns plus at least one value
#   column; used here only to derive attribute names and TileDB types.
# @param index_cols Names of the 2 columns that become the array dimensions.
# @param cell_order,tile_order Cell and tile layout for the sparse schema.
# @param capacity Number of cells per tile in the sparse array.
create_empty_array = function(
x,
index_cols = c("obs_id", "var_id"),
cell_order = "ROW_MAJOR",
tile_order = "ROW_MAJOR",
capacity = 100000) {

# determine appropriate type for each attribute (every non-index column
# becomes a TileDB attribute)
value_cols <- setdiff(colnames(x), index_cols)
stopifnot(
"'x' must contain >=1 non-indexing columns" = length(value_cols) >= 1
)
value_types <- vapply_char(x[value_cols], tiledb::r_to_tiledb_type)

# array dimensions: both dims are variable-length ASCII (string dims take
# NULL domain/tile)
tdb_dims <- mapply(
FUN = tiledb::tiledb_dim,
name = index_cols,
MoreArgs = list(
type = "ASCII",
domain = NULL,
tile = NULL
),
SIMPLIFY = FALSE
)

# run-length encode the first dimension's coordinates
tiledb::filter_list(tdb_dims[[1]]) <- tiledb::tiledb_filter_list(
tiledb::tiledb_filter("RLE")
)

# zstd-compress the second dimension's coordinates
# TODO: Make zstd compression level configurable, currently using same
# default as core: https://github.com/TileDB-Inc/TileDB/blob/56644c1e94fcba26d07a608112fdcdf3fd120ba8/tiledb/sm/filter/compression_filter.h#L154
tiledb::filter_list(tdb_dims[[2]]) <- tiledb::tiledb_filter_list(
tiledb::tiledb_filter_set_option(
object = tiledb::tiledb_filter("ZSTD"),
option = "COMPRESSION_LEVEL",
value = 3L
)
)

# array attributes: one zstd filter shared by every value column
tdb_attr_filter <- tiledb::tiledb_filter_set_option(
object = tiledb::tiledb_filter("ZSTD"),
option = "COMPRESSION_LEVEL",
value = 3L
)

tdb_attrs <- mapply(
FUN = tiledb::tiledb_attr,
name = value_cols,
type = value_types,
MoreArgs = list(
filter_list = tiledb::tiledb_filter_list(tdb_attr_filter),
ctx = self$ctx
),
SIMPLIFY = FALSE
)

# array schema: sparse, with delta/bit-width/zstd filters applied to the
# offsets of the variable-length dimensions
tdb_schema <- tiledb::tiledb_array_schema(
domain = tiledb::tiledb_domain(tdb_dims),
attrs = tdb_attrs,
cell_order = cell_order,
tile_order = tile_order,
sparse = TRUE,
capacity = capacity,
offsets_filter_list = tiledb::tiledb_filter_list(c(
tiledb::tiledb_filter("DOUBLE_DELTA"),
tiledb::tiledb_filter("BIT_WIDTH_REDUCTION"),
tiledb::tiledb_filter("ZSTD")
))
)

private$log_array_creation(index_cols)
tiledb::tiledb_array_create(uri = self$uri, schema = tdb_schema)
},

# @description Ingest assay data into the TileDB array.
# @param x A [`data.frame`] containing the assay data.
# @param index_cols Character vector with column names to use as index
#   (retained for interface compatibility; the index is fixed by the
#   schema built in `create_empty_array()`).
ingest_data = function(x, index_cols) {
  stopifnot(
    "Assay data must be a data.frame" = is.data.frame(x)
  )

  private$log_array_ingestion()
  tdb_array <- tiledb::tiledb_array(self$uri, query_type = "WRITE")
  tdb_array[] <- x
  # close the write handle after ingesting, matching
  # AnnotationArray$ingest_data(), so the open array resource is released
  tiledb::tiledb_array_close(tdb_array)
}
)
)
134 changes: 134 additions & 0 deletions inst/bench/run-benchmarks.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# Benchmark tiledbsc ingestion across SeuratData datasets, recording the
# in-memory object size, on-disk RDS size, resulting TileDB array size,
# and wall-clock ingestion time for each dataset.
library(tiledbsc)
library(lobstr)
library(tiledb)
library(SeuratObject)

# devtools::install_github('satijalab/seurat-data')
library(SeuratData)

# parameters -------------------------------------------------------------------

# where the new arrays will be stored
array_dir <- "dev/data/arrays"

# where the benchmark results will be stored
output_dir <- "dev/data/benchmarks"

# should the new arrays be deleted after the benchmark?
clean <- FALSE

# should datasets be installed if they are not already?
install_missing <- TRUE


# outputs ----------------------------------------------------------------------

# tag outputs with the run timestamp and package versions so results from
# different library versions are comparable
output_metadata <- sprintf(
  "%s_tiledbsc%s_tiledb%s",
  format(Sys.time(), "%Y%m%d-%H%M"),
  packageVersion("tiledbsc"),
  packageVersion("tiledb")
)

output_name <- paste0("ingestion-benchmarks_", output_metadata, ".csv")

output_file <- file.path(output_dir, output_name)

# create parent directory using file metadata
array_dir <- file.path(array_dir, output_metadata)
dir.create(array_dir, showWarnings = FALSE, recursive = TRUE)
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

# setup datasets ---------------------------------------------------------------
datasets <- AvailableData()

# remove datasets with Versions < 2.0.0 to avoid the following error:
# Not a validObject(): no slot of name "images" for this object of class "Seurat"
datasets <- subset(datasets, Version >= package_version("2.0.0"))

# skip datasets that cause errors
blocklist <- c(
  # Not a validObject(): no slot of name "images" for this object of class "Seurat"
  # I believe this error is specific to datasets with Versions < 2.0.0
  "humancortexref.SeuratData",
  "kidneyref.SeuratData",

  # Datasets that are already installed but LoadData() errors with:
  # Error: Could not find dataset '<>', please check manifest and try again
  "bonemarrowref.SeuratData",

  # Failed to download
  "lungref.SeuratData"
)

# Temporarily subset to at most the first 10 datasets. seq_len() is used
# because 1:min(10L, nrow(datasets)) would yield c(1, 0) -- and thus invalid
# indexing -- if the filtered table were empty.
datasets <- datasets[seq_len(min(10L, nrow(datasets))), ]

# setup results ----------------------------------------------------------------

# initialize results dataframe; one row per dataset is appended by row name
benchmarks <- data.frame(
  size_memory = numeric(),
  size_rds = numeric(),
  size_tiledb = numeric(),
  ingest_time = numeric()
)

# main -------------------------------------------------------------------------
for (ds_name in rownames(datasets)) {
  message(sprintf("Dataset: %s", ds_name))
  if (ds_name %in% blocklist) {
    message(sprintf("..skipping blocklisted dataset: %s", ds_name))
    next
  }

  if (!datasets[ds_name, "Installed"]) {
    # decide whether to install: prompt when interactive, otherwise honor
    # the install_missing flag
    skip <- FALSE
    if (interactive()) {
      if (menu(c("yes", "no"), sprintf("Install %s?", ds_name)) == 2) skip <- TRUE
    } else {
      if (!install_missing) skip <- TRUE
    }

    if (skip) {
      message(sprintf("..skipping dataset not installed: %s", ds_name))
      next
    }

    message(sprintf("..installing dataset '%s'", ds_name))
    install_worked <- try(InstallData(ds_name), silent = TRUE)
    if (inherits(install_worked, "try-error")) {
      message(sprintf("..failed to install dataset '%s'", ds_name))
      next
    }
  }

  message("..loading dataset")
  ds <- LoadData(ds_name)

  uri <- file.path(array_dir, ds_name)
  message(sprintf("..ingesting data into '%s'", uri))

  ingest_start <- Sys.time()
  scgroup <- SCGroup$new(uri = uri)
  scgroup$from_seurat_assay(
    object = ds[[DefaultAssay(ds)]],
    obs = ds[[]]
  )

  benchmarks[ds_name, ] <- list(
    size_memory = as.numeric(obj_size(ds)),
    size_rds = tiledb_vfs_dir_size(system.file("data", package = ds_name)),
    size_tiledb = tiledb_vfs_dir_size(uri),
    ingest_time = as.numeric(difftime(Sys.time(), ingest_start, units = "secs"))
  )

  # write each time to avoid losing data on failures
  write.csv(benchmarks, file = output_file, quote = FALSE)

  if (clean) {
    message(sprintf("..cleaning dataset '%s'", ds_name))
    tiledb_vfs_remove_dir(uri)
  }
}
2 changes: 1 addition & 1 deletion man/AssayMatrix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit d577e50

Please sign in to comment.