Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use CZI default parameters for X/obs/var #50

Merged
merged 12 commits into from
May 13, 2022
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Description: Store and retrieve single cell data using TileDB and the on-disk
format proposed in the Unified Single Cell Data Model and API. Users can
import from and export to in-memory formats used by popular toolchains like
Seurat and Bioconductor SingleCellExperiment.
Version: 0.1.0.9008
Version: 0.1.0.9009
Authors@R: c(
person(given = "Aaron",
family = "Wolen",
Expand Down
6 changes: 5 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@ See [TileDB 2.8 release notes](https://github.com/TileDB-Inc/TileDB/releases/tag
- `get_array()` has been replaced with `get_member()` which adds a `type` argument to filter by object type
- gain the following methods: `count_members()`, `list_members()`, `list_member_uris()`, and `add_member()`

### Other
### SCGroup

- the `scgroup_uris` argument has been dropped from `SCDataset`'s initialize method (`add_member()` should now be used instead to add additional `SCGroup`s)

### SCDataset

- `SCDataset`'s `scgroups` field is now an active binding that filters `members` for `SCGroup` objects

## Other changes
Expand All @@ -30,3 +33,4 @@ See [TileDB 2.8 release notes](https://github.com/TileDB-Inc/TileDB/releases/tag
* Internally group members are now added with names
* New internal `TileDBURI` class for handling various URI formats
* The `uri` field for all TileDB(Array|Group)-based classes is now an active binding that retrieves the URI from the private `tiledb_uri` field
* Several default parameters have been changed to store the `X`, `obs`, and `var` arrays more efficiently on disk (#50)
38 changes: 25 additions & 13 deletions R/AnnotationArray.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,7 @@ AnnotationArray <- R6::R6Class(
"'x' must contain column names matching the supplied index column(s)"
= all(index_cols %in% colnames(x))
)

if (self$verbose) {
msg <- sprintf(
"Creating new %s array with index [%s] at '%s'",
self$class(),
paste0(index_cols, collapse = ","),
self$uri
)
message(msg)
}
private$log_array_creation(index_cols)

tiledb::fromDataFrame(
obj = x,
Expand All @@ -61,12 +52,33 @@ AnnotationArray <- R6::R6Class(
# @description Ingest assay/annotation data into the TileDB array
# @param x A [`data.frame`] whose columns are written as array attributes
ingest_data = function(x) {
  private$log_array_ingestion()
  arr <- tiledb::tiledb_array(self$uri, query_type = "WRITE")
  arr[] <- x
  # release the write handle once the data has been submitted
  tiledb::tiledb_array_close(arr)
},

# @description Emit a creation message when verbose mode is enabled.
# @param index_cols Character vector of index column names included in the
#   message for context
log_array_creation = function(index_cols) {
  # quiet mode: nothing to report
  if (!self$verbose) return(invisible(NULL))
  message(sprintf(
    "Creating new %s array with index [%s] at '%s'",
    self$class(),
    paste0(index_cols, collapse = ","),
    self$uri
  ))
},

# @description Emit an ingestion message when verbose mode is enabled.
log_array_ingestion = function() {
  # quiet mode: nothing to report
  if (!self$verbose) return(invisible(NULL))
  message(sprintf(
    "Ingesting %s data into: %s",
    self$class(),
    self$uri
  ))
}
)
)
8 changes: 7 additions & 1 deletion R/AnnotationDataframe.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,13 @@ AnnotationDataframe <- R6::R6Class(
# convert rownames to a column
x[[index_col]] <- rownames(x)
if (!self$array_exists()) {
private$create_empty_array(x, index_col)
# TODO: Replace with configurable SOMAOptions class
capacity <- switch(basename(self$uri),
obs = 256L,
var = 2048L,
10000L
)
private$create_empty_array(x, index_col, capacity = capacity)
} else {
message(sprintf("Updating existing %s at '%s'", self$class(), self$uri))
}
Expand Down
92 changes: 83 additions & 9 deletions R/AssayMatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ AssayMatrix <- R6::R6Class(
#' @description Ingest assay data from a COO-formatted data frame
#' @param x a [`data.frame`]
#' @param index_cols A column index, either numeric with a column index, or
#' character with a column name, designating one or more index columns. All
#' character with a column name, identifying the 2 index columns. All
#' other columns are ingested as attributes.
from_dataframe = function(x, index_cols) {
stopifnot(
Expand Down Expand Up @@ -104,21 +104,95 @@ AssayMatrix <- R6::R6Class(

private = list(

# @description Create an empty TileDB array with a schema optimized for 2D
# COO-formatted data.
# @param x A [`data.frame`] used to infer the TileDB type of each
#   non-index (attribute) column
# @param index_cols Character vector naming exactly 2 columns of `x` that
#   become the array's string dimensions
# @param cell_order,tile_order Layout of cells within a tile and of tiles
#   within the array
# @param capacity Maximum number of cells per tile
create_empty_array = function(
  x,
  index_cols = c("obs_id", "var_id"),
  cell_order = "ROW_MAJOR",
  tile_order = "ROW_MAJOR",
  capacity = 100000) {

  # the dimension filters below index tdb_dims[[1]]/[[2]] directly, so fail
  # fast with a clear message instead of an opaque subscript error
  stopifnot(
    "'index_cols' must specify exactly 2 index columns" =
      length(index_cols) == 2
  )

  # determine appropriate type for each attribute
  value_cols <- setdiff(colnames(x), index_cols)
  stopifnot(
    "'x' must contain >=1 non-indexing columns" = length(value_cols) >= 1
  )
  value_types <- vapply_char(x[value_cols], tiledb::r_to_tiledb_type)

  # array dimensions: variable-length ASCII dims (domain/tile are NULL)
  tdb_dims <- mapply(
    FUN = tiledb::tiledb_dim,
    name = index_cols,
    MoreArgs = list(
      type = "ASCII",
      domain = NULL,
      tile = NULL
    ),
    SIMPLIFY = FALSE
  )

  # first index column (obs_id by default) is run-length encoded
  tiledb::filter_list(tdb_dims[[1]]) <- tiledb::tiledb_filter_list(
    tiledb::tiledb_filter("RLE")
  )

  # TODO: Make zstd compression level configurable, currently using same
  # default as core: https://github.com/TileDB-Inc/TileDB/blob/56644c1e94fcba26d07a608112fdcdf3fd120ba8/tiledb/sm/filter/compression_filter.h#L154
  tiledb::filter_list(tdb_dims[[2]]) <- tiledb::tiledb_filter_list(
    tiledb::tiledb_filter_set_option(
      object = tiledb::tiledb_filter("ZSTD"),
      option = "COMPRESSION_LEVEL",
      value = 3L
    )
  )

  # array attributes: every value column is zstd-compressed at the same level
  tdb_attr_filter <- tiledb::tiledb_filter_set_option(
    object = tiledb::tiledb_filter("ZSTD"),
    option = "COMPRESSION_LEVEL",
    value = 3L
  )

  tdb_attrs <- mapply(
    FUN = tiledb::tiledb_attr,
    name = value_cols,
    type = value_types,
    MoreArgs = list(
      filter_list = tiledb::tiledb_filter_list(tdb_attr_filter),
      ctx = self$ctx
    ),
    SIMPLIFY = FALSE
  )

  # array schema: sparse array with compressed variable-length offsets
  tdb_schema <- tiledb::tiledb_array_schema(
    domain = tiledb::tiledb_domain(tdb_dims),
    attrs = tdb_attrs,
    cell_order = cell_order,
    tile_order = tile_order,
    sparse = TRUE,
    capacity = capacity,
    offsets_filter_list = tiledb::tiledb_filter_list(c(
      tiledb::tiledb_filter("DOUBLE_DELTA"),
      tiledb::tiledb_filter("BIT_WIDTH_REDUCTION"),
      tiledb::tiledb_filter("ZSTD")
    ))
  )

  private$log_array_creation(index_cols)
  tiledb::tiledb_array_create(uri = self$uri, schema = tdb_schema)
},

# @description Ingest assay data into the TileDB array.
# @param x A [`data.frame`] containing the assay data.
# @param index_cols Character vector with column names to use as index
#   (not referenced in the body; the dimensions were fixed when the array
#   schema was created)
ingest_data = function(x, index_cols) {
  stopifnot(
    "Assay data must be a data.frame" = is.data.frame(x)
  )

  private$log_array_ingestion()
  tdb_array <- tiledb::tiledb_array(self$uri, query_type = "WRITE")
  tdb_array[] <- x
  # close the write handle so it isn't left open after ingestion
  # (consistent with AnnotationArray's ingest_data)
  tiledb::tiledb_array_close(tdb_array)
}
)
)
134 changes: 134 additions & 0 deletions inst/bench/run-benchmarks.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# Benchmark script: ingest SeuratData datasets into tiledbsc SCGroups and
# record, per dataset, the in-memory object size, the on-disk RDS size, the
# on-disk TileDB size, and the ingestion time. Results are written to a
# timestamped CSV after every dataset so partial runs are not lost.
library(tiledbsc)
library(lobstr)
library(tiledb)
library(SeuratObject)

# devtools::install_github('satijalab/seurat-data')
library(SeuratData)

# parameters -------------------------------------------------------------------

# where the new arrays will be stored
array_dir <- "dev/data/arrays"

# where the benchmark results will be stored
output_dir <- "dev/data/benchmarks"

# should the new arrays be deleted after the benchmark?
clean <- FALSE

# should datasets be installed if they are not already?
install_missing <- TRUE


# outputs ----------------------------------------------------------------------

# create the output file
# encode run timestamp plus tiledbsc/tiledb versions so result files from
# different package versions can be compared side by side
output_metadata <- sprintf(
  "%s_tiledbsc%s_tiledb%s",
  format(Sys.time(),"%Y%m%d-%H%M"),
  packageVersion("tiledbsc"),
  packageVersion("tiledb")
)

output_name <- paste0("ingestion-benchmarks_", output_metadata, ".csv")

output_file <- file.path(output_dir, output_name)

# create parent directory using file metadata
array_dir <- file.path(array_dir, output_metadata)
dir.create(array_dir, showWarnings = FALSE, recursive = TRUE)
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

# setup datasets ---------------------------------------------------------------
datasets <- AvailableData()

# remove datasets with Versions < 2.0.0 to avoid the following error:
# Not a validObject(): no slot of name "images" for this object of class "Seurat"
datasets <- subset(datasets, Version >= package_version("2.0.0"))

# skip datasets that cause errors
blocklist <- c(
  # Not a validObject(): no slot of name "images" for this object of class "Seurat"
  # I believe this error is specific to datasets with Versions < 2.0.0
  "humancortexref.SeuratData",
  "kidneyref.SeuratData",

  # Datasets that are already installed but LoadData() errors with:
  # Error: Could not find dataset '<>', please check manifest and try again
  "bonemarrowref.SeuratData",

  # Failed to download
  "lungref.SeuratData"
)

# Temporarily subset
datasets <- datasets[1:min(10L, nrow(datasets)),]

# setup results ----------------------------------------------------------------

# initialize results dataframe
# rows are added one at a time, keyed by dataset name
benchmarks <- data.frame(
  size_memory = numeric(),
  size_rds = numeric(),
  size_tiledb = numeric(),
  ingest_time = numeric()
)

# main -------------------------------------------------------------------------
for (ds_name in rownames(datasets)) {
  message(sprintf("Dataset: %s", ds_name))
  if (ds_name %in% blocklist) {
    message(sprintf("..skipping blocklisted dataset: %s", ds_name))
    next
  }

  # install missing datasets: prompt when interactive, otherwise fall back
  # to the install_missing flag
  if (!datasets[ds_name, "Installed"]) {
    skip <- FALSE
    if (interactive()) {
      if (menu(c("yes", "no"), sprintf("Install %s?", ds_name)) == 2) skip <- TRUE
    } else {
      if (!install_missing) skip <- TRUE
    }

    if (skip) {
      message(sprintf("..skipping dataset not installed: %s", ds_name))
      next
    }

    message(sprintf("..installing dataset '%s'", ds_name))
    # wrap in try() so one failed download doesn't abort the whole run
    install_worked <- try(InstallData(ds_name), silent = TRUE)
    if (inherits(install_worked, "try-error")) {
      message(sprintf("..failed to install dataset '%s'", ds_name))
      next
    }
  }

  message("..loading dataset")
  ds <- LoadData(ds_name)

  uri <- file.path(array_dir, ds_name)
  message(sprintf("..ingesting data into '%s'", uri))

  # time only the SCGroup creation/ingestion step
  ingest_start <- Sys.time()
  scgroup <- SCGroup$new(uri = uri)
  scgroup$from_seurat_assay(
    object = ds[[DefaultAssay(ds)]],
    obs = ds[[]]
  )

  # on-disk sizes are measured via the TileDB VFS for both the source RDS
  # data and the newly written arrays
  benchmarks[ds_name, ] <- list(
    size_memory = as.numeric(obj_size(ds)),
    size_rds = tiledb_vfs_dir_size(system.file("data", package = ds_name)),
    size_tiledb = tiledb_vfs_dir_size(uri),
    ingest_time = as.numeric(difftime(Sys.time(), ingest_start, units = "secs"))
  )

  # write each time to avoid losing data on failures
  write.csv(benchmarks, file = output_file, quote = FALSE)

  if (clean) {
    message(sprintf("..cleaning dataset '%s'", ds_name))
    tiledb_vfs_remove_dir(uri)
  }
}
2 changes: 1 addition & 1 deletion man/AssayMatrix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.