Skip to content

Commit

Permalink
Add higher-level support for writing / reading sparse matrices (#209)
Browse files Browse the repository at this point in the history
* sparse matrix support for arrays

* add initial unit test file

* also commit Suggests: Matrix

* protect sparse matrix test against old Matrix versions

* expanded documentation and an example
  • Loading branch information
eddelbuettel committed Feb 15, 2021
1 parent 374dc0b commit b498493
Show file tree
Hide file tree
Showing 7 changed files with 217 additions and 3 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Expand Up @@ -24,7 +24,7 @@ SystemRequirements: cmake (only when TileDB source build selected),
option to build or download was specified by the user.
Imports: methods, Rcpp, nanotime
LinkingTo: Rcpp
Suggests: tinytest, rmarkdown, knitr, minidown, curl, bit64
Suggests: tinytest, rmarkdown, knitr, minidown, curl, bit64, Matrix
VignetteBuilder: knitr
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Expand Up @@ -37,6 +37,7 @@ export(domain)
export(extended)
export(filter_list)
export(fromDataFrame)
export(fromSparseMatrix)
export(has_attribute)
export(is.anonymous)
export(is.integral)
Expand Down Expand Up @@ -178,6 +179,7 @@ export(tiledb_vfs_remove_file)
export(tiledb_vfs_sync)
export(tiledb_vfs_touch)
export(tiledb_vfs_write)
export(toSparseMatrix)
exportClasses(tiledb_array)
exportClasses(tiledb_array_schema)
exportClasses(tiledb_attr)
Expand Down Expand Up @@ -232,6 +234,7 @@ exportMethods(tiledb_ndim)
import(nanotime)
importFrom(Rcpp,sourceCpp)
importFrom(methods,.hasSlot)
importFrom(methods,as)
importFrom(methods,is)
importFrom(methods,new)
importFrom(methods,setClass)
Expand Down
2 changes: 1 addition & 1 deletion R/DataFrame.R
Expand Up @@ -49,7 +49,7 @@
##' default is \dQuote{COL_MAJOR}.
##' @param filter A character variable vector, defaults to \sQuote{ZSTD}, for
##' one or more filters to be applied to each attribute;
##' @param capacity A integer value with the schema capacity, default is 1000.
##' @param capacity An integer value with the schema capacity, default is 10000.
##' @param tile_domain An integer vector of size two specifying the integer domain of the row
##' dimension; if \code{NULL} the row dimension of the \code{obj} is used.
##' @param tile_extent An integer value for the tile extent of the row dimensions;
Expand Down
121 changes: 121 additions & 0 deletions R/SparseMatrix.R
@@ -0,0 +1,121 @@
# MIT License
#
# Copyright (c) 2017-2021 TileDB Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

## sparse matrix helper 'roughly similar' to fromDataFrame()

##' Create (or return) a TileDB sparse array
##'
##' The functions \code{fromSparseMatrix} and \code{toSparseMatrix} help in storing
##' (and retrieving) sparse matrices using a TileDB backend.
##' @param obj A sparse matrix object.
##' @param uri A character variable with an Array URI.
##' @param cell_order A character variable with one of the TileDB cell order values,
##' default is \dQuote{ROW_MAJOR}.
##' @param tile_order A character variable with one of the TileDB tile order values,
##' default is \dQuote{ROW_MAJOR}.
##' @param filter A character variable vector, defaults to \sQuote{ZSTD}, for
##' one or more filters to be applied to each attribute;
##' @param capacity An integer value with the schema capacity, default is 10000.
##' @return \code{fromSparseMatrix} returns \code{NULL}, invisibly;
##' \code{toSparseMatrix} returns a sparse matrix in triplet representation.
##' @examples
##' \dontshow{ctx <- tiledb_ctx(limitTileDBCores())}
##' \dontrun{
##' if (requireNamespace("Matrix", quietly=TRUE)) {
##' library(Matrix)
##' set.seed(123) # just to fix it
##' mat <- matrix(0, nrow=20, ncol=10)
##' mat[sample(seq_len(200), 20)] <- seq(1, 20)
##' spmat <- as(mat, "dgTMatrix") # sparse matrix in dgTMatrix format
##' uri <- "sparse_matrix"
##' fromSparseMatrix(spmat, uri) # now written
##' chk <- toSparseMatrix(uri) # and re-read
##' print(chk)
##' all.equal(spmat, chk)
##' }
##' }
##' @importFrom methods as
##' @export
fromSparseMatrix <- function(obj,
                             uri,
                             cell_order = "ROW_MAJOR",
                             tile_order = "ROW_MAJOR",
                             filter="ZSTD",
                             capacity = 10000L) {

    stopifnot(`obj must be Matrix object` = inherits(obj, "Matrix"),
              `obj must be sparse` = is(obj, "sparseMatrix"),
              `uri must be a single character value` = is.character(uri) && length(uri) == 1L)

    ## Coerce any other sparse representation to triplet form so that the
    ## i, j and x slots read below are all present.
    if (!is(obj, "dgTMatrix")) obj <- as(obj, "dgTMatrix")

    nr <- obj@Dim[1]
    nc <- obj@Dim[2]

    ## One dimension per matrix axis, indexed from zero and using a single
    ## tile spanning the whole axis.
    dimi <- tiledb_dim(name="i", type = "FLOAT64",      # wider range
                       tile = as.numeric(nr),
                       domain = c(0, nr - 1L))
    dimj <- tiledb_dim(name="j", type = "FLOAT64",      # wider range
                       tile = as.numeric(nc),
                       domain = c(0, nc - 1L))
    dom <- tiledb_domain(dims = c(dimi, dimj))

    ## Map the storage type of the non-zero values onto a TileDB attribute type.
    cl <- class(obj@x)[1]
    tp <- switch(cl,
                 integer = "INT32",
                 numeric = "FLOAT64",
                 stop("Currently unsupported type: ", cl))

    ## simplify = FALSE always yields a (named) list of filter objects,
    ## whatever the length of 'filter'.
    filterlist <- tiledb_filter_list(sapply(filter, tiledb_filter, simplify = FALSE))

    attx <- tiledb_attr(name="x", type = tp, ncells = 1, filter_list = filterlist)
    schema <- tiledb_array_schema(dom, attrs=attx,
                                  cell_order = cell_order, tile_order = tile_order,
                                  sparse = TRUE, capacity=capacity)
    tiledb_array_create(uri, schema)

    ## Write the triplets: the i and j slots hold the zero-based coordinates,
    ## x the corresponding values.
    arr <- tiledb_array(uri)
    arr[] <- data.frame(i = obj@i, j = obj@j, x = obj@x)
    invisible(NULL)
}

##' @rdname fromSparseMatrix
##' @export
toSparseMatrix <- function(uri) {

    ## Matrix is only a suggested package: fail before opening and reading
    ## the array when it is not available.
    stopifnot(`Matrix package needed`=requireNamespace("Matrix", quietly=TRUE))

    ## Read the complete array back as a data.frame of coordinates and values.
    arr <- tiledb_array(uri, as.data.frame=TRUE, query_layout="UNORDERED")
    obj <- arr[]

    cols <- colnames(obj)
    stopifnot(`No column i in data`="i" %in% cols,
              `No column j in data`="j" %in% cols,
              `No column x in data`="x" %in% cols)

    ## The (lower, upper) domain of each dimension gives the matrix extent.
    dims <- dimensions(domain(schema(uri)))
    d1 <- domain(dims[[1]])
    d2 <- domain(dims[[2]])

    ## Stored coordinates are zero-based as written by fromSparseMatrix(), so
    ## shift indices and upper bounds by one for the one-based Matrix
    ## constructor. NOTE(review): this assumes the domain lower bound is 0.
    Matrix::sparseMatrix(i = obj[["i"]] + 1,
                         j = obj[["j"]] + 1,
                         x = obj[["x"]],
                         dims = c(d1[2] + 1, d2[2] + 1),
                         repr = "T")
}
32 changes: 32 additions & 0 deletions inst/tinytest/test_sparsematrix.R
@@ -0,0 +1,32 @@

## Round-trip test: write a sparse matrix with fromSparseMatrix() and read it
## back with toSparseMatrix().
library(tinytest)
library(tiledb)

isOldWindows <- Sys.info()[["sysname"]] == "Windows" && grepl('Windows Server 2008', osVersion)
if (isOldWindows) exit_file("skip this file on old Windows releases")

ctx <- tiledb_ctx(limitTileDBCores())

## Matrix is an optional dependency; skip when it is missing or too old
if (!requireNamespace("Matrix", quietly=TRUE)) exit_file("Need the 'Matrix' package")
library(Matrix)
if (packageVersion("Matrix") < "1.3.0") exit_file("Old 'Matrix' package?")

set.seed(123)                           # just to fix it
n <- 60
k <- 50
mat <- matrix(0, nrow=n, ncol=k)
nelem <- 0.1 * n * k                    # fill ten percent of the cells
mat[sample(seq_len(n*k), nelem)] <- seq(1, nelem)

## Convert dense matrix to sparse matrix
spmat <- as(mat, "dgTMatrix")

uri <- tempfile()
if (dir.exists(uri)) unlink(uri, recursive=TRUE)
fromSparseMatrix(spmat, uri)

chk <- toSparseMatrix(uri)
expect_true(is(chk, "sparseMatrix"))
expect_true(inherits(chk, "dgTMatrix"))
## all.equal() returns a character vector describing the differences when
## the objects differ, so wrap it in isTRUE() to always test a logical
expect_true(isTRUE(all.equal(spmat, chk)))
expect_equivalent(spmat, chk)

## remove the temporary array written above
unlink(uri, recursive=TRUE)
2 changes: 1 addition & 1 deletion man/fromDataFrame.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

58 changes: 58 additions & 0 deletions man/fromSparseMatrix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit b498493

Please sign in to comment.