diff --git a/NAMESPACE b/NAMESPACE index 00d56f6ad..d23d7763f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -40,6 +40,13 @@ export(fsetequal) S3method(all.equal, data.table) export(shouldPrint) export(fsort) # experimental parallel sort for vector type double only, currently +# grouping sets +export(groupingsets) +export(cube) +export(rollup) +S3method(groupingsets, data.table) +S3method(cube, data.table) +S3method(rollup, data.table) S3method("[", data.table) S3method("[<-", data.table) diff --git a/NEWS.md b/NEWS.md index f43da50be..52b00d261 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,6 +13,8 @@ ``` When you see the `..` prefix think _one-level-up_ like the directory `..` in all operating systems meaning the parent directory. In future the `..` prefix could be made to work on all symbols apearing anywhere inside `DT[...]`. It is intended to be a convenient way to protect your code from accidentally picking up a column name. Similar to how `x.` and `i.` prefixes (analogous to SQL table aliases) can already be used to disambiguate the same column name present in both `x` and `i`. A symbol prefix rather than a `..()` _function_ will be easier for us to optimize internally and more convenient if you have many variables in calling scope that you wish to use in your expressions safely. This feature was first raised in 2012 and long wished for, [#633](https://github.com/Rdatatable/data.table/issues/633). It is experimental. +3. Three new *Grouping Sets* functions: `rollup`, `cube` and `groupingsets`, [#1377](https://github.com/Rdatatable/data.table/issues/1377). Allows to aggregate data.table on various grouping levels at once producing sub-totals and grand total. + #### BUG FIXES #### NOTES diff --git a/R/groupingsets.R b/R/groupingsets.R new file mode 100644 index 000000000..2e8fb6eba --- /dev/null +++ b/R/groupingsets.R @@ -0,0 +1,121 @@ +rollup <- function(x, ...) { + UseMethod("rollup") +} +rollup.data.table <- function(x, j, by, .SDcols, id = FALSE, ...) { + # input data type basic validation + if (!is.data.table(x)) + stop("Argument 'x' must be data.table object") + if (!is.character(by)) + stop("Argument 'by' must be character vector of column names used in grouping.") + if (!is.logical(id)) + stop("Argument 'id' must be logical scalar.") + # generate grouping sets for rollup + sets = lapply(length(by):0, function(i) by[0:i]) + # redirect to workhorse function + jj = substitute(j) + groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj) +} + +cube <- function(x, ...) { + UseMethod("cube") +} +cube.data.table <- function(x, j, by, .SDcols, id = FALSE, ...) { + # input data type basic validation + if (!is.data.table(x)) + stop("Argument 'x' must be data.table object") + if (!is.character(by)) + stop("Argument 'by' must be character vector of column names used in grouping.") + if (!is.logical(id)) + stop("Argument 'id' must be logical scalar.") + # generate grouping sets for cube - power set: http://stackoverflow.com/a/32187892/2490497 + n = length(by) + keepBool = sapply(2L^(1:n - 1L), function(k) rep(c(FALSE, TRUE), each=k, times=(2L^n / (2L*k)))) + sets = lapply((2L^n):1, function(j) by[keepBool[j, ]]) + # redirect to workhorse function + jj = substitute(j) + groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj) +} + +groupingsets <- function(x, ...) { + UseMethod("groupingsets") +} +groupingsets.data.table <- function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) { + # input data type basic validation + if (!is.data.table(x)) + stop("Argument 'x' must be data.table object") + if (ncol(x) < 1L) + stop("Argument 'x' is 0 column data.table, no measure to apply grouping over.") + if (length(names(x)) != uniqueN(names(x))) + stop("data.table must not contains duplicate column names.") + if (!is.character(by)) + stop("Argument 'by' must be character vector of column names used in grouping.") + if (length(by) != uniqueN(by)) + stop("Argument 'by' must have unique column names for grouping.") + if (!is.list(sets) || !all(sapply(sets, is.character))) + stop("Argument 'sets' must be a list of character vectors.") + if (!is.logical(id)) + stop("Argument 'id' must be logical scalar.") + # logic constraints validation + if (!all((sets.all.by <- unique(unlist(sets))) %chin% by)) + stop(sprintf("All columns used in 'sets' argument must be in 'by' too. Columns used in 'sets' but not present in 'by': %s.", paste(setdiff(sets.all.by, by), collapse=", "))) + if (id && "grouping" %chin% names(x)) + stop("When using `id=TRUE` the 'x' data.table must not have column named 'grouping'.") + if (!all(sapply(sets, function(x) length(x)==uniqueN(x)))) + stop("Character vectors in 'sets' list must not have duplicated column names within single grouping set.") + if (!identical(lapply(sets, sort), unique(lapply(sets, sort)))) + warning("Double counting is going to happen. Argument 'sets' should be unique without taking order into account, unless you really want double counting, then get used to that warning. Otherwise `sets=unique(lapply(sets, sort))` will do the trick.") + # input arguments handling + jj = if (!missing(jj)) jj else substitute(j) + av = all.vars(jj, TRUE) + if (":=" %chin% av) + stop("Expression passed to grouping sets function must not update by reference. Use ':=' on results of your grouping function.") + if (missing(.SDcols)) + .SDcols = if (".SD" %chin% av) setdiff(names(x), by) else NULL + # 0 rows template data.table to keep colorder and type + if (length(by)) { + empty = if (length(.SDcols)) x[0L, eval(jj), by, .SDcols=.SDcols] else x[0L, eval(jj), by] + } else { + empty = if (length(.SDcols)) x[0L, eval(jj), .SDcols=.SDcols] else x[0L, eval(jj)] + if (!is.data.table(empty)) empty = setDT(list(empty)) # improve after #648, see comment in aggr.set + } + if (id && "grouping" %chin% names(empty)) # `j` could have been evaluated to `grouping` field + stop("When using `id=TRUE` the 'j' expression must not evaluate to column named 'grouping'.") + if (length(names(empty)) != uniqueN(names(empty))) + stop("There exists duplicated column names in the results, ensure the column passed/evaluated in `j` and those in `by` are not overlapping.") + # adding grouping column to template - aggregation level identifier + if (id) { + set(empty, j = "grouping", value = integer()) + setcolorder(empty, c("grouping", by, setdiff(names(empty), c("grouping", by)))) + } + # workaround for rbindlist fill=TRUE on integer64 #1459 + int64.cols = vapply(empty, inherits, logical(1), "integer64") + int64.cols = names(int64.cols)[int64.cols] + if (length(int64.cols) && !requireNamespace("bit64", quietly=TRUE)) + stop("Using integer64 class columns require to have 'bit64' package installed.") + int64.by.cols = intersect(int64.cols, by) + # aggregate function called for each grouping set + aggregate.set <- function(by.set) { + if (length(by.set)) { + r = if (length(.SDcols)) x[, eval(jj), by.set, .SDcols=.SDcols] else x[, eval(jj), by.set] + } else { + r = if (length(.SDcols)) x[, eval(jj), .SDcols=.SDcols] else x[, eval(jj)] + # workaround for grand total single var as data.table too, change to drop=FALSE after #648 solved + if (!is.data.table(r)) r = setDT(list(r)) + } + if (id) { + # integer bit mask of aggregation levels: http://www.postgresql.org/docs/9.5/static/functions-aggregate.html#FUNCTIONS-GROUPING-TABLE + set(r, j = "grouping", value = strtoi(paste(c("1", "0")[by %chin% by.set + 1L], collapse=""), base=2L)) + } + if (length(int64.by.cols)) { + # workaround for rbindlist fill=TRUE on integer64 #1459 + missing.int64.by.cols = setdiff(int64.by.cols, by.set) + if (length(missing.int64.by.cols)) r[, (missing.int64.by.cols) := bit64::as.integer64(NA)] + } + r + } + # actually processing everything here + rbindlist(c( + list(empty), # 0 rows template for colorder and type + lapply(sets, aggregate.set) # all aggregations + ), use.names=TRUE, fill=TRUE) +} diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 481d02bd1..93df5ff5b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8882,7 +8882,6 @@ test(1666.17, d1[d2, on="k", verbose=TRUE], ans, output='ad hoc') # reset defaults options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE) - #testing fix to #1654 (dcast should only error when _using_ duplicated names) DT <- data.table(a = 1:4, a = 1:4, id = rep(1:4, 2), V1 = 8:1) test(1667.1, dcast(DT, id ~ rowid(id), value.var = "V1"), @@ -9787,6 +9786,211 @@ indices(DT, vectors = TRUE) test(1749.1, indices(DT), c("A__B","A","B")) test(1749.2, indices(DT, vectors = TRUE), list(c("A","B"),"A","B")) +# Grouping Sets #1377 +n = 24L +set.seed(25) +dt <- data.table( + color = sample(c("green","yellow","red"), n, TRUE), + year = as.Date(sample(paste0(2011:2015,"-01-01"), n, TRUE)), + status = as.factor(sample(c("removed","active","inactive","archived"), n, TRUE)), + amount = sample(1:5, n, TRUE), + value = sample(c(3, 3.5, 2.5, 2), n, TRUE) +) +test(1750.1, # empty input gets grand total only when asked in `sets` with `character()` + groupingsets(dt[0L], j = sum(value), by = c("color","year","status"), sets=list(c("color"))), + data.table(color=character(), year=as.Date(NA)[-1L], status=factor(), V1=numeric()) +) +if (base::getRversion()>"3.0.0") { # affected by as.factor(NA) on 2.15.0 + test(1750.2, # empty input gets grand total non-NA, if asked + groupingsets(dt[0L], j = sum(value), by = c("color","year","status"), sets=list(c("color"), character())), + data.table(color=NA_character_, year=as.Date(NA), status=as.factor(NA), V1=0) + ) + test(1750.3, # empty input non-NA grand total, also retain classes and aggregation level + groupingsets(dt[0L], j = lapply(.SD, sum), by = c("color","year","status"), sets=list(c("color"), character()), id=TRUE), + data.table(grouping=7L, color=NA_character_, year=as.Date(NA), status=as.factor(NA), amount=0L, value=0) + ) +} +test(1750.4, # `sets=list()` produces 0 nrow, for grand total use `set=list(character())` - test at top + nrow(groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("year","status"), .SDcols=c("amount","value"), sets=list(), id=TRUE)), + 0L +) +test(1750.5, # `by` must have unique column names + groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("year","status","year"), .SDcols=c("amount","value"), sets=list("year"), id=TRUE), + error = "Argument 'by' must have unique column names" +) +test(1750.6, # 0 ncol `x` + groupingsets(data.table(), j = c(list(cnt=.N), lapply(.SD, sum)), by = c("year","status"), .SDcols=c("amount","value"), sets=list(c("year")), id=TRUE), + error = "Argument 'x' is 0 column data.table, no measure to apply grouping over." +) +test(1750.7, # 0 length `by`, must also use `sets=list()`, so 0L rows result + nrow(groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = character(), .SDcols=c("amount","value"), sets=list(), id=TRUE)), + 0L +) +test(1750.8, all( # for any single value from dataset there should be always same aggregate result on any level of grouping + sapply(seq_len(nrow(dt)), function(i) uniqueN( + groupingsets(dt[i], j = lapply(.SD, sum), by = c("color","year","status"), sets=list(c("color","year","status"), c("year"), c("status"), character())), + by=c("amount","value") + )) == 1L +), TRUE) +# all grouping id matches in all totals +r = groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=list(c("color","year","status"), c("year"), c("status"), character()), id=TRUE) +test(1750.9, uniqueN( + r[, lapply(.SD, sum), by = "grouping", .SDcols = c("cnt","amount","value")], + by = c("cnt","amount","value") +), 1L) +# groupingsets grouping by 'value' still possible +r = groupingsets(dt, j = sum(amount), by = c("color","year","status","value"), sets=list(c("color","year","status"), c("year"), c("status"), character())) +test(1750.10, + sapply(r, class), + c("color"="character","year"="Date","status"="factor","value"="numeric","V1"="integer") +) +# groupingsets on aggregate using grouping col char type and sum - error +test(1750.11, + groupingsets(dt, j = lapply(.SD, sum), by = c("status","year"), sets=list(character()), .SDcols="color"), + error = "invalid 'type' (character) of argument" +) +# groupingsets on aggregate using grouping col factor type and sum - error +test(1750.12, + groupingsets(dt, j = lapply(.SD, sum), by = c("color","year"), sets=list(character()), .SDcols="status"), + error = "not meaningful for factors" +) +# groupingsets on aggregate using grouping col char type and length, match on all subtotals +r = groupingsets(dt, j = lapply(.SD, length), by = c("status","year"), sets=list(c("year"), c("status","year"), character()), .SDcols="color", id=TRUE) +test(1750.13, uniqueN( + r[, lapply(.SD, sum), by = "grouping", .SDcols = c("color")], + by = c("color") +), 1L) +# groupingsets double listing column, to measure and grouping +test(1750.14, + groupingsets(dt, j = lapply(.SD, sum), by = c("color","amount"), sets=list(c("color"), c("color","amount")), .SDcols="amount", id=TRUE), + error = "There exists duplicated column names in the results" +) +test(1750.15, + groupingsets(dt, j = .(color = sum(value)), by = c("color","amount"), sets=list(c("color"), c("color","amount")), id=TRUE), + error = "There exists duplicated column names in the results" +) +# set equals to `character(0)` should return grand total +if (base::getRversion()>"3.0.0") { # affected by as.factor(NA) on 2.15.0 + sets = list(character()) + test(1750.16, + groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=sets, id=TRUE), + dt[, c(list(cnt=.N), lapply(.SD, sum)), .(grouping=rep(7L,n), color=rep(NA_character_,n), year=rep(as.Date(NA),n), status=as.factor(rep(NA_character_,n)))] + ) +} +# duplicate entries in `sets` vector-wise +sets = list("color", c("color","year","status","year","status"), "year", character()) +test(1750.17, + groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=sets, id=TRUE), + error = "Character vectors in 'sets' list must not have duplicated column names within single grouping set." +) +# duplicate entries in `sets` - double counting - actually aggregate `grouping!=5L` (not double counted) to compare to double counted values on `grouping==5L`, as double counting is expected results for this unexpected usage +sets = list("year", c("color","year"), "year", character()) +test(1750.18, uniqueN({ + r <- groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=sets, id=TRUE) + r[, lapply(.SD, sum), by = .(double_counting = grouping==5L, double_counting = grouping!=5L), .SDcols = c("cnt","amount","value")] +}, by = c("cnt","amount","value") +), 1L, warning = "Double counting is going to happen") +# duplicate entries in `sets` but reorderd - double counting on `grouping==1L` +sets = list(c("color","year"), "year", c("year","color"), character()) +test(1750.19, uniqueN({ + r <- groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=sets, id=TRUE) + r[, lapply(.SD, sum), by = .(double_counting = grouping==1L, double_counting = grouping!=1L), .SDcols = c("cnt","amount","value")] +}, by = c("cnt","amount","value") +), 1L, warning = "Double counting is going to happen") +# entries in `by` / `sets` not exists in data.table +test(1750.20, groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","stat"), sets=list(c("color"), character()), id=TRUE), error = "object 'stat' not found") +test(1750.21, groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=list(c("color"), "stat"), id=TRUE), error = "Columns used in 'sets' but not present in 'by': stat") +test(1750.22, groupingsets(dt, j = .(a=sum(a)), by = c("color","year","status"), sets=list(c("color"), character()), id=TRUE), error = "object 'a' not found") +# update by ref `:=` forbidden +test(1750.23, + groupingsets(dt, j = sum_value := sum(value), by = c("color","year","status"), sets=list(c("color"), character())), + error = "Expression passed to grouping sets function must not update by reference." +) +# rollup +sets = local({ + by=c("color","year","status") + lapply(length(by):0, function(i) by[0:i]) +}) +test(1750.31, + rollup(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), id=TRUE), + groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=sets, id=TRUE) +) +sets = local({ + by=c("year","status") + lapply(length(by):0, function(i) by[0:i]) +}) +test(1750.32, + rollup(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("year","status"), .SDcols=c("amount","value"), id=TRUE), + groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("year","status"), .SDcols=c("amount","value"), sets=sets, id=TRUE) +) +# cube +sets = local({ + by=c("color","year","status") + n = length(by) + keepBool = sapply(2L^(1:n - 1L), function(k) rep(c(FALSE, TRUE), each=k, times=(2L^n / (2L*k)))) + lapply((2L^n):1, function(j) by[keepBool[j, ]]) +}) +test(1750.33, + cube(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), id=TRUE), + groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=sets, id=TRUE) +) +sets = local({ + by=c("year","status") + n = length(by) + keepBool = sapply(2L^(1:n - 1L), function(k) rep(c(FALSE, TRUE), each=k, times=(2L^n / (2L*k)))) + lapply((2L^n):1, function(j) by[keepBool[j, ]]) +}) +test(1750.34, + cube(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("year","status"), .SDcols=c("amount","value"), id=TRUE), + groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("year","status"), .SDcols=c("amount","value"), sets=sets, id=TRUE) +) +# grouping sets with integer64 +if ("package:bit64" %in% search()) { + set.seed(26) + dt[, c("id1","id2") := list(as.integer64(sample(sample(n, n/4), n, TRUE)), as.integer64(sample(sample(n, n/2), n, TRUE)))] + # int64 as grouping cols + r = groupingsets(dt, j = lapply(.SD, sum), by = c("color","id1","id2"), sets=list(c("color","id1"), c("color","id1","id2"), "id2", c("id1","id2"), "color", character()), .SDcols=c("amount","value"), id=TRUE) + test(1750.41, # grand total + r[grouping==7L, .(color, id1, id2, amount, value)], + dt[, lapply(.SD, sum), .(color=rep(NA_character_, n), id1=as.integer64(rep(NA,n)), id2=as.integer64(rep(NA,n))), .SDcols=c("amount","value")] + ) + test(1750.42, # by color + r[grouping==3L, .(color, id1, id2, amount, value)], + dt[, lapply(.SD, sum), .(color, id1=as.integer64(rep(NA,n)), id2=as.integer64(rep(NA,n))), .SDcols=c("amount","value")] + ) + test(1750.43, # by id2 + r[grouping==6L, .(color, id1, id2, amount, value)], + dt[, lapply(.SD, sum), .(color=rep(NA_character_,n), id1=as.integer64(rep(NA,n)), id2), .SDcols=c("amount","value")] + ) + test(1750.44, # by id1, id2 + r[grouping==4L, .(color, id1, id2, amount, value)], + dt[, lapply(.SD, sum), .(color=rep(NA_character_,n), id1, id2), .SDcols=c("amount","value")] + ) + # int64 as measure cols + r = groupingsets(dt, j = lapply(.SD, sum), by = c("color","status"), sets=list(c("color","status"), "status", character()), .SDcols=c("amount","value","id1","id2"), id=TRUE) + if (base::getRversion()>"3.0.0") { # affected by as.factor(NA) on 2.15.0 + test(1750.45, # grand total + r[grouping==3L, .(color, status, amount, value, id1, id2)], + dt[, lapply(.SD, sum), .(color=rep(NA_character_,n), status=as.factor(rep(NA_character_,n))), .SDcols=c("amount","value","id1","id2")] + ) + } + test(1750.46, # by status + r[grouping==2L, .(color, status, amount, value, id1, id2)], + dt[, lapply(.SD, sum), .(color=rep(NA_character_,n), status), .SDcols=c("amount","value","id1","id2")] + ) + # int64 as grouping and measure cols + r = groupingsets(dt, j = lapply(.SD, sum), by = c("color","id1"), sets=list(c("color","id1"), "id1", character()), .SDcols=c("amount","value","id2"), id=TRUE) + test(1750.47, # grand total + r[grouping==3L, .(color, id1, amount, value, id2)], + dt[, lapply(.SD, sum), .(color=rep(NA_character_,n), id1=as.integer64(rep(NA,n))), .SDcols=c("amount","value","id2")] + ) + test(1750.48, # by id1 + r[grouping==2L, .(color, id1, amount, value, id2)], + dt[, lapply(.SD, sum), .(color=rep(NA_character_,n), id1), .SDcols=c("amount","value","id2")] + ) +} else { + cat("Test 1750.[41-48] not run. If required call library(bit64) first.\n") +} ########################## diff --git a/man/groupingsets.Rd b/man/groupingsets.Rd new file mode 100644 index 000000000..38100b2a7 --- /dev/null +++ b/man/groupingsets.Rd @@ -0,0 +1,68 @@ +\name{groupingsets} +\alias{rollup} +\alias{cube} +\alias{groupingsets} +\alias{rollup.data.table} +\alias{cube.data.table} +\alias{groupingsets.data.table} +\title{ Grouping Set aggregation for data tables } +\description{ + Calculate aggregates at various levels of groupings producing multiple (sub-)totals. Reflects SQLs \emph{GROUPING SETS} operations. +} +\usage{ +rollup(x, \dots) +\method{rollup}{data.table}(x, j, by, .SDcols, id = FALSE, \dots) +cube(x, \dots) +\method{cube}{data.table}(x, j, by, .SDcols, id = FALSE, \dots) +groupingsets(x, \dots) +\method{groupingsets}{data.table}(x, j, by, sets, .SDcols, id = FALSE, jj, \dots) +} +\arguments{ + \item{x}{\code{data.table}.} + \item{\dots}{argument passed to custom user methods. Ignored for \code{data.table} methods.} + \item{j}{expression passed to data.table \code{j}.} + \item{by}{character column names by which we are grouping.} + \item{sets}{list of character vector reflectings grouping sets, used in \code{groupingsets} for flexibility.} + \item{.SDcols}{columns to be used in \code{j} expression in \code{.SD} object.} + \item{id}{logical default \code{FALSE}. If \code{TRUE} it will add leading column with bit mask of grouping sets.} + \item{jj}{quoted version of \code{j} argument, for convenience. When provided function will ignore \code{j} argument.} +} +\details{ + All three functions \code{rollup, cube, groupingsets} are generic methods, \code{data.table} methods are provided. +} +\value{ + A data.table with various aggregates. +} +\seealso{ \code{\link{data.table}}, \code{\link{rbindlist}} +} +\references{ +\url{http://www.postgresql.org/docs/9.5/static/queries-table-expressions.html#QUERIES-GROUPING-SETS} +\url{http://www.postgresql.org/docs/9.5/static/functions-aggregate.html#FUNCTIONS-GROUPING-TABLE} +} +\examples{ +n = 24L +set.seed(25) +dt <- data.table( + color = sample(c("green","yellow","red"), n, TRUE), + year = as.Date(sample(paste0(2011:2015,"-01-01"), n, TRUE)), + status = as.factor(sample(c("removed","active","inactive","archived"), n, TRUE)), + amount = sample(1:5, n, TRUE), + value = sample(c(3, 3.5, 2.5, 2), n, TRUE) +) + +# rollup +rollup(dt, j = sum(value), by = c("color","year","status")) # default id=FALSE +rollup(dt, j = sum(value), by = c("color","year","status"), id=TRUE) +rollup(dt, j = lapply(.SD, sum), by = c("color","year","status"), id=TRUE, .SDcols="value") +rollup(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), id=TRUE) + +# cube +cube(dt, j = sum(value), by = c("color","year","status"), id=TRUE) +cube(dt, j = lapply(.SD, sum), by = c("color","year","status"), id=TRUE, .SDcols="value") +cube(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), id=TRUE) + +# groupingsets +groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), + sets = list("color", c("year","status"), character()), id=TRUE) +} +\keyword{ data }