Skip to content

Commit

Permalink
new faster and more flexible split.data.table, closes #1389, #448
Browse files Browse the repository at this point in the history
  • Loading branch information
jangorecki committed Mar 19, 2016
1 parent c9f500d commit 5f7a435
Show file tree
Hide file tree
Showing 6 changed files with 587 additions and 7 deletions.
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ export(fintersect)
export(fsetdiff)
export(funion)
export(fsetequal)
S3method(all.equal, data.table)

S3method("[", data.table)
S3method("[<-", data.table)
Expand Down Expand Up @@ -62,6 +63,7 @@ S3method(as.matrix, data.table)
#S3method(cbind, data.table)
#S3method(rbind, data.table)
export(.rbind.data.table)
S3method(split, data.table)
S3method(dim, data.table)
S3method(dimnames, data.table)
S3method("dimnames<-", data.table)
Expand All @@ -75,7 +77,6 @@ S3method(within, data.table)
S3method(is.na, data.table)
S3method(format, data.table)
S3method(Ops, data.table)
S3method(all.equal, data.table)

S3method(anyDuplicated, data.table)

Expand Down
73 changes: 68 additions & 5 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -2171,11 +2171,74 @@ Ops.data.table <- function(e1, e2 = NULL)
ans
}


split.data.table <- function(...) {
if (cedta() && getOption("datatable.dfdispatchwarn")) # or user can use suppressWarnings
warning("split is inefficient. It copies memory. Please use [,j,by=list(...)] syntax. See data.table FAQ.")
NextMethod() # allow user to do it though, split object will be data.table's with 'NA' repeated in row.names silently
split.data.table <- function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TRUE, flatten = TRUE, ..., verbose = getOption("datatable.verbose")) {
if (!is.data.table(x)) stop("x argument must be a data.table")
stopifnot(is.logical(drop), is.logical(sorted), is.logical(keep.by), is.logical(flatten))
# split data.frame way, using `f` and not `by` argument
if (!missing(f)) {
if (!length(f) && nrow(x))
stop("group length is 0 but data nrow > 0")
if (!missing(by))
stop("passing 'f' argument together with 'by' is not allowed, use 'by' when split by column in data.table and 'f' when split by external factor")
# same as split.data.frame - handling all exceptions, factor orders etc, in a single stream of processing was a nightmare in factor and drop consistency
return(lapply(split(x = seq_len(nrow(x)), f = f, drop = drop, ...), function(ind) x[ind]))
}
if (missing(by)) stop("you must provide 'by' or 'f' arguments")
# check reserved column names during processing
if (".ll.tech.split" %in% names(x)) stop("column '.ll.tech.split' is reserved for split.data.table processing")
if (".nm.tech.split" %in% by) stop("column '.nm.tech.split' is reserved for split.data.table processing")
if (!all(by %in% names(x))) stop("argument 'by' must refer to data.table column names")
if (!all(by.atomic <- sapply(by, function(.by) is.atomic(x[[.by]])))) stop(sprintf("argument 'by' must refer only to atomic type columns, classes of '%s' columns are not atomic type", paste(by[!by.atomic], collapse=", ")))
# list of data.tables (flatten) or list of lists of ... data.tables
make.levels = function(x, cols, sorted) {
by.order = if (!sorted) x[, funique(.SD), .SDcols=cols] # remember order of data, only when not sorted=FALSE
ul = lapply(setNames(nm=cols), function(col) if (!is.factor(x[[col]])) unique(x[[col]]) else levels(x[[col]]))
r = do.call("CJ", c(ul, sorted=sorted, unique=TRUE))
if (!sorted && nrow(by.order)) {
ii = r[by.order, on=cols, which=TRUE]
r = rbindlist(list(
r[ii], # original order from data
r[-ii] # empty levels at the end
))
}
r
}
.by = by[1L]
# this builds data.table call - is much more cleaner than handling each case one by one
dtq = as.list(call("[", as.name("x")))
join = FALSE
flatten_any = flatten && any(sapply(by, function(col) is.factor(x[[col]])))
nested_current = !flatten && is.factor(x[[.by]])
if (!drop && (flatten_any || nested_current)) {
dtq[["i"]] = substitute(make.levels(x, cols=.cols, sorted=.sorted), list(.cols=if (flatten) by else .by, .sorted=sorted))
join = TRUE
}
dtq[["j"]] = substitute(
list(.ll.tech.split=list(.expr)),
list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD")) # simplify when `nomatch` accept NULL #857 ?
)
by.or.keyby = if (join) "by" else c("by"[!sorted], "keyby"[sorted])[1L]
dtq[[by.or.keyby]] = substitute( # retain order, for `join` and `sorted` it will use order of `i` data.table instead of `keyby`.
.expr,
list(.expr = if(join) as.name(".EACHI") else if (flatten) by else .by)
)
dtq[[".SDcols"]] = if (keep.by) names(x) else setdiff(names(x), if (flatten) by else .by)
if (join) dtq[["on"]] = if (flatten) by else .by
dtq = as.call(dtq)
if (isTRUE(verbose)) cat("Processing split.data.table with: ", deparse(dtq, width.cutoff=500L), "\n", sep="")
tmp = eval(dtq)
# add names on list
setattr(ll <- tmp$.ll.tech.split,
"names",
as.character(
if (!flatten) tmp[[.by]] else tmp[, list(.nm.tech.split=paste(unlist(.SD), collapse = ".")), by=by, .SDcols=by]$.nm.tech.split
))
# handle nested split
if (flatten || length(by) == 1L) return(
lapply(ll, setattr, '.data.table.locked', NULL)
) else if (length(by) > 1L) return(
lapply(ll, split.data.table, drop=drop, by=by[-1L], sorted=sorted, keep.by=keep.by, flatten=flatten)
)
}

# TO DO, add more warnings e.g. for by.data.table(), telling user what the data.table syntax is but letting them dispatch to data.frame if they want
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@
27. `by` understands `colA:colB` syntax now, like `.SDcols` does, [#1395](https://github.com/Rdatatable/data.table/issues/1395). Thanks @franknarf1.

28. Joins (and binary search based subsets) using `on=` argument now reuses existing (secondary) indices, [#1439](https://github.com/Rdatatable/data.table/issues/1439). Thanks @jangorecki.

29. New `split` method for data.table. Faster, more flexible and consistent with data.frame method. Closes [#1389](https://github.com/Rdatatable/data.table/issues/1389).

#### BUG FIXES

Expand Down
Loading

0 comments on commit 5f7a435

Please sign in to comment.