new faster and more flexible split.data.table, closes #1389, #448

Rdatatable · Mar 19, 2016 · 5f7a435 · 5f7a435
1 parent c9f500d
commit 5f7a435
Show file tree

Hide file tree

Showing 6 changed files with 587 additions and 7 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -35,6 +35,7 @@ export(fintersect)
 export(fsetdiff)
 export(funion)
 export(fsetequal)
+S3method(all.equal, data.table)
 
 S3method("[", data.table)
 S3method("[<-", data.table)
@@ -62,6 +63,7 @@ S3method(as.matrix, data.table)
 #S3method(cbind, data.table)
 #S3method(rbind, data.table)
 export(.rbind.data.table)
+S3method(split, data.table)
 S3method(dim, data.table)
 S3method(dimnames, data.table)
 S3method("dimnames<-", data.table)
@@ -75,7 +77,6 @@ S3method(within, data.table)
 S3method(is.na, data.table)
 S3method(format, data.table)
 S3method(Ops, data.table)
-S3method(all.equal, data.table)
 
 S3method(anyDuplicated, data.table)
 

diff --git a/R/data.table.R b/R/data.table.R
@@ -2171,11 +2171,74 @@ Ops.data.table <- function(e1, e2 = NULL)
     ans
 }
 
-
-split.data.table <- function(...) {
-    if (cedta() && getOption("datatable.dfdispatchwarn"))  # or user can use suppressWarnings
-        warning("split is inefficient. It copies memory. Please use [,j,by=list(...)] syntax. See data.table FAQ.")
-    NextMethod()  # allow user to do it though, split object will be data.table's with 'NA' repeated in row.names silently
+split.data.table <- function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TRUE, flatten = TRUE, ..., verbose = getOption("datatable.verbose")) {
+    if (!is.data.table(x)) stop("x argument must be a data.table")
+    stopifnot(is.logical(drop), is.logical(sorted), is.logical(keep.by),  is.logical(flatten))
+    # split data.frame way, using `f` and not `by` argument
+    if (!missing(f)) {
+        if (!length(f) && nrow(x))
+            stop("group length is 0 but data nrow > 0")
+        if (!missing(by))
+            stop("passing 'f' argument together with 'by' is not allowed, use 'by' when split by column in data.table and 'f' when split by external factor")
+        # same as split.data.frame - handling all exceptions, factor orders etc, in a single stream of processing was a nightmare in factor and drop consistency
+        return(lapply(split(x = seq_len(nrow(x)), f = f, drop = drop, ...), function(ind) x[ind]))
+    }
+    if (missing(by)) stop("you must provide 'by' or 'f' arguments")
+    # check reserved column names during processing
+    if (".ll.tech.split" %in% names(x)) stop("column '.ll.tech.split' is reserved for split.data.table processing")
+    if (".nm.tech.split" %in% by) stop("column '.nm.tech.split' is reserved for split.data.table processing")
+    if (!all(by %in% names(x))) stop("argument 'by' must refer to data.table column names")
+    if (!all(by.atomic <- sapply(by, function(.by) is.atomic(x[[.by]])))) stop(sprintf("argument 'by' must refer only to atomic type columns, classes of '%s' columns are not atomic type", paste(by[!by.atomic], collapse=", ")))
+    # list of data.tables (flatten) or list of lists of ... data.tables
+    make.levels = function(x, cols, sorted) {
+        by.order = if (!sorted) x[, funique(.SD), .SDcols=cols] # remember order of data, only when not sorted=FALSE
+        ul = lapply(setNames(nm=cols), function(col) if (!is.factor(x[[col]])) unique(x[[col]]) else levels(x[[col]]))
+        r = do.call("CJ", c(ul, sorted=sorted, unique=TRUE))
+        if (!sorted && nrow(by.order)) {
+            ii = r[by.order, on=cols, which=TRUE]
+            r = rbindlist(list(
+                r[ii], # original order from data
+                r[-ii] # empty levels at the end
+            ))
+        }
+        r
+    }
+    .by = by[1L]
+    # this builds data.table call - is much more cleaner than handling each case one by one
+    dtq = as.list(call("[", as.name("x")))
+    join = FALSE
+    flatten_any = flatten && any(sapply(by, function(col) is.factor(x[[col]])))
+    nested_current = !flatten && is.factor(x[[.by]])
+    if (!drop && (flatten_any || nested_current)) {
+        dtq[["i"]] = substitute(make.levels(x, cols=.cols, sorted=.sorted), list(.cols=if (flatten) by else .by, .sorted=sorted))
+        join = TRUE
+    }
+    dtq[["j"]] = substitute(
+        list(.ll.tech.split=list(.expr)),
+        list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD")) # simplify when `nomatch` accept NULL #857 ?
+    )
+    by.or.keyby = if (join) "by" else c("by"[!sorted], "keyby"[sorted])[1L]
+    dtq[[by.or.keyby]] = substitute( # retain order, for `join` and `sorted` it will use order of `i` data.table instead of `keyby`.
+        .expr,
+        list(.expr = if(join) as.name(".EACHI") else if (flatten) by else .by)
+    )
+    dtq[[".SDcols"]] = if (keep.by) names(x) else setdiff(names(x), if (flatten) by else .by)
+    if (join) dtq[["on"]] = if (flatten) by else .by
+    dtq = as.call(dtq)
+    if (isTRUE(verbose)) cat("Processing split.data.table with: ", deparse(dtq, width.cutoff=500L), "\n", sep="")
+    tmp = eval(dtq)
+    # add names on list
+    setattr(ll <- tmp$.ll.tech.split,
+            "names", 
+            as.character(
+                if (!flatten) tmp[[.by]] else tmp[, list(.nm.tech.split=paste(unlist(.SD), collapse = ".")), by=by, .SDcols=by]$.nm.tech.split
+            ))
+    # handle nested split
+    if (flatten || length(by) == 1L) return(
+        lapply(ll, setattr, '.data.table.locked', NULL)
+    ) else if (length(by) > 1L) return(
+        lapply(ll, split.data.table, drop=drop, by=by[-1L], sorted=sorted, keep.by=keep.by, flatten=flatten)
+    )
 }
 
 # TO DO, add more warnings e.g. for by.data.table(), telling user what the data.table syntax is but letting them dispatch to data.frame if they want

diff --git a/README.md b/README.md
@@ -71,6 +71,8 @@
   27. `by` understands `colA:colB` syntax now, like `.SDcols` does, [#1395](https://github.com/Rdatatable/data.table/issues/1395). Thanks @franknarf1.
 
   28. Joins (and binary search based subsets) using `on=` argument now reuses existing (secondary) indices, [#1439](https://github.com/Rdatatable/data.table/issues/1439). Thanks @jangorecki.
+
+  29. New `split` method for data.table. Faster, more flexible and consistent with data.frame method. Closes [#1389](https://github.com/Rdatatable/data.table/issues/1389).
 
 #### BUG FIXES