Autoname j (#3802)

Rdatatable · Nov 14, 2019 · 883a136 · 883a136
1 parent 412a781
commit 883a136
Show file tree

Hide file tree

Showing 7 changed files with 70 additions and 37 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -6,6 +6,8 @@
 
 ## NEW FEATURES
 
+1. `DT[, {...; .(A,B)}]` (when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the column `A` before, and still does.
+
 ## BUG FIXES
 
 ## NOTES

diff --git a/R/data.table.R b/R/data.table.R
@@ -81,7 +81,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str
     }
   }
   if (isTRUE(stringsAsFactors)) {
-    for (j in which(vapply(ans, is.character, TRUE))) set(ans, NULL, j, as_factor(.subset2(ans, j)))
+    for (j in which(vapply_1b(ans, is.character))) set(ans, NULL, j, as_factor(.subset2(ans, j)))
     # as_factor is internal function in fread.R currently
   }
   setalloccol(ans)  # returns a NAMED==0 object, unlike data.frame()
@@ -854,29 +854,54 @@ replace_dot_alias = function(e) {
       }
 
       jvnames = NULL
-      if (is.name(jsub)) {
-        # j is a single unquoted column name
-        if (jsub!=".SD") {
-          jvnames = gsub("^[.](N|I|GRP|BY)$","\\1",as.character(jsub))
-          # jsub is list()ed after it's eval'd inside dogroups.
-        }
-      } else if (is.call(jsub) && as.character(jsub[[1L]])[[1L]] %chin% c("list",".")) {
-        jsub[[1L]] = quote(list)
-        jsubl = as.list.default(jsub)  # TO DO: names(jsub) and names(jsub)="" seem to work so make use of that
-        if (length(jsubl)>1L) {
-          jvnames = names(jsubl)[-1L]   # check list(a=sum(v),v)
-          if (is.null(jvnames)) jvnames = rep.int("", length(jsubl)-1L)
-          for (jj in seq.int(2L,length(jsubl))) {
-            if (jvnames[jj-1L] == "" && mode(jsubl[[jj]])=="name") {
-              if (jsubl[[jj]]=="") stop("Item ", jj-1L, " of the .() or list() passed to j is missing") #3507
-              jvnames[jj-1L] = gsub("^[.](N|I|GRP|BY)$", "\\1", deparse(jsubl[[jj]]))
+      drop_dot = function(x) {
+        tt = x %chin% c(".N",".I",".GRP",".BY")
+        if (any(tt)) x[tt] = substring(x[tt], 2L)
+        x
+      }
+      # handle auto-naming of last item of j (e.g. within {} or if/else, #2478)
+      #   e.g. DT[, .(a=sum(v), v, .N), by=] should create columns named a, v, N
+      do_j_names = function(q) {
+        if (!is.call(q) || !is.name(q[[1L]])) return(q)
+        if (as.character(q[[1L]]) %chin% c('list', '.')) {
+          q[[1L]] = quote(list)
+          qlen = length(q)
+          if (qlen>1L) {
+            nm = names(q[-1L])   # check list(a=sum(v),v)
+            if (is.null(nm)) nm = rep.int("", qlen-1L)
+            # attempt to auto-name unnamed columns
+            idx = which(!nzchar(nm))
+            for (jj in idx) {
+              thisq = q[[jj + 1L]]
+              if (missing(thisq)) stop("Item ", jj, " of the .() or list() passed to j is missing") #3507
+              if (is.name(thisq)) nm[jj] = as.character(thisq)
+              # TO DO: if call to a[1] for example, then call it 'a' too
             }
-            # TO DO: if call to a[1] for example, then call it 'a' too
+            nm[idx] = drop_dot(nm[idx])
+            if (!is.null(jvnames) && any(idx <- nm != jvnames))
+              warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names', call. = FALSE)
+            jvnames <<- nm # TODO: handle if() list(a, b) else list(b, a) better
+            setattr(q, "names", NULL)  # drops the names from the list so it's faster to eval the j for each group. We'll put them back afterwards on the result.
           }
-          setattr(jsubl, "names", NULL)  # drops the names from the list so it's faster to eval the j for each group. We'll put them back afterwards on the result.
-          jsub = as.call(jsubl)
-        } # else empty list is needed for test 468: adding an empty list column
-      } # else maybe a call to transform or something which returns a list.
+          return(q) # else empty list is needed for test 468: adding an empty list column
+        }
+        if (q[[1L]] == '{') {
+          q[[length(q)]] = do_j_names(q[[length(q)]])
+          return(q)
+        }
+        if (q[[1L]] == 'if') {
+          #explicit NULL would return NULL, assigning NULL would delete that from the expression
+          if (!is.null(q[[3L]])) q[[3L]] = do_j_names(q[[3L]])
+          if (length(q) == 4L && !is.null(q[[4L]])) q[[4L]] = do_j_names(q[[4L]])
+          return(q)
+        }
+        return(q)
+      }
+      if (is.name(jsub)) {
+        # j is a single unquoted column name
+        if (jsub!=".SD") jvnames = drop_dot(as.character(jsub))
+        # jsub is list()ed after it's eval'd inside dogroups.
+      } else jsub = do_j_names(jsub) # else maybe a call to transform or something which returns a list.
       av = all.vars(jsub,TRUE)  # TRUE fixes bug #1294 which didn't see b in j=fns[[b]](c)
       use.I = ".I" %chin% av
       if (any(c(".SD","eval","get","mget") %chin% av)) {
@@ -1441,7 +1466,7 @@ replace_dot_alias = function(e) {
   lockBinding(".iSD",SDenv)
 
   GForce = FALSE
-  if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && as.character(jsub)[[1L]] %chin% c(".SD",".N"))) ) {  # Ability to turn off if problems or to benchmark the benefit
+  if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && as.character(jsub)[1L] %chin% c(".SD", ".N"))) ) {  # Ability to turn off if problems or to benchmark the benefit
     # Optimization to reduce overhead of calling lapply over and over for each group
     oldjsub = jsub
     funi = 1L # Fix for #985
@@ -1600,7 +1625,7 @@ replace_dot_alias = function(e) {
     if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) {
       if (!length(ansvars) && !use.I) {
         GForce = FALSE
-        if ( (is.name(jsub) && jsub == ".N") || (is.call(jsub) && length(jsub)==2L && length(as.character(jsub[[1L]])) && as.character(jsub[[1L]])[1L] == "list" && length(as.character(jsub[[2L]])) && as.character(jsub[[2L]])[1L] == ".N") ) {
+        if ( (is.name(jsub) && jsub == ".N") || (is.call(jsub) && length(jsub)==2L && jsub[[1L]]== "list" && jsub[[2L]] == ".N") ) {
           GForce = TRUE
           if (verbose) cat("GForce optimized j to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="")
         }
@@ -2293,7 +2318,7 @@ copy = function(x) {
   if (!is.data.table(x)) {
     # fix for #1476. TODO: find if a cleaner fix is possible..
     if (is.list(x)) {
-      anydt = vapply(x, is.data.table, TRUE, USE.NAMES=FALSE)
+      anydt = vapply_1b(x, is.data.table, use.names=FALSE)
       if (sum(anydt)) {
         newx[anydt] = lapply(newx[anydt], function(x) {
           .Call(C_unlock, x)
@@ -2591,7 +2616,7 @@ setDF = function(x, rownames=NULL) {
     }
     x
   } else {
-    n = vapply(x, length, 0L)
+    n = vapply_1i(x, length)
     mn = max(n)
     if (any(n<mn))
       stop("All elements in argument 'x' to 'setDF' must be of same length")
@@ -2671,7 +2696,7 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
                                   # fail for NULL columns will give helpful error at that point, #3480 and #3471
       if (inherits(x[[i]], "POSIXlt")) stop("Column ", i, " is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.")
     }
-    n = vapply(x, length, 0L)
+    n = vapply_1i(x, length)
     n_range = range(n)
     if (n_range[1L] != n_range[2L]) {
       tbl = sort(table(n))
@@ -2892,7 +2917,7 @@ isReallyReal = function(x) {
   }
   if (length(i) == 0L) stop("Internal error in .isFastSubsettable. Please report to data.table developers") # nocov
   ## convert i to data.table with all combinations in rows.
-  if(length(i) > 1L && prod(vapply(i, length, integer(1L))) > 1e4){
+  if(length(i) > 1L && prod(vapply_1i(i, length)) > 1e4){
     ## CJ would result in more than 1e4 rows. This would be inefficient, especially memory-wise #2635
     if (verbose) {cat("Subsetting optimization disabled because the cross-product of RHS values exceeds 1e4, causing memory problems.\n");flush.console()}
     return(NULL)

diff --git a/R/fread.R b/R/fread.R
@@ -312,9 +312,9 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir())
   if (stringsAsFactors) {
     if (is.double(stringsAsFactors)) { #2025
       should_be_factor = function(v) is.character(v) && uniqueN(v) < nr * stringsAsFactors
-      cols_to_factor = which(vapply(ans, should_be_factor, logical(1L)))
+      cols_to_factor = which(vapply_1b(ans, should_be_factor))
     } else {
-      cols_to_factor = which(vapply(ans, is.character, logical(1L)))
+      cols_to_factor = which(vapply_1b(ans, is.character))
     }
     if (verbose) cat("stringsAsFactors=", stringsAsFactors, " converted ", length(cols_to_factor), " column(s): ", brackify(names(ans)[cols_to_factor]), "\n", sep="")
     for (j in cols_to_factor) set(ans, j=j, value=as_factor(.subset2(ans, j)))

diff --git a/R/groupingsets.R b/R/groupingsets.R
@@ -83,7 +83,7 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...)
     setcolorder(empty, c("grouping", by, setdiff(names(empty), c("grouping", by))))
   }
   # workaround for rbindlist fill=TRUE on integer64 #1459
-  int64.cols = vapply(empty, inherits, logical(1L), "integer64")
+  int64.cols = vapply_1b(empty, inherits, "integer64")
   int64.cols = names(int64.cols)[int64.cols]
   if (length(int64.cols) && !requireNamespace("bit64", quietly=TRUE))
     stop("Using integer64 class columns require to have 'bit64' package installed.") # nocov

diff --git a/R/print.data.table.R b/R/print.data.table.R
@@ -81,7 +81,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"),
       factor = "<fctr>", POSIXct = "<POSc>", logical = "<lgcl>",
       IDate = "<IDat>", integer64 = "<i64>", raw = "<raw>",
       expression = "<expr>", ordered = "<ord>")
-    classes = vapply(x, function(col) class(col)[1L], "", USE.NAMES=FALSE)
+    classes = vapply_1c(x, function(col) class(col)[1L], use.names=FALSE)
     abbs = unname(class_abb[classes])
     if ( length(idx <- which(is.na(abbs))) ) abbs[idx] = paste0("<", classes[idx], ">")
     toprint = rbind(abbs, toprint)

diff --git a/R/setops.R b/R/setops.R
@@ -41,8 +41,7 @@ funique = function(x) {
   if (!identical(sort(names(x)), sort(names(y)))) stop("x and y must have the same column names")
   if (!identical(names(x), names(y))) stop("x and y must have the same column order")
   bad_types = c("raw", "complex", if (block_list) "list")
-  found = bad_types %chin% c(vapply(x, typeof, FUN.VALUE = ""),
-                             vapply(y, typeof, FUN.VALUE = ""))
+  found = bad_types %chin% c(vapply_1c(x, typeof), vapply_1c(y, typeof))
   if (any(found)) stop("unsupported column type", if (sum(found) > 1L) "s" else "",
                        " found in x or y: ", brackify(bad_types[found]))
   super = function(x) {
@@ -176,7 +175,7 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu
   if (ignore.row.order) {
     if (".seqn" %chin% names(target))
       stop("None of the datasets to compare should contain a column named '.seqn'")
-    bad.type = setNames(c("raw","complex","list") %chin% c(vapply(current, typeof, FUN.VALUE = ""), vapply(target, typeof, FUN.VALUE = "")), c("raw","complex","list"))
+    bad.type = setNames(c("raw","complex","list") %chin% c(vapply_1c(current, typeof), vapply_1c(target, typeof)), c("raw","complex","list"))
     if (any(bad.type))
       stop("Datasets to compare with 'ignore.row.order' must not have unsupported column types: ", brackify(names(bad.type)[bad.type]))
     if (between(tolerance, 0, sqrt(.Machine$double.eps), incbounds=FALSE)) {

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -10271,11 +10271,11 @@ test(1725, DT, data.table(a=1:3, add0=NA_real_, add1=c(NA,NA,1.1), add2=c(NA,NA,
 DT = data.table(grp=rep(3:1,each=3), val=1:9)
 lastGrp = 0L
 test(1726.1, DT[, {ans=mean(val)+lastGrp; lastGrp<<-min(val); .(ans, .GRP)}, keyby=grp],
-           data.table(grp=1:3, V1=c(8,12,6), V2=1:3, key="grp") )
+           data.table(grp=1:3, ans=c(8,12,6), GRP=1:3, key="grp") )
 test(1726.2, lastGrp, 1L)
 lastGrp = -1L
 test(1726.3, DT[, {ans=mean(val)+lastGrp; lastGrp<<-min(val); .(ans, .GRP)}, by=grp],
-           data.table(grp=3:1, V1=c(1,6,12), V2=1:3) )
+           data.table(grp=3:1, ans=c(1,6,12), GRP=1:3) )
 test(1726.4, lastGrp, 7L)
 rm(lastGrp)
 
@@ -16370,6 +16370,13 @@ test(2120.07, iDT[(i_id), order(e_date, e_time)], c(3L,4L,1L,2L))  # wrapping wi
 test(2120.08, tmp[iDT[(i_id), order(e_date, e_time)]],             # different result with the NA
               data.table(i_id=c("A",NA,"B","C"), N=c(5L,NA,5L,5L)))
 
+# auto-name .() when it's the last item of {...} or wrapped with if(), #2478 #609
+DT = data.table(a = c(1, 1, 2), b = 4:6)
+test(2121.1, DT[ , {b = b; .(a, b = b + 1)}], DT[ , .(a, b=b+1)])
+test(2121.2, DT[ , {{{b = b; .(a, b = b + 1)}}}], DT[ , .(a, b=b+1)])
+test(2121.3, DT[ , if (.N > 1L) .(b), by=a], DT[1:2])
+test(2121.4, DT[ , if (.N > 1L) .(b) else .(c=b), by=a], DT[ , .(a, c=b)],
+     warning="Different branches of j expression produced different auto-named columns")
 
 ###################################
 #  Add new tests above this line  #