Skip to content

Commit

Permalink
Autoname j (#3802)
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelChirico authored and mattdowle committed Nov 14, 2019
1 parent 412a781 commit 883a136
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 37 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

## NEW FEATURES

1. `DT[, {...; .(A,B)}]` (when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the column `A` before, and still does.

## BUG FIXES

## NOTES
Expand Down
81 changes: 53 additions & 28 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str
}
}
if (isTRUE(stringsAsFactors)) {
for (j in which(vapply(ans, is.character, TRUE))) set(ans, NULL, j, as_factor(.subset2(ans, j)))
for (j in which(vapply_1b(ans, is.character))) set(ans, NULL, j, as_factor(.subset2(ans, j)))
# as_factor is internal function in fread.R currently
}
setalloccol(ans) # returns a NAMED==0 object, unlike data.frame()
Expand Down Expand Up @@ -854,29 +854,54 @@ replace_dot_alias = function(e) {
}

jvnames = NULL
if (is.name(jsub)) {
# j is a single unquoted column name
if (jsub!=".SD") {
jvnames = gsub("^[.](N|I|GRP|BY)$","\\1",as.character(jsub))
# jsub is list()ed after it's eval'd inside dogroups.
}
} else if (is.call(jsub) && as.character(jsub[[1L]])[[1L]] %chin% c("list",".")) {
jsub[[1L]] = quote(list)
jsubl = as.list.default(jsub) # TO DO: names(jsub) and names(jsub)="" seem to work so make use of that
if (length(jsubl)>1L) {
jvnames = names(jsubl)[-1L] # check list(a=sum(v),v)
if (is.null(jvnames)) jvnames = rep.int("", length(jsubl)-1L)
for (jj in seq.int(2L,length(jsubl))) {
if (jvnames[jj-1L] == "" && mode(jsubl[[jj]])=="name") {
if (jsubl[[jj]]=="") stop("Item ", jj-1L, " of the .() or list() passed to j is missing") #3507
jvnames[jj-1L] = gsub("^[.](N|I|GRP|BY)$", "\\1", deparse(jsubl[[jj]]))
drop_dot = function(x) {
tt = x %chin% c(".N",".I",".GRP",".BY")
if (any(tt)) x[tt] = substring(x[tt], 2L)
x
}
# handle auto-naming of last item of j (e.g. within {} or if/else, #2478)
# e.g. DT[, .(a=sum(v), v, .N), by=] should create columns named a, v, N
do_j_names = function(q) {
if (!is.call(q) || !is.name(q[[1L]])) return(q)
if (as.character(q[[1L]]) %chin% c('list', '.')) {
q[[1L]] = quote(list)
qlen = length(q)
if (qlen>1L) {
nm = names(q[-1L]) # check list(a=sum(v),v)
if (is.null(nm)) nm = rep.int("", qlen-1L)
# attempt to auto-name unnamed columns
idx = which(!nzchar(nm))
for (jj in idx) {
thisq = q[[jj + 1L]]
if (missing(thisq)) stop("Item ", jj, " of the .() or list() passed to j is missing") #3507
if (is.name(thisq)) nm[jj] = as.character(thisq)
# TO DO: if call to a[1] for example, then call it 'a' too
}
# TO DO: if call to a[1] for example, then call it 'a' too
nm[idx] = drop_dot(nm[idx])
if (!is.null(jvnames) && any(idx <- nm != jvnames))
warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names', call. = FALSE)
jvnames <<- nm # TODO: handle if() list(a, b) else list(b, a) better
setattr(q, "names", NULL) # drops the names from the list so it's faster to eval the j for each group. We'll put them back afterwards on the result.
}
setattr(jsubl, "names", NULL) # drops the names from the list so it's faster to eval the j for each group. We'll put them back afterwards on the result.
jsub = as.call(jsubl)
} # else empty list is needed for test 468: adding an empty list column
} # else maybe a call to transform or something which returns a list.
return(q) # else empty list is needed for test 468: adding an empty list column
}
if (q[[1L]] == '{') {
q[[length(q)]] = do_j_names(q[[length(q)]])
return(q)
}
if (q[[1L]] == 'if') {
#explicit NULL would return NULL, assigning NULL would delete that from the expression
if (!is.null(q[[3L]])) q[[3L]] = do_j_names(q[[3L]])
if (length(q) == 4L && !is.null(q[[4L]])) q[[4L]] = do_j_names(q[[4L]])
return(q)
}
return(q)
}
if (is.name(jsub)) {
# j is a single unquoted column name
if (jsub!=".SD") jvnames = drop_dot(as.character(jsub))
# jsub is list()ed after it's eval'd inside dogroups.
} else jsub = do_j_names(jsub) # else maybe a call to transform or something which returns a list.
av = all.vars(jsub,TRUE) # TRUE fixes bug #1294 which didn't see b in j=fns[[b]](c)
use.I = ".I" %chin% av
if (any(c(".SD","eval","get","mget") %chin% av)) {
Expand Down Expand Up @@ -1441,7 +1466,7 @@ replace_dot_alias = function(e) {
lockBinding(".iSD",SDenv)

GForce = FALSE
if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && as.character(jsub)[[1L]] %chin% c(".SD",".N"))) ) { # Ability to turn off if problems or to benchmark the benefit
if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && as.character(jsub)[1L] %chin% c(".SD", ".N"))) ) { # Ability to turn off if problems or to benchmark the benefit
# Optimization to reduce overhead of calling lapply over and over for each group
oldjsub = jsub
funi = 1L # Fix for #985
Expand Down Expand Up @@ -1600,7 +1625,7 @@ replace_dot_alias = function(e) {
if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) {
if (!length(ansvars) && !use.I) {
GForce = FALSE
if ( (is.name(jsub) && jsub == ".N") || (is.call(jsub) && length(jsub)==2L && length(as.character(jsub[[1L]])) && as.character(jsub[[1L]])[1L] == "list" && length(as.character(jsub[[2L]])) && as.character(jsub[[2L]])[1L] == ".N") ) {
if ( (is.name(jsub) && jsub == ".N") || (is.call(jsub) && length(jsub)==2L && jsub[[1L]]== "list" && jsub[[2L]] == ".N") ) {
GForce = TRUE
if (verbose) cat("GForce optimized j to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="")
}
Expand Down Expand Up @@ -2293,7 +2318,7 @@ copy = function(x) {
if (!is.data.table(x)) {
# fix for #1476. TODO: find if a cleaner fix is possible..
if (is.list(x)) {
anydt = vapply(x, is.data.table, TRUE, USE.NAMES=FALSE)
anydt = vapply_1b(x, is.data.table, use.names=FALSE)
if (sum(anydt)) {
newx[anydt] = lapply(newx[anydt], function(x) {
.Call(C_unlock, x)
Expand Down Expand Up @@ -2591,7 +2616,7 @@ setDF = function(x, rownames=NULL) {
}
x
} else {
n = vapply(x, length, 0L)
n = vapply_1i(x, length)
mn = max(n)
if (any(n<mn))
stop("All elements in argument 'x' to 'setDF' must be of same length")
Expand Down Expand Up @@ -2671,7 +2696,7 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
# fail for NULL columns will give helpful error at that point, #3480 and #3471
if (inherits(x[[i]], "POSIXlt")) stop("Column ", i, " is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.")
}
n = vapply(x, length, 0L)
n = vapply_1i(x, length)
n_range = range(n)
if (n_range[1L] != n_range[2L]) {
tbl = sort(table(n))
Expand Down Expand Up @@ -2892,7 +2917,7 @@ isReallyReal = function(x) {
}
if (length(i) == 0L) stop("Internal error in .isFastSubsettable. Please report to data.table developers") # nocov
## convert i to data.table with all combinations in rows.
if(length(i) > 1L && prod(vapply(i, length, integer(1L))) > 1e4){
if(length(i) > 1L && prod(vapply_1i(i, length)) > 1e4){
## CJ would result in more than 1e4 rows. This would be inefficient, especially memory-wise #2635
if (verbose) {cat("Subsetting optimization disabled because the cross-product of RHS values exceeds 1e4, causing memory problems.\n");flush.console()}
return(NULL)
Expand Down
4 changes: 2 additions & 2 deletions R/fread.R
Original file line number Diff line number Diff line change
Expand Up @@ -312,9 +312,9 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir())
if (stringsAsFactors) {
if (is.double(stringsAsFactors)) { #2025
should_be_factor = function(v) is.character(v) && uniqueN(v) < nr * stringsAsFactors
cols_to_factor = which(vapply(ans, should_be_factor, logical(1L)))
cols_to_factor = which(vapply_1b(ans, should_be_factor))
} else {
cols_to_factor = which(vapply(ans, is.character, logical(1L)))
cols_to_factor = which(vapply_1b(ans, is.character))
}
if (verbose) cat("stringsAsFactors=", stringsAsFactors, " converted ", length(cols_to_factor), " column(s): ", brackify(names(ans)[cols_to_factor]), "\n", sep="")
for (j in cols_to_factor) set(ans, j=j, value=as_factor(.subset2(ans, j)))
Expand Down
2 changes: 1 addition & 1 deletion R/groupingsets.R
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...)
setcolorder(empty, c("grouping", by, setdiff(names(empty), c("grouping", by))))
}
# workaround for rbindlist fill=TRUE on integer64 #1459
int64.cols = vapply(empty, inherits, logical(1L), "integer64")
int64.cols = vapply_1b(empty, inherits, "integer64")
int64.cols = names(int64.cols)[int64.cols]
if (length(int64.cols) && !requireNamespace("bit64", quietly=TRUE))
stop("Using integer64 class columns require to have 'bit64' package installed.") # nocov
Expand Down
2 changes: 1 addition & 1 deletion R/print.data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"),
factor = "<fctr>", POSIXct = "<POSc>", logical = "<lgcl>",
IDate = "<IDat>", integer64 = "<i64>", raw = "<raw>",
expression = "<expr>", ordered = "<ord>")
classes = vapply(x, function(col) class(col)[1L], "", USE.NAMES=FALSE)
classes = vapply_1c(x, function(col) class(col)[1L], use.names=FALSE)
abbs = unname(class_abb[classes])
if ( length(idx <- which(is.na(abbs))) ) abbs[idx] = paste0("<", classes[idx], ">")
toprint = rbind(abbs, toprint)
Expand Down
5 changes: 2 additions & 3 deletions R/setops.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,7 @@ funique = function(x) {
if (!identical(sort(names(x)), sort(names(y)))) stop("x and y must have the same column names")
if (!identical(names(x), names(y))) stop("x and y must have the same column order")
bad_types = c("raw", "complex", if (block_list) "list")
found = bad_types %chin% c(vapply(x, typeof, FUN.VALUE = ""),
vapply(y, typeof, FUN.VALUE = ""))
found = bad_types %chin% c(vapply_1c(x, typeof), vapply_1c(y, typeof))
if (any(found)) stop("unsupported column type", if (sum(found) > 1L) "s" else "",
" found in x or y: ", brackify(bad_types[found]))
super = function(x) {
Expand Down Expand Up @@ -176,7 +175,7 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu
if (ignore.row.order) {
if (".seqn" %chin% names(target))
stop("None of the datasets to compare should contain a column named '.seqn'")
bad.type = setNames(c("raw","complex","list") %chin% c(vapply(current, typeof, FUN.VALUE = ""), vapply(target, typeof, FUN.VALUE = "")), c("raw","complex","list"))
bad.type = setNames(c("raw","complex","list") %chin% c(vapply_1c(current, typeof), vapply_1c(target, typeof)), c("raw","complex","list"))
if (any(bad.type))
stop("Datasets to compare with 'ignore.row.order' must not have unsupported column types: ", brackify(names(bad.type)[bad.type]))
if (between(tolerance, 0, sqrt(.Machine$double.eps), incbounds=FALSE)) {
Expand Down
11 changes: 9 additions & 2 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -10271,11 +10271,11 @@ test(1725, DT, data.table(a=1:3, add0=NA_real_, add1=c(NA,NA,1.1), add2=c(NA,NA,
DT = data.table(grp=rep(3:1,each=3), val=1:9)
lastGrp = 0L
test(1726.1, DT[, {ans=mean(val)+lastGrp; lastGrp<<-min(val); .(ans, .GRP)}, keyby=grp],
data.table(grp=1:3, V1=c(8,12,6), V2=1:3, key="grp") )
data.table(grp=1:3, ans=c(8,12,6), GRP=1:3, key="grp") )
test(1726.2, lastGrp, 1L)
lastGrp = -1L
test(1726.3, DT[, {ans=mean(val)+lastGrp; lastGrp<<-min(val); .(ans, .GRP)}, by=grp],
data.table(grp=3:1, V1=c(1,6,12), V2=1:3) )
data.table(grp=3:1, ans=c(1,6,12), GRP=1:3) )
test(1726.4, lastGrp, 7L)
rm(lastGrp)

Expand Down Expand Up @@ -16370,6 +16370,13 @@ test(2120.07, iDT[(i_id), order(e_date, e_time)], c(3L,4L,1L,2L)) # wrapping wi
test(2120.08, tmp[iDT[(i_id), order(e_date, e_time)]], # different result with the NA
data.table(i_id=c("A",NA,"B","C"), N=c(5L,NA,5L,5L)))

# auto-name .() when it's the last item of {...} or wrapped with if(), #2478 #609
DT = data.table(a = c(1, 1, 2), b = 4:6)
test(2121.1, DT[ , {b = b; .(a, b = b + 1)}], DT[ , .(a, b=b+1)])
test(2121.2, DT[ , {{{b = b; .(a, b = b + 1)}}}], DT[ , .(a, b=b+1)])
test(2121.3, DT[ , if (.N > 1L) .(b), by=a], DT[1:2])
test(2121.4, DT[ , if (.N > 1L) .(b) else .(c=b), by=a], DT[ , .(a, c=b)],
warning="Different branches of j expression produced different auto-named columns")

###################################
# Add new tests above this line #
Expand Down

0 comments on commit 883a136

Please sign in to comment.