From 04eeb304e95f889e9ea4087af741f938b12b26d8 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 13 Jun 2019 20:04:04 -0500 Subject: [PATCH] as.data.table.list centralized (#3471) --- NEWS.md | 22 +++++- R/as.data.table.R | 106 ++++++++++++++++--------- R/data.table.R | 180 ++++++++++-------------------------------- R/setkey.R | 18 ++--- R/utils.R | 12 +-- inst/tests/tests.Rraw | 90 ++++++++++++++++----- src/init.c | 2 - src/wrappers.c | 23 +----- 8 files changed, 213 insertions(+), 240 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9e8c5a76c..0daf8b482 100644 --- a/NEWS.md +++ b/NEWS.md @@ -92,6 +92,8 @@ 15. `DT[order(col)[1:5], ...]` (i.e. where `i` is a compound expression involving `order()`) is now optimized to use `data.table`'s multithreaded `forder`, [#1921](https://github.com/Rdatatable/data.table/issues/1921). This example is not a fully optimal top-N query since the full ordering is still computed. The improvement is that the call to `order()` is computed faster for any `i` expression using `order`. +16. `as.data.table` now unpacks columns in a `data.frame` which are themselves a `data.frame`. This need arises when parsing JSON, a corollary in [#3369](https://github.com/Rdatatable/data.table/issues/3369#issuecomment-462662752). `data.table` does not allow columns to be objects which themselves have columns (such as `matrix` and `data.frame`), unlike `data.frame` which does. Bug fix 19 in v1.12.2 (see below) added a helpful error (rather than segfault) to detect such invalid `data.table`, and promised that `as.data.table()` would unpack these columns in the next release (i.e. this release) so that the invalid `data.table` is not created in the first place. + #### BUG FIXES 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. @@ -134,7 +136,23 @@ 20. `c`, `seq` and `mean` of `ITime` objects now retain the `ITime` class via new `ITime` methods, [#3628](https://github.com/Rdatatable/data.table/issues/3628). Thanks @UweBlock for reporting. The `cut` and `split` methods for `ITime` have been removed since the default methods work, [#3630](https://github.com/Rdatatable/data.table/pull/3630). -20. `as.data.table.array` now handles the case when some of the array's dimension names are `NULL`, [#3636](https://github.com/Rdatatable/data.table/issues/3636). +21. `as.data.table.array` now handles the case when some of the array's dimension names are `NULL`, [#3636](https://github.com/Rdatatable/data.table/issues/3636). + +22. Adding a `list` column using `cbind`, `as.data.table`, or `data.table` now works rather than treating the `list` as if it were a set of columns, [#3471](https://github.com/Rdatatable/data.table/pull/3471). However, please note that using `:=` to add columns is preferred. + + ```R + # cbind( data.table(1:2), list(c("a","b"),"a") ) + # V1 V2 NA # v1.12.2 and before + # # introduced invalid NA column name too + # 1: 1 a a + # 2: 2 b a + # + # V1 V2 # v1.12.4+ + # + # 1: 1 a,b + # 2: 2 a + ``` + #### NOTES @@ -231,7 +249,7 @@ 18. `cbind` with a null (0-column) `data.table` now works as expected, [#3445](https://github.com/Rdatatable/data.table/issues/3445). Thanks to @mb706 for reporting. -19. Subsetting does a better job of catching a malformed `data.table` with error rather than segfault. A column may not be NULL, nor may a column be an object such as a data.frame or matrix which have columns. Thanks to a comment and reproducible example in [#3369](https://github.com/Rdatatable/data.table/issues/3369) from Drew Abbot which demonstrated the issue which arose from parsing JSON. The next release will enable `as.data.table` to unpack columns which are data.frame to support this use case. +19. Subsetting does a better job of catching a malformed `data.table` with error rather than segfault. A column may not be NULL, nor may a column be an object which has columns (such as a `data.frame` or `matrix`). Thanks to a comment and reproducible example in [#3369](https://github.com/Rdatatable/data.table/issues/3369) from Drew Abbot which demonstrated the issue which arose from parsing JSON. The next release will enable `as.data.table` to unpack columns which are `data.frame` to support this use case. #### NOTES diff --git a/R/as.data.table.R b/R/as.data.table.R index 8b87a9bc2..8aac617e9 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -115,49 +115,75 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va ans[] } -as.data.table.list = function(x, keep.rownames=FALSE, key=NULL, ...) { - wn = sapply(x,is.null) - if (any(wn)) x = x[!wn] - if (!length(x)) return( null.data.table() ) - # fix for #833, as.data.table.list with matrix/data.frame/data.table as a list element.. - # TODO: move this entire logic (along with data.table() to C - for (i in seq_along(x)) { - dims = dim(x[[i]]) - if (!is.null(dims)) { - ans = do.call("data.table", x) - setnames(ans, make.unique(names(ans))) - return(ans) +as.data.table.list = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE, ...) { + n = length(x) + eachnrow = integer(n) # vector of lengths of each column. may not be equal if silent repetition is required. + eachncol = integer(n) + missing.check.names = missing(check.names) + for (i in seq_len(n)) { + xi = x[[i]] + if (is.null(xi)) next # eachncol already initialized to 0 by integer() above + if (!is.null(dim(xi)) && missing.check.names) check.names=TRUE + if ("POSIXlt" %chin% class(xi)) { + warning("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") + xi = x[[i]] = as.POSIXct(xi) + } else if (is.matrix(xi) || is.data.frame(xi)) { + if (!is.data.table(xi)) { + xi = x[[i]] = as.data.table(xi, keep.rownames=keep.rownames) # we will never allow a matrix to be a column; always unpack the columns + } + # else avoid dispatching to as.data.table.data.table (which exists and copies) + } else if (is.table(xi)) { + xi = x[[i]] = as.data.table.table(xi, keep.rownames=keep.rownames) + } else if (is.function(xi)) { + xi = x[[i]] = list(xi) } + eachnrow[i] = NROW(xi) # for a vector (including list() columns) returns the length + eachncol[i] = NCOL(xi) # for a vector returns 1 } - n = vapply(x, length, 0L) - mn = max(n) - x = copy(x) - idx = which(n < mn) - if (length(idx)) { - for (i in idx) { - # any is.null(x[[i]]) were removed above, otherwise warning when a list element is NULL - if (inherits(x[[i]], "POSIXlt")) { - warning("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") - x[[i]] = as.POSIXct(x[[i]]) - } - # Implementing FR #4813 - recycle with warning when nr %% nrows[i] != 0L - if (!n[i] && mn) - warning("Item ", i, " is of size 0 but maximum size is ", mn, ", therefore recycled with 'NA'") - else if (n[i] && mn %% n[i] != 0L) - warning("Item ", i, " is of size ", n[i], " but maximum size is ", mn, " (recycled leaving a remainder of ", mn%%n[i], " items)") - x[[i]] = rep(x[[i]], length.out=mn) + ncol = sum(eachncol) # hence removes NULL items silently (no error or warning), #842. + if (ncol==0L) return(null.data.table()) + nrow = max(eachnrow) + ans = vector("list",ncol) # always return a new VECSXP + recycle = function(x, nrow) { + if (length(x)==nrow) { + return(copy(x)) + # This copy used to be achieved via .Call(CcopyNamedInList,x) at the top of data.table(). It maintains pre-Rv3.1.0 + # behavior, for now. See test 548.2. The copy() calls duplicate() at C level which (importantly) also expands ALTREP objects. + # TODO: port this as.data.table.list() to C and use MAYBE_REFERENCED(x) || ALTREP(x) to save some copies. + # That saving used to be done by CcopyNamedInList but the copies happened again as well, so removing CcopyNamedInList is + # not worse than before, and gets us in a better centralized place to port as.data.table.list to C and use MAYBE_REFERENCED + # again in future. } + if (identical(x,list())) vector("list", nrow) else rep(x, length.out=nrow) # new objects don't need copy } - # fix for #842 - if (mn > 0L) { - nz = which(n > 0L) - xx = point(vector("list", length(nz)), seq_along(nz), x, nz) - if (!is.null(names(x))) - setattr(xx, 'names', names(x)[nz]) - x = xx + vnames = character(ncol) + k = 1L + for(i in seq_len(n)) { + xi = x[[i]] + if (is.null(xi)) next + if (eachnrow[i]>1L && nrow%%eachnrow[i]!=0L) # in future: eachnrow[i]!=nrow + warning("Item ", i, " has ", eachnrow[i], " rows but longest item has ", nrow, "; recycled with remainder.") + if (eachnrow[i]==0L && nrow>0L && is.atomic(xi)) # is.atomic to ignore list() since list() is a common way to initialize; let's not insist on list(NULL) + warning("Item ", i, " has 0 rows but longest item has ", nrow, "; filled with NA") # the rep() in recycle() above creates the NA vector + if (is.data.table(xi)) { # matrix and data.frame were coerced to data.table above + # vnames[[i]] = names(xi) #if (nm!="" && n>1L) paste(nm, names(xi), sep=".") else names(xi) + for (j in seq_along(xi)) { + ans[[k]] = recycle(xi[[j]], nrow) + vnames[k] = names(xi)[j] + k = k+1L + } + } else { + nm = names(x)[i] + vnames[k] = if (length(nm) && !is.na(nm) && nm!="") nm else paste0("V",i) + ans[[k]] = recycle(xi, nrow) + k = k+1L + } } - setDT(x, key=key) # copy ensured above; also, setDT handles naming - x + if (any(vnames==".SD")) stop("A column may not be called .SD. That has special meaning.") + if (check.names) vnames = make.names(vnames, unique=TRUE) + setattr(ans, "names", vnames) + setDT(ans, key=key) # copy ensured above; also, setDT handles naming + ans } # don't retain classes before "data.frame" while converting @@ -180,6 +206,10 @@ as.data.table.data.frame = function(x, keep.rownames=FALSE, key=NULL, ...) { setnames(ans, 'rn', keep.rownames[1L]) return(ans) } + if (any(!sapply(x,is.atomic))) { + # a data.frame with a column that is data.frame needs to be expanded; test 2013.4 + return(as.data.table.list(x, keep.rownames=keep.rownames, ...)) + } ans = copy(x) # TO DO: change this deep copy to be shallow. setattr(ans, "row.names", .set_row_names(nrow(x))) diff --git a/R/data.table.R b/R/data.table.R index 113d8bc7d..fc80068eb 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -47,137 +47,39 @@ null.data.table = function() { data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE) { - # NOTE: It may be faster in some circumstances to create a data.table by creating a list l first, and then setattr(l,"class",c("data.table","data.frame")) at the expense of checking. - # TO DO: rewrite data.table(), one of the oldest functions here. Many people use data.table() to convert data.frame rather than - # as.data.table which is faster; speed could be better. Revisit how many copies are taken in for example data.table(DT1,DT2) which - # cbind directs to. And the nested loops for recycling lend themselves to being C level. - - x = list(...) # doesn't copy named inputs as from R >= 3.1.0 (a very welcome change) - .Call(CcopyNamedInList,x) # to maintain pre-Rv3.1.0 behaviour, for now. See test 548.2. TODO: revist - # TODO Something strange with NAMED on components of `...` to investigate. Or, just port data.table() to C. - - if (length(x) < 1L) - return( null.data.table() ) - # fix for #5377 - data.table(null list, data.frame and data.table) should return null data.table. Simple fix: check all scenarios here at the top. - if (identical(x, list(NULL)) || identical(x, list(list())) || - identical(x, list(data.frame(NULL))) || identical(x, list(data.table(NULL)))) return( null.data.table() ) - nd = name_dots(...) - myNCOL = function(x) if (is.null(x)) 0L else NCOL(x) # tmp fix (since NCOL(NULL)==1) until PR#3471 goes ahead in v1.12.4 - if (any(nocols<-sapply(x, myNCOL)==0L)) { tt=!nocols; x=x[tt]; nd=lapply(nd,'[',tt); } # data.table(data.table(), data.table(a=integer())), #3445 - vnames = nd$vnames - novname = nd$novname # novname used later to know which were explicitly supplied in the call - n = length(x) - if (length(vnames) != n) stop("logical error in vnames") # nocov - # cast to a list to facilitate naming of columns with dimension -- - # unlist() at the end automatically handles the need to "push" names - # to accommodate the "new" columns - vnames = as.list.default(vnames) - nrows = integer(n) # vector of lengths of each column. may not be equal if silent repetition is required. - numcols = integer(n) # the ncols of each of the inputs (e.g. if inputs contain matrix or data.table) - for (i in seq_len(n)) { - xi = x[[i]] - if (is.null(xi)) stop("Internal error: NULL item ", i," should have been removed from list above") # nocov - if ("POSIXlt" %chin% class(xi)) { - warning("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date. Use as.POSIXct to avoid this warning.") - x[[i]] = as.POSIXct(xi) - } else if (is.matrix(xi) || is.data.frame(xi)) { # including data.table (a data.frame, too) - xi = as.data.table(xi, keep.rownames=keep.rownames) # TO DO: allow a matrix to be a column of a data.table. This could allow a key'd lookup to a matrix, not just by a single rowname vector, but by a combination of several columns. A matrix column could be stored either by row or by column contiguous in memory. - x[[i]] = xi - numcols[i] = length(xi) - } else if (is.table(xi)) { - x[[i]] = xi = as.data.table.table(xi, keep.rownames=keep.rownames) - numcols[i] = length(xi) - } else if (is.function(xi)) { - x[[i]] = xi = list(xi) - } - nrows[i] = NROW(xi) # for a vector (including list() columns) returns the length - if (numcols[i]>0L) { - namesi = names(xi) # works for both data.frame's, matrices and data.tables's - if (length(namesi)==0L) namesi = rep.int("",ncol(xi)) - namesi[is.na(namesi)] = "" - tt = namesi=="" - if (any(tt)) namesi[tt] = paste0("V", which(tt)) - if (novname[i]) vnames[[i]] = namesi - else vnames[[i]] = paste(vnames[[i]], namesi, sep=".") - } - } - nr = max(nrows) - ckey = NULL - recycledkey = FALSE - for (i in seq_len(n)) { - xi = x[[i]] - if (is.data.table(xi) && haskey(xi)) { - if (nrows[i] 1L) "s", " in the longest column. Or, all columns can be 0 length, for insert()ing rows into.") - # Implementing FR #4813 - recycle with warning when nr %% nrows[i] != 0L - if (nr%%nrows[i] != 0L) warning("Item ", i, " is of size ", nrows[i], " but maximum size is ", nr, " (recycled leaving remainder of ", nr%%nrows[i], " items)") - if (is.data.frame(xi)) { # including data.table - ..i = rep(seq_len(nrow(xi)), length.out = nr) - x[[i]] = xi[..i,,drop=FALSE] - next - } - if (is.atomic(xi) || is.list(xi)) { - # TO DO: surely use set() here, or avoid the coercion - x[[i]] = rep(xi, length.out = nr) - next - } - stop("problem recycling column ",i,", try a simpler type") - } - if (any(numcols>0L)) { - value = vector("list",sum(pmax(numcols,1L))) - k = 1L - for(i in seq_len(n)) { - if (is.list(x[[i]]) && !is.ff(x[[i]])) { - for(j in seq_len(length(x[[i]]))) { - value[[k]] = x[[i]][[j]] - k=k+1L - } - } else { - value[[k]] = x[[i]] - k=k+1L - } - } - } else { - value = x - } - vnames = unlist(vnames) - if (check.names) # default FALSE - vnames = make.names(vnames, unique = TRUE) - setattr(value,"names",vnames) - setattr(value,"row.names",.set_row_names(nr)) - setattr(value,"class",c("data.table","data.frame")) + # NOTE: It may be faster in some circumstances for users to create a data.table by creating a list l + # first, and then setattr(l,"class",c("data.table","data.frame")) and forgo checking. + x = list(...) # list() doesn't copy named inputs as from R >= 3.1.0 (a very welcome change) + names(x) = name_dots(...) + if (length(x)==0L) return( null.data.table() ) + if (length(x)==1L && (is.null(x[[1L]]) || (is.list(x[[1L]]) && length(x[[1L]])==0L))) return( null.data.table() ) #5377 + ans = as.data.table.list(x, keep.rownames=keep.rownames, check.names=check.names) # see comments inside as.data.table.list re copies if (!is.null(key)) { if (!is.character(key)) stop("key argument of data.table() must be character") if (length(key)==1L) { key = strsplit(key,split=",")[[1L]] # eg key="A,B"; a syntax only useful in key argument to data.table(), really. } - setkeyv(value,key) + setkeyv(ans,key) } else { - # retain key of cbind(DT1, DT2, DT3) where DT2 is keyed but not DT1. cbind calls data.table(). - # If DT inputs with keys have been recycled then can't retain key - if (length(ckey) - && !recycledkey - && !any(duplicated(ckey)) - && all(ckey %chin% names(value)) - && !any(duplicated(names(value)[names(value) %chin% ckey]))) - setattr(value, "sorted", ckey) + # retain key of cbind(DT1, DT2, DT3) where DT2 is keyed but not DT1. cbind calls data.table(). + # If DT inputs with keys have been recycled then can't retain key + ckey = NULL + for (i in seq_along(x)) { + xi = x[[i]] + if (is.data.table(xi) && haskey(xi) && nrow(xi)==nrow(ans)) ckey=c(ckey, key(xi)) + } + if (length(ckey) && + !anyDuplicated(ckey) && + identical(is.na(chmatchdup(c(ckey,ckey), names(ans))), rep(c(FALSE,TRUE),each=length(ckey)))) { + setattr(ans, "sorted", ckey) + } } if (isTRUE(stringsAsFactors)) { - for (j in which(vapply(value, is.character, TRUE))) set(value, NULL, j, as_factor(.subset2(value, j))) + for (j in which(vapply(ans, is.character, TRUE))) set(ans, NULL, j, as_factor(.subset2(ans, j))) # as_factor is internal function in fread.R currently } - alloc.col(value) # returns a NAMED==0 object, unlike data.frame() + alloc.col(ans) # returns a NAMED==0 object, unlike data.frame() } replace_dot_alias = function(e) { @@ -1399,12 +1301,15 @@ replace_order = function(isub, verbose, env) { jval = data.table(jval) # TO DO: should this be setDT(list(jval)) instead? } else { if (is.null(jvnames)) jvnames=names(jval) - # avoid copy if all vectors are already of same lengths, use setDT lenjval = vapply(jval, length, 0L) - if (any(lenjval != lenjval[1L])) { - jval = as.data.table.list(jval) # does the vector expansion to create equal length vectors - jvnames = jvnames[lenjval != 0L] # fix for #1477 - } else setDT(jval) + nulljval = vapply(jval, is.null, FALSE) + if (lenjval[1L]==0L || any(lenjval != lenjval[1L])) { + jval = as.data.table.list(jval) # does the vector expansion to create equal length vectors, and drops any NULL items + jvnames = jvnames[!nulljval] # fix for #1477 + } else { + # all columns same length and at least 1 row; avoid copy. TODO: remove when as.data.table.list is ported to C + setDT(jval) + } } if (is.null(jvnames)) jvnames = character(length(jval)-length(bynames)) ww = which(jvnames=="") @@ -1420,9 +1325,7 @@ replace_order = function(isub, verbose, env) { setattr(jval, 'class', class(x)) # fix for #5296 if (haskey(x) && all(key(x) %chin% names(jval)) && suppressWarnings(is.sorted(jval, by=key(x)))) # TO DO: perhaps this usage of is.sorted should be allowed internally then (tidy up and make efficient) setattr(jval, 'sorted', key(x)) - # postponed to v1.12.4 because package eplusr creates a NULL column and then runs setcolorder on the result which fails if there are fewer columns - # w = sapply(jval, is.null) - # if (any(w)) jval = jval[,!w,with=FALSE] # no !..w due to 'Undefined global functions or variables' note from R CMD check + if (any(sapply(jval, is.null))) stop("Internal error: j has created a data.table result containing a NULL column") # nocov } return(jval) } @@ -2196,7 +2099,6 @@ within.data.table = function (data, expr, ...) ans } - transform.data.table = function (`_data`, ...) # basically transform.data.frame with data.table instead of data.frame, and retains key { @@ -2206,9 +2108,11 @@ transform.data.table = function (`_data`, ...) inx = chmatch(tags, names(`_data`)) matched = !is.na(inx) if (any(matched)) { - if (isTRUE(attr(`_data`, ".data.table.locked", TRUE))) setattr(`_data`, ".data.table.locked", NULL) # fix for #1641 + if (isTRUE(attr(`_data`, ".data.table.locked", TRUE))) { + setattr(`_data`, ".data.table.locked", NULL) # fix for #1641, now covered by test 104.2 + } `_data`[,inx[matched]] = e[matched] - `_data` = data.table(`_data`) + `_data` = as.data.table(`_data`) } if (!all(matched)) { ans = do.call("data.table", c(list(`_data`), e[!matched])) @@ -2788,18 +2692,18 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { } else if (is.list(x)) { # copied from as.data.table.list - except removed the copy for (i in seq_along(x)) { - if (inherits(x[[i]], "POSIXlt")) - stop("Column ", i, " is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") + if (is.null(x[[i]])) next # allow NULL columns to be created by setDT(list) even though they are not really allowed + # many operations still work in the presence of NULL columns and it might be convenient + # e.g. in package eplusr which calls setDT on a list when parsing JSON. Operations which + # fail for NULL columns will give helpful error at that point, #3480 and #3471 + if (inherits(x[[i]], "POSIXlt")) stop("Column ", i, " is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") } n = vapply(x, length, 0L) n_range = range(n) if (n_range[1L] != n_range[2L]) { tbl = sort(table(n)) - stop("All elements in argument 'x' to 'setDT' must be of same length, ", - "but the profile of input lengths (length:frequency) is: ", - brackify(sprintf('%s:%d', names(tbl), tbl)), - "\nThe first entry with fewer than ", n_range[2L], - " entries is ", which.max(n dt)) @@ -2633,8 +2635,8 @@ test(927, DT[, if(a==2L) list(42:43,numeric()) else list(42L,3.14), by=a], data. test(928, cbind(data.table(a=1L),b=1:3), data.table(a=1L,b=1:3)) # FR #4813 implementation resulted in changing 929 error to warning # test(929, cbind(data.table(a=1L,b=2:3),c=1:3), error="argument 1 (nrow 2) cannot be recycled without remainder to match longest nrow (3)") -test(929, cbind(data.table(a=1L,b=2:3),c=1:3), data.table(a=1L, b=c(2L,3L,2L), c=1:3), warning="Item 1 is of size 2 but maximum size is 3") -test(930, cbind(data.table(a=1L,b=2:3),c=1:4), data.table(a=1L,b=INT(2,3,2,3),c=1:4)) +test(929, cbind(data.table(a=1L,b=2:3),c=1:3), data.table(a=1L, b=c(2L,3L,2L), c=1:3), warning="Item 1 has 2 rows but longest item has 3; recycled.") +test(930, cbind(data.table(a=1L,b=2:3),c=1:4), data.table(a=1L,b=INT(2,3,2,3),c=1:4)) # TODO: warning in future DT = data.table(x=c(1,1,1,1,2,2,3),y=c(1,1,2,3,1,1,2)) DT[,rep:=1L][c(2,7),rep:=c(2L,3L)] # duplicate row 2 and triple row 7 DT[,num:=1:.N] # to group each row by itself @@ -3710,10 +3712,10 @@ test(1139, capture.output(print(DT)), c(" x y", "1: a ~ b 1", "2: # FR #4813 - provide warnings if there are remainders for both as.data.table.list(.) and data.table(.) X = list(a = 1:2, b = 1:3) -test(1140, as.data.table(X), data.table(a=c(1:2,1L), b=c(1:3)), warning="Item 1 is of size 2 but maximum") -test(1141.1, data.table(a=1:2, b=1:3), data.table(a=c(1L,2L,1L), b=1:3), warning="Item 1 is of size 2 but maximum") -test(1141.2, data.table(a=1:2, data.table(x=1:5, y=6:10)), data.table(a=c(1L,2L,1L,2L,1L), x=1:5, y=6:10), warning="Item 1 is of size 2 but maximum") -test(1141.3, data.table(a=1:5, data.table(x=c(1,2), y=c(3,4))), data.table(a=c(1:5), x=c(1,2,1,2,1), y=c(3,4,3,4,3)), warning="Item 2 is of size 2 but maximum") +test(1140, as.data.table(X), data.table(a=c(1:2,1L), b=c(1:3)), warning="Item 1 has 2 rows but longest item has 3; recycled") +test(1141.1, data.table(a=1:2, b=1:3), data.table(a=c(1L,2L,1L), b=1:3), warning="Item 1 has 2 rows but longest item has 3; recycled") +test(1141.2, data.table(a=1:2, data.table(x=1:5, y=6:10)), data.table(a=c(1L,2L,1L,2L,1L), x=1:5, y=6:10), warning="Item 1 has 2 rows but longest item has 5; recycled") +test(1141.3, data.table(a=1:5, data.table(x=c(1,2), y=c(3,4))), data.table(a=c(1:5), x=c(1,2,1,2,1), y=c(3,4,3,4,3)), warning="Item 2 has 2 rows but longest item has 5; recycled") # Fix for bug #5098 - DT[, foo()] returns function definition. DT <- data.table(a=1:2) @@ -5729,7 +5731,7 @@ test(1380, DT[a==TRUE], DT[3:4]) # Fix #847, as.data.table.list and character(0) issue x <- data.table(a=character(0), b=character(0), c=numeric(0)) setkey(x, a, b) -test(1381, x[J("foo", character(0)), nomatch=0L], x, warning="Item 2 is of size 0 but maximum size is 1,") +test(1381, x[J("foo", character(0)), nomatch=0L], x, warning="Item 2 has 0 rows but longest item has 1; filled with NA") # Fix for #813 and #758 DT = data.table(x = 1:2) @@ -6743,14 +6745,18 @@ test(1483.3, merge(x,y,by="country",all=TRUE), data.table(country=factor(c("US", setkey(y) test(1483.4, y[x], data.table(country="US", key="country")) -# Fix for #842 +# NULL items should be removed when making data.table from list, #842 +# Original fix for #842 added a branch in as.data.table.list() using point() +# Then PR#3471 moved logic from data.table() into as.data.table.list() and now removes NULL items up front, so longer need for the branch +# Since the logic was changed, this test was strengthened to explicity test the result rather than compare two calls to SomeFunction() SomeFunction <- function(x, setnull=1L) { ans <- replicate(length(x), list("bla1", "bla2"), simplify=FALSE) ans[setnull] <- list(NULL) return(ans) } DT <- data.table(ID=1:3, key="ID") -test(1484, DT[, SomeFunction(ID, setnull=1L)], DT[, SomeFunction(ID, setnull=2L)]) +test(1484.1, DT[, SomeFunction(ID, setnull=1L)], ans<-data.table(V1=list("bla1","bla2"), V2=list("bla1","bla2"))) +test(1484.2, DT[, SomeFunction(ID, setnull=2L)], ans) # Fix for #868 vals = c("setosa", "versicolor", "virginica") @@ -7096,8 +7102,10 @@ x = c(1, 2, 1) y = c(5, 8, 8, 4) options(datatable.CJ.names=FALSE) test(1525.1, CJ(x, y, unique=TRUE), CJ(V1=c(1,2), V2=c(4,5,8))) +test(1525.2, CJ(x, z=y, unique=TRUE), ans<-data.table(V1=rep(c(1,2), each=3), z=c(4,5,8), key="V1,z")) # naming of one but not both, too options(datatable.CJ.names=TRUE) -test(1525.2, CJ(x, y, unique=TRUE), CJ( x=c(1,2), y=c(4,5,8))) +test(1525.3, CJ(x, y, unique=TRUE), CJ( x=c(1,2), y=c(4,5,8))) +test(1525.4, CJ(x, z=y, unique=TRUE), setnames(ans,c("x","z"))) # `key` argument fix for `setDT` when input is already a `data.table`, #1169 DT <- data.table(A = 1:4, B = 5:8) @@ -8360,9 +8368,9 @@ test(1613.564, all(is.character(all.equal(rbind(x,y), rbind(y,y), ignore.row.ord test(1613.565, all(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = FALSE), is.character(r<-all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = TRUE)) && any(grepl("force 'tolerance' argument to 0", r)))) # no-match due factor force tolerance=0 test(1613.566, all(all.equal(rbind(x,y,y), rbind(x,y,y), ignore.row.order = FALSE, tolerance = 0), all.equal(rbind(x,y,y), rbind(x,y,y), ignore.row.order = TRUE, tolerance = 0))) test(1613.567, all(is.character(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = FALSE, tolerance = 0)), is.character(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = TRUE, tolerance = 0)))) -test(1613.571, all(all.equal(cbind(x, list(factor(1))), cbind(y, list(factor(1))), ignore.row.order = FALSE), is.character(r<-all.equal(cbind(x, list(factor(1))), cbind(y, list(factor(1))), ignore.row.order = TRUE)) && any(grepl("force 'tolerance' argument to 0", r)))) # no-match due factor force tolerance=0 -test(1613.572, all(all.equal(cbind(x, list(factor(1))), cbind(x, list(factor(1))), ignore.row.order = FALSE), all.equal(cbind(x, list(factor(1))), cbind(x, list(factor(1))), ignore.row.order = TRUE))) # x to x with factor equality -test(1613.573, all.equal(cbind(x, list(factor(1))), cbind(x, list(factor(1))), ignore.row.order = TRUE, tolerance = 1), error = "Factor columns and ignore.row.order cannot be used with non 0 tolerance argument") # error due to provided non zero tolerance +test(1613.571, all(all.equal(cbind(x, factor(1)), cbind(y, factor(1)), ignore.row.order = FALSE), is.character(r<-all.equal(cbind(x, factor(1)), cbind(y, factor(1)), ignore.row.order = TRUE)) && any(grepl("force 'tolerance' argument to 0", r)))) # no-match due factor force tolerance=0 +test(1613.572, all(all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = FALSE), all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = TRUE))) # x to x with factor equality +test(1613.573, all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = TRUE, tolerance = 1), error = "Factor columns and ignore.row.order cannot be used with non 0 tolerance argument") # error due to provided non zero tolerance test(1613.581, all(all.equal(x, y, ignore.row.order = FALSE, tolerance = 1), all.equal(x, y, ignore.row.order = TRUE, tolerance = 1))) test(1613.582, all(all.equal(x, y, ignore.row.order = FALSE, tolerance = sqrt(.Machine$double.eps)/2), all.equal(x, y, ignore.row.order = TRUE, tolerance = sqrt(.Machine$double.eps)/2)), warning = "Argument 'tolerance' was forced") @@ -11991,7 +11999,7 @@ for (i in 100:1) { # in the process of PR #2573 ## data.table cannot recycle complicated types short_s4_col = getClass("MethodDefinition") -test(1872.01, data.table(a = 1:4, short_s4_col), error = 'problem recycling.*try a simpler type') +test(1872.01, data.table(a = 1:4, short_s4_col), error="attempt to replicate an object of type 'S4'") ## i must be a data.table when on is specified DT = data.table(a = 1:3) test(1872.02, DT[c(TRUE, FALSE), on = 'coefficients'], error = "not a data.table, but 'on'") @@ -13444,7 +13452,7 @@ test(1967.34, data.table(1:5, NULL), data.table(V1=1:5)) ### if (novname[i]) vnames[[i]] = namesi ### but, on pause for now pending #3193 ### test(1967.35, data.table(1:5, matrix(6:15, nrow = 5L)) -test(1967.35, data.table(1:5, integer(0L)), error = 'Item 2 has no length') +test(1967.35, data.table(1:5, integer(0L)), data.table(1:5, NA_integer_), warning="Item 2 has 0 rows but longest item has 5; filled with NA") test(1967.36, data.table(1:5, key = 5L), error = 'must be character') x = data.table(a = 1:5) @@ -14068,10 +14076,9 @@ DT = structure(list(NULL), names="a", class=c("data.table","data.frame")) test(2009.1, DT[a>1], error="Column 1 is NULL; malformed data.table") DT = null.data.table() x = NULL -test(2009.2, DT[, .(x)], ans<-alloc.col(structure(list(x=NULL), names="x", class=c("data.table","data.frame")))) -# postponed to 1.12.4 ... V1=null.data.table()) # because .(x) evaluated to NULL; NULL columns in results removed +test(2009.2, DT[, .(x)], null.data.table()) # because .(x) evaluated to .(NULL); NULL columns in results removed DT = data.table(A=1:3) -test(2009.3, DT[, .(x)], ans) # null.data.table() +test(2009.3, DT[, .(x)], null.data.table()) test(2009.4, DT[, .(x, sum(A))], data.table(V1=6L)) test(2009.5, DT[, .(sum(A), x)], data.table(V1=6L)) test(2009.6, data.table(character(0), NULL), data.table(V1=character())) @@ -14131,7 +14138,7 @@ test(2012.3, data.table(data.frame(), data.frame(a=integer())), data.table(a=int dt = as.data.table(iris) test(2012.4, cbind(data.table(), dt[0]), dt[0]) -# extra validity checks in subsetDT on data.table; #3369 +# extra validity checks in subsetDT on data.table; aside in #3369 DT = structure(list(a=1:3, b=NULL, c=4:6), class=c("data.table","data.frame")) test(2013.1, DT[2], error="Column 2 is NULL; malformed data.table") DT = structure(list(a=1:3, b=data.frame(foo=10:12,bar=13:15), c=4:6), class=c("data.table","data.frame")) @@ -14991,6 +14998,49 @@ ans = data.table(V1=c(1L,1L,1L,1L,2L,2L,2L,2L), key=c("V1","V2","V3")) test(2056, as.data.table(a), ans) +# unpack columns which are data.frame; aside in #3369 too. +DF = structure(list(a=1:3, b=data.frame(foo=4:6, bar=7:9)), row.names=1:3, class="data.frame") +DT = as.data.table(DF) +test(2057.1, ncol(DT), 3L) +test(2057.2, DT[2], data.table(a=2L, foo=5L, bar=8L)) +DF = structure(list(a=list(c("a","b"), c("a","b"), c("a","b")), b=data.frame(foo=1:3, bar=4:6)), row.names=1:3, class="data.frame") +# A list being first is needed to mimic the jsonlite case. With this passing, test.R works from https://github.com/Rdatatable/data.table/issues/3369#issuecomment-462662752 +DT = as.data.table(DF) +test(2057.3, DT, data.table(a=list(c("a","b"), c("a","b"), c("a","b")), foo=1:3, bar=4:6)) + +# one and two+ row cases of data.table, as.data.table and cbind involving list columns, given +# the change to tests 1613.571-3 in PR#3471 in v1.12.4 +# in v1.12.2 and before : +# data.table( data.table(1:2), list(c("a","b"),"a") ) +# V1 V2 NA +# +# 1: 1 a a +# 2: 2 b a +# i.e. passing a data.table() to data.table() changed the meaning of list() which was inconsistent, +# and an NA column name was introduced too (a bug in itself) +# from v1.12.4 : +# V1 V2 +# +# 1: 1 a,b +# 2: 2 a +# i.e. now easier to add the list column as intended, and it's consistent with +# basic (i.e. not cbind-like) usage of data.table() +# # changed in v1.12.4 ? +ans = data.table(V1=1, V2=2) # -------------------- +test(2058.01, data.table( data.table(1), 2), ans) # no +test(2058.02, as.data.table(list(data.table(1), 2)), ans) # no +test(2058.03, cbind(data.table(1), 2), ans) # no +ans = data.table(V1=1, V2=list(2)) # 'basic' usage; i.e. not cbind-like +test(2058.04, sapply(ans, class), c(V1="numeric", V2="list")) # no +test(2058.05, data.table( data.table(1), list(2) ), ans) # yes +test(2058.06, as.data.table(list(data.table(1), list(2))), ans) # yes +test(2058.07, cbind(data.table(1), list(2)), ans) # yes +ans = data.table(V1=1:2, V2=list(c("a","b"),"a")) +test(2058.08, sapply(ans, class), c(V1="integer", V2="list")) # no +test(2058.09, data.table( data.table(1:2), list(c("a","b"),"a") ), ans) # yes +test(2058.10, as.data.table(list(data.table(1:2), list(c("a","b"),"a"))), ans) # yes +test(2058.11, cbind(data.table(1:2), list(c("a","b"),"a")), ans) # yes + ################################### # Add new tests above this line # diff --git a/src/init.c b/src/init.c index 757ff6104..d3d2e6f39 100644 --- a/src/init.c +++ b/src/init.c @@ -26,7 +26,6 @@ SEXP vecseq(); SEXP setlistelt(); SEXP setmutable(); SEXP address(); -SEXP copyNamedInList(); SEXP expandAltRep(); SEXP fmelt(); SEXP fcast(); @@ -111,7 +110,6 @@ R_CallMethodDef callMethods[] = { {"Csetlistelt", (DL_FUNC) &setlistelt, -1}, {"Csetmutable", (DL_FUNC) &setmutable, -1}, {"Caddress", (DL_FUNC) &address, -1}, -{"CcopyNamedInList", (DL_FUNC) ©NamedInList, -1}, {"CexpandAltRep", (DL_FUNC) &expandAltRep, -1}, {"Cfmelt", (DL_FUNC) &fmelt, -1}, {"Cfcast", (DL_FUNC) &fcast, -1}, diff --git a/src/wrappers.c b/src/wrappers.c index 95f2bcd0a..2dc98264a 100644 --- a/src/wrappers.c +++ b/src/wrappers.c @@ -80,32 +80,11 @@ SEXP address(SEXP x) return(mkString(buffer)); } -SEXP copyNamedInList(SEXP x) -{ - // As from R 3.1.0 list() no longer copies NAMED inputs - // Since data.table allows subassignment by reference, we need a way to copy NAMED inputs, still. - // But for many other applications (such as in j and elsewhere internally) the new non-copying list() in R 3.1.0 is very welcome. - - // This is intended to be called just after list(...) in data.table(). It isn't for use on a single data.table, as - // member columns of a list aren't marked as NAMED when the VECSXP is. - - // For now, this makes the old behaviour of list() in R<3.1.0 available for use, where we need it. - - if (TYPEOF(x) != VECSXP) error("x isn't a VECSXP"); - for (int i=0; i