From d1c9d89b25c5785eb7d27facaec1526d1b0b35da Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 23 Jul 2019 19:14:06 +0800 Subject: [PATCH 1/4] Closes #3723 and #1459 -- use proper NA for integer64 on assign --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 8 ++++++++ src/assign.c | 10 +++++++++- src/data.table.h | 1 + 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 411a58985..0d2db0b3e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -198,6 +198,8 @@ 24. `column not found` could incorrectly occur in rare non-equi-join cases, [#3635](https://github.com/Rdatatable/data.table/issues/3635). Thanks to @UweBlock for the report. +25. `integer64` defined on a subset of a new column would leave "gibberish" on the remaining rows, [#3723](https://github.com/Rdatatable/data.table/issues/3723). A bug in `rbindlist` with the same root cause was also fixed, [#1459](https://github.com/Rdatatable/data.table/issues/1459). Thanks @shrektan and @jangorecki for the reports. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3a07b0fd0..8552f1a43 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15428,6 +15428,14 @@ test(2071.10, dcast(data.table(a=1, b=1, l=list(list(1))), a ~ b, value.var='l') test(2071.11, dcast(data.table(a = 1, b = 2, c = 3), a ~ b, value.var = 'c', fill = '2'), data.table(a=1, `2`=3, key='a')) +# partial instantiation of integer64 column was creating NA_REAL, not INT64_MIN +if (test_bit64) { + # sub-assign from #3723 + test(2072.1, data.table(x=1:2)[1, y := bit64::as.integer64(0L)]$y, as.integer64(c(0L, NA))) + # rbindlist(fill=TRUE) from #1459 + test(2072.2, rbind(data.table(a=1:2, b=as.integer64(c(1,NA))), data.table(a=3L), fill=TRUE)$b, as.integer64(c(1, NA, NA))) +} + ################################### # Add new tests above this line # ################################### diff --git a/src/assign.c b/src/assign.c index 01a6147ec..6f0a94d62 100644 --- a/src/assign.c +++ b/src/assign.c @@ -489,7 +489,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) SEXP RHS; if (coln+1 > oldncol) { // new column - SET_VECTOR_ELT(dt, coln, newcol=allocNAVector(TYPEOF(thisvalue), nrow)); + SET_VECTOR_ELT(dt, coln, newcol=allocNAVectorLike(thisvalue, nrow)); // initialize with NAs for when 'rows' is a subset and it doesn't touch // do not try to save the time to NA fill (contiguous branch free assign anyway) since being // sure all items will be written to (isNull(rows), length(rows), vlen<1, targetlen) is not worth the risk. @@ -1040,6 +1040,14 @@ SEXP allocNAVector(SEXPTYPE type, R_len_t n) UNPROTECT(1); return(v); } +// #3723 -- build NA vector like x -- mainly used to copy attributes pass this to writeNA +SEXP allocNAVectorLike(SEXP x, R_len_t n) { + SEXP v = PROTECT(allocVector(TYPEOF(x), n)); + copyMostAttrib(x, v); + writeNA(v, 0, n); + UNPROTECT(1); + return(v); +} static SEXP *saveds=NULL; static R_len_t *savedtl=NULL, nalloc=0, nsaved=0; diff --git a/src/data.table.h b/src/data.table.h index 35e17e487..dd2364383 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -106,6 +106,7 @@ SEXP SelfRefSymbol; // assign.c SEXP allocNAVector(SEXPTYPE type, R_len_t n); +SEXP allocNAVectorLike(SEXP x, R_len_t n); void writeNA(SEXP v, const int from, const int n); void savetl_init(), savetl(SEXP s), savetl_end(); int checkOverAlloc(SEXP x); From 68a11a8b977ae441b52bdafa710b947d70ebee97 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 13 Aug 2019 10:45:48 +0800 Subject: [PATCH 2/4] small test fix --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e11b2b1c1..cc2839a61 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15650,7 +15650,7 @@ test(2074.41, fread('a\n1', na.strings='9', verbose=TRUE), output='One or more o # partial instantiation of integer64 column was creating NA_REAL, not INT64_MIN if (test_bit64) { # sub-assign from #3723 - test(2072.1, data.table(x=1:2)[1, y := bit64::as.integer64(0L)]$y, as.integer64(c(0L, NA))) + test(2072.1, data.table(x=1:2)[1, y := as.integer64(0L)]$y, as.integer64(c(0L, NA))) # rbindlist(fill=TRUE) from #1459 test(2072.2, rbind(data.table(a=1:2, b=as.integer64(c(1,NA))), data.table(a=3L), fill=TRUE)$b, as.integer64(c(1, NA, NA))) } From b8d23c7ee15ff4a485caf3c941a4663ce7d86149 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 23 Aug 2019 17:51:13 -0700 Subject: [PATCH 3/4] use allocVectorNALike in dogroups.c, and use new Rinherits in writeNA for nanotime --- src/assign.c | 6 ++++-- src/dogroups.c | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/assign.c b/src/assign.c index 6f0a94d62..30c87b91f 100644 --- a/src/assign.c +++ b/src/assign.c @@ -1003,7 +1003,7 @@ void writeNA(SEXP v, const int from, const int n) for (int i=from; i<=to; ++i) vd[i] = NA_INTEGER; } break; case REALSXP : { - if (INHERITS(v, char_integer64)) { + if (Rinherits(v, char_integer64)) { // Rinherits covers nanotime too which inherits from integer64 via S4 extends int64_t *vd = (int64_t *)REAL(v); for (int i=from; i<=to; ++i) vd[i] = INT64_MIN; } else { @@ -1040,8 +1040,10 @@ SEXP allocNAVector(SEXPTYPE type, R_len_t n) UNPROTECT(1); return(v); } -// #3723 -- build NA vector like x -- mainly used to copy attributes pass this to writeNA + SEXP allocNAVectorLike(SEXP x, R_len_t n) { + // writeNA needs the attribute retained to write NA_INTEGER64, #3723 + // TODO: remove allocNAVector above when usage in fastmean.c, fcast.c and fmelt.c can be adjusted SEXP v = PROTECT(allocVector(TYPEOF(x), n)); copyMostAttrib(x, v); writeNA(v, 0, n); diff --git a/src/dogroups.c b/src/dogroups.c index 2b7fa7c66..0e88fb5b7 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -305,13 +305,13 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX if (isNull(target)) { // first time adding to new column if (TRUELENGTH(dt) < INTEGER(lhs)[j]) error("Internal error: Trying to add new column by reference but tl is full; alloc.col should have run first at R level before getting to this point in dogroups"); // # nocov - tmp = PROTECT(allocNAVector(TYPEOF(RHS), LENGTH(VECTOR_ELT(dt,0)))); + tmp = PROTECT(allocNAVectorLike(RHS, LENGTH(VECTOR_ELT(dt,0)))); // increment length only if the allocation passes, #1676 SETLENGTH(dtnames, LENGTH(dtnames)+1); SETLENGTH(dt, LENGTH(dt)+1); SET_VECTOR_ELT(dt, INTEGER(lhs)[j]-1, tmp); UNPROTECT(1); - // Even if we could know reliably to switch from allocNAVector to allocVector for slight speedup, user code could still contain a switched halt, and in that case we'd want the groups not yet done to have NA rather than uninitialized or 0. + // Even if we could know reliably to switch from allocNAVectorLike to allocVector for slight speedup, user code could still contain a switched halt, and in that case we'd want the groups not yet done to have NA rather than uninitialized or 0. // dtnames = getAttrib(dt, R_NamesSymbol); // commented this here and added it on the beginning to fix #4990 SET_STRING_ELT(dtnames, INTEGER(lhs)[j]-1, STRING_ELT(newnames, INTEGER(lhs)[j]-origncol-1)); target = VECTOR_ELT(dt,INTEGER(lhs)[j]-1); From a88f37397c3999368f6cbf41650d28b3ca37327e Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 23 Aug 2019 17:55:01 -0700 Subject: [PATCH 4/4] added PR number to comment --- src/assign.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/assign.c b/src/assign.c index 30c87b91f..ac56f1759 100644 --- a/src/assign.c +++ b/src/assign.c @@ -1043,7 +1043,7 @@ SEXP allocNAVector(SEXPTYPE type, R_len_t n) SEXP allocNAVectorLike(SEXP x, R_len_t n) { // writeNA needs the attribute retained to write NA_INTEGER64, #3723 - // TODO: remove allocNAVector above when usage in fastmean.c, fcast.c and fmelt.c can be adjusted + // TODO: remove allocNAVector above when usage in fastmean.c, fcast.c and fmelt.c can be adjusted; see comments in PR3724 SEXP v = PROTECT(allocVector(TYPEOF(x), n)); copyMostAttrib(x, v); writeNA(v, 0, n);