Skip to content

Commit

Permalink
int i and nomatch arg (#4353)
Browse files Browse the repository at this point in the history
  • Loading branch information
jangorecki committed Aug 22, 2021
1 parent 1bc9178 commit 30b6f0e
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 42 deletions.
18 changes: 18 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,24 @@
```R
mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl)
```

23. `DT[i, nomatch=NULL]` where `i` contains row numbers now excludes `NA` and any outside the range [1,nrow], [#3109](https://github.com/Rdatatable/data.table/issues/3109) [#3666](https://github.com/Rdatatable/data.table/issues/3666). Before, `NA` rows were returned always for such values; i.e. `nomatch=0|NULL` was ignored. Thanks Michel Lang and Hadley Wickham for the requests, and Jan Gorecki for the PR. Using `nomatch=0` in this case when `i` is row numbers generates the warning `Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)`.

```R
DT = data.table(A=1:3)
DT[c(1L, NA, 3L, 5L)] # default nomatch=NA
# A
# <int>
# 1: 1
# 2: NA
# 3: 3
# 4: NA
DT[c(1L, NA, 3L, 5L), nomatch=NULL]
# A
# <int>
# 1: 1
# 2: 3
```

## BUG FIXES

Expand Down
39 changes: 21 additions & 18 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -221,12 +221,12 @@ replace_dot_alias = function(e) {
# TO DO (document/faq/example). Removed for now ... if ((roll || rolltolast) && missing(mult)) mult="last" # for when there is exact match to mult. This does not control cases where the roll is mult, that is always the last one.
.unsafe.opt() #3585
missingnomatch = missing(nomatch)
if (is.null(nomatch)) nomatch = 0L # allow nomatch=NULL API already now, part of: https://github.com/Rdatatable/data.table/issues/857
if (!is.na(nomatch) && nomatch!=0L) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL)")
nomatch = as.integer(nomatch)
nomatch0 = identical(nomatch,0) || identical(nomatch,0L) # for warning with row-numbers in i; #4353
if (nomatch0) nomatch=NULL # retain nomatch=0 backwards compatibility; #857
if (!is.na(nomatch) && !is.null(nomatch)) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL but please use NULL)")
if (!is.logical(which) || length(which)>1L) stopf("which= must be a logical vector length 1. Either FALSE, TRUE or NA.")
if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which)
if (!is.na(nomatch) && is.na(which)) stopf("which=NA with nomatch=0 would always return an empty vector. Please change or remove either which or nomatch.")
if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. Please change or remove either which or nomatch.")
if (!with && missing(j)) stopf("j must be provided when with=FALSE")
irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency.
notjoin = FALSE
Expand Down Expand Up @@ -313,7 +313,7 @@ replace_dot_alias = function(e) {
stopf(":= with keyby is only possible when i is not supplied since you can't setkey on a subset of rows. Either change keyby to by or remove i")
if (!missingnomatch) {
warningf("nomatch isn't relevant together with :=, ignoring nomatch")
nomatch=0L
nomatch=NULL
}
}
}
Expand Down Expand Up @@ -369,7 +369,7 @@ replace_dot_alias = function(e) {
if (isub %iscall% "!") {
notjoin = TRUE
if (!missingnomatch) stopf("not-join '!' prefix is present on i but nomatch is provided. Please remove nomatch.");
nomatch = 0L
nomatch = NULL
isub = isub[[2L]]
# #932 related so that !(v1 == 1) becomes v1 == 1 instead of (v1 == 1) after removing "!"
if (isub %iscall% "(" && !is.name(isub[[2L]]))
Expand All @@ -389,7 +389,7 @@ replace_dot_alias = function(e) {
on = o$on
## the following two are ignored if i is not a data.table.
## Since we are converting i to data.table, it is important to set them properly.
nomatch = 0L
nomatch = NULL
mult = "all"
}
else if (!is.name(isub)) {
Expand Down Expand Up @@ -509,8 +509,7 @@ replace_dot_alias = function(e) {
len__ = ans$lens
allGrp1 = all(ops==1L) # was previously 'ans$allGrp1'. Fixing #1991. TODO: Revisit about allGrp1 possibility for speedups in certain cases when I find some time.
indices__ = if (length(ans$indices)) ans$indices else seq_along(f__) # also for #1991 fix
# length of input nomatch (single 0 or NA) is 1 in both cases.
# When no match, len__ is 0 for nomatch=0 and 1 for nomatch=NA, so len__ isn't .N
# When no match, len__ is 0 for nomatch=NULL and 1 for nomatch=NA, so len__ isn't .N
# If using secondary key of x, f__ will refer to xo
if (is.na(which)) {
w = if (notjoin) f__!=0L else is.na(f__)
Expand All @@ -533,7 +532,7 @@ replace_dot_alias = function(e) {
# Fix for #1092 and #1074
# TODO: implement better version of "any"/"all"/"which" to avoid
# unnecessary construction of logical vectors
if (identical(nomatch, 0L) && allLen1) irows = irows[irows != 0L]
if (is.null(nomatch) && allLen1) irows = irows[irows != 0L]
} else {
if (length(xo) && missing(on))
stopf("Internal error. Cannot by=.EACHI when joining to an index, yet") # nocov
Expand All @@ -553,10 +552,10 @@ replace_dot_alias = function(e) {
} else {
if (!byjoin) { #1287 and #1271
irows = f__ # len__ is set to 1 as well, no need for 'pmin' logic
if (identical(nomatch,0L)) irows = irows[len__>0L] # 0s are len 0, so this removes -1 irows
if (is.null(nomatch)) irows = irows[len__>0L] # 0s are len 0, so this removes -1 irows
}
# TODO: when nomatch=NA, len__ need not be allocated / set at all for mult="first"/"last"?
# TODO: how about when nomatch=0L, can we avoid allocating then as well?
# TODO: how about when nomatch=NULL, can we avoid allocating then as well?
}
if (length(xo) && length(irows)) {
irows = xo[irows] # TO DO: fsort here?
Expand Down Expand Up @@ -619,16 +618,20 @@ replace_dot_alias = function(e) {
else stopf("i evaluates to a logical vector length %d but there are %d rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.", length(i), nrow(x))
} else {
irows = as.integer(i) # e.g. DT[c(1,3)] and DT[c(-1,-3)] ok but not DT[c(1,-3)] (caught as error)
irows = .Call(CconvertNegAndZeroIdx, irows, nrow(x), is.null(jsub) || root!=":=") # last argument is allowOverMax (NA when selecting, error when assigning)
# simplifies logic from here on: can assume positive subscripts (no zeros)
if (nomatch0) warning("Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)")
# warning only for this case where nomatch was ignored before v1.14.2; #3109
irows = .Call(CconvertNegAndZeroIdx, irows, nrow(x),
is.null(jsub) || root!=":=", # allowOverMax (NA when selecting, error when assigning)
!is.null(nomatch)) # allowNA=false when nomatch=NULL #3109, true when nomatch=NA #3666
# simplifies logic from here on: can assume positive subscripts (no zeros), for nomatch=NULL also no NAs
# maintains Arun's fix for #2697 (test 1042)
# efficient in C with more detailed helpful messages when user mixes positives and negatives
# falls through quickly (no R level allocs) if all items are within range [1,max] with no zeros or negatives
# minor TO DO: can we merge this with check_idx in fcast.c/subset ?
}
}
if (notjoin) {
if (byjoin || !is.integer(irows) || is.na(nomatch)) stopf("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov
if (byjoin || !is.integer(irows) || !is.null(nomatch)) stopf("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov
irows = irows[irows!=0L]
if (verbose) {last.started.at=proc.time();catf("Inverting irows for notjoin done in ... ");flush.console()}
i = irows = if (length(irows)) seq_len(nrow(x))[-irows] else NULL # NULL meaning all rows i.e. seq_len(nrow(x))
Expand Down Expand Up @@ -843,7 +846,7 @@ replace_dot_alias = function(e) {
# in 1.8.3, but this failed test 876.
# TO DO: Add a test like X[i,sum(v),by=i.x2], or where by includes a join column (both where some i don't match).
# TO DO: Make xss directly, rather than recursive call.
if (!is.na(nomatch)) irows = irows[irows!=0L] # TO DO: can be removed now we have CisSortedSubset
if (is.null(nomatch)) irows = irows[irows!=0L] # TO DO: can be removed now we have CisSortedSubset
if (length(allbyvars)) { ############### TO DO TO DO TO DO ###############
if (verbose) catf("i clause present and columns used in by detected, only these subset: %s\n", brackify(allbyvars))
xss = `[.data.table`(x,irows,allbyvars,with=FALSE,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends)
Expand Down Expand Up @@ -1290,7 +1293,7 @@ replace_dot_alias = function(e) {
}
ans = vector("list", length(ansvars))
ii = rep.int(indices__, len__) # following #1991 fix
# TODO: if (allLen1 && allGrp1 && (is.na(nomatch) || !any(f__==0L))) then ii will be 1:nrow(i) [nomatch=0 should drop rows in i that have no match]
# TODO: if (allLen1 && allGrp1 && (!is.null(nomatch) || !any(f__==0L))) then ii will be 1:nrow(i) [nomatch=NULL should drop rows in i that have no match]
# But rather than that complex logic here at R level to catch that and do a shallow copy for efficiency, just do the check inside CsubsetDT
# to see if it passed 1:nrow(x) and then CsubsetDT should do the shallow copy safely and centrally.
# That R level branch was taken out in PR #3213
Expand Down Expand Up @@ -1721,7 +1724,7 @@ replace_dot_alias = function(e) {
}
dotN = function(x) is.name(x) && x==".N" # For #334. TODO: Rprof() showed dotN() may be the culprit if iterated (#1470)?; avoid the == which converts each x to character?
# FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with
# nomatch=0L even now.. but not switching it on yet, will deal it separately.
# nomatch=NULL even now.. but not switching it on yet, will deal it separately.
if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) {
if (!length(ansvars) && !use.I) {
GForce = FALSE
Expand Down
32 changes: 31 additions & 1 deletion inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -2293,7 +2293,8 @@ test(807, DT[!c("b","foo","c"),nomatch=0], error="not-join.*prefix is present on
test(808, DT[c("b","foo","c"),which=TRUE,nomatch=NA], INT(3:4,NA,5:6))
test(809, DT[c("b","foo","c"),which=TRUE,nomatch=0], INT(3:4,5:6))
test(810, DT[c("b","foo","c"),which=NA,nomatch=NA], 2L)
test(811, DT[c("b","foo","c"),which=NA,nomatch=0], error="which=NA with nomatch=0 would always return an empty vector[.] Please change or remove either which or nomatch")
test(811.1, DT[c("b","foo","c"),which=NA,nomatch=0], error="which=NA with nomatch=0|NULL would always return an empty vector[.] Please change or remove either which or nomatch")
test(811.2, DT[c("b","foo","c"),which=NA,nomatch=NULL], error="which=NA with nomatch=0|NULL would always return an empty vector[.] Please change or remove either which or nomatch")

# New notj for column names and positions when with=FALSE, #1384
DT = data.table(a=1:3,b=4:6,c=7:9)
Expand Down Expand Up @@ -17979,3 +17980,32 @@ test(2209.3, fread('A|B,C\n1|+,4\n2|-,5\n3|-,6\n', dec=','), data.table(A=1:3, '
test(2209.4, fread('A|B|C\n1|0,4|a\n2|0,5|b\n', dec=','), data.table(A=1:2, B=c(0.4,0.5), C=c("a","b"))) # ok before
test(2209.5, fread('A|B,C\n1|0,4\n2|0,5\n', dec=','), data.table(A=1:2, "B,C"=c(0.4,0.5)))

# respect `nomatch=NULL` for integer i, #3109 #3666
DT = data.table(x = 1:4)
test(2210.01, DT[c(1L, 5L, NA_integer_)], data.table(x=c(1L,NA_integer_,NA_integer_))) # default nomatch=NA
test(2210.02, DT[c(1L, 5L, NA_integer_), nomatch=NULL], data.table(x = 1L))
test(2210.03, DT[c(1L, 5L, NA_integer_), nomatch=0], data.table(x = 1L), warning="Please use nomatch=NULL")
test(2210.04, DT[c(1L, 5L, NA_integer_), x, nomatch=NULL], 1L)
test(2210.05, DT[c(1L, 5L, NA_integer_), x, nomatch=0], 1L, warning="Please use nomatch=NULL")
test(2210.06, DT[c(1:4,1:4), nomatch=NULL], data.table(x=c(1:4,1:4))) # early stopping convertNegAndZeroIdx
test(2210.07, DT[c(1:4,-1L), nomatch=NULL], error="Cannot mix positives and negatives")
test(2210.08, DT[c(1:4,NA_integer_,-1L), nomatch=NULL], error="Cannot mix positives and negatives")
test(2210.09, DT[c(1:4,NA_integer_,-1L), nomatch=0], error="Cannot mix positives and negatives", warning="Please use nomatch=NULL")
test(2210.10, DT[c(-1L,NA_integer_), nomatch=NULL], error="Cannot mix negatives and NA")
test(2210.11, DT[c(-1L,NA_integer_), nomatch=0], error="Cannot mix negatives and NA", warning="Please use nomatch=NULL")
test(2210.12, DT[NA, nomatch=NULL], data.table(x=integer()))
test(2210.13, DT[NA, nomatch=0], data.table(x=integer()), warning="Please use nomatch=NULL")
test(2210.14, DT[0L, nomatch=NULL], data.table(x=integer()))
test(2210.15, DT[0L, nomatch=0], data.table(x=integer()), warning="Please use nomatch=NULL")
test(2210.16, DT[0:1, nomatch=NULL], data.table(x=1L))
test(2210.17, DT[0:1, nomatch=0], data.table(x=1L), warning="Please use nomatch=NULL")
test(2210.18, DT[-1L, nomatch=NULL], data.table(x=2:4))
test(2210.19, DT[-1L, nomatch=0], data.table(x=2:4), warning="Please use nomatch=NULL")
test(2210.20, data.table()[1L, nomatch=NULL], data.table())
test(2210.21, data.table()[1L, nomatch=0], data.table(), warning="Please use nomatch=NULL")
test(2210.22, data.table()[-1L, nomatch=NULL], data.table(), warning="there are only 0 rows")
test(2210.23, data.table()[-1L, nomatch=0], data.table(), warning=c("Please use nomatch=NULL","there are only 0 rows"))
test(2210.24, DT[-c(1L,0L)], data.table(x=2:4)) # codecov gap, not related to nomatch
test(2210.25, DT[-c(1L,0L), nomatch=NULL], data.table(x=2:4))
test(2210.26, DT[-c(1L,0L), nomatch=0], data.table(x=2:4), warning="Please use nomatch=NULL")

3 changes: 1 addition & 2 deletions src/bmerge.c
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,7 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP
error(_("rollends must be a length 2 logical vector"));
rollends = LOGICAL(rollendsArg);

// nomatch arg
nomatch = INTEGER(nomatchArg)[0];
nomatch = isNull(nomatchArg) ? 0 : INTEGER(nomatchArg)[0];

// mult arg
if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) mult = ALL;
Expand Down
6 changes: 3 additions & 3 deletions src/nqrecreateindices.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ SEXP nqRecreateIndices(SEXP xo, SEXP len, SEXP indices, SEXP nArg, SEXP nomatch)
const int *iindices = INTEGER(indices);
const int *ilen = INTEGER(len);
const int *ixo = INTEGER(xo);
const int *inomatch = INTEGER(nomatch);
const int inomatch = isNull(nomatch) ? 0 : INTEGER(nomatch)[0];
int *inewstarts = INTEGER(newstarts);

for (int i=0; i<n; ++i) inewlen[i] = 0;
Expand All @@ -28,8 +28,8 @@ SEXP nqRecreateIndices(SEXP xo, SEXP len, SEXP indices, SEXP nArg, SEXP nomatch)
for (int i=0; i<n; ++i) {
if (ixo[j] <= 0 || j >= xn) {
// NA_integer_ = INT_MIN is checked in init.c
// j >= xn needed for special nomatch=0L case, see issue#4388 (due to xo[irows] from R removing '0' value in xo)
inewstarts[i] = inomatch[0];
// j >= xn needed for special nomatch=NULL case, see issue#4388 (due to xo[irows] from R removing '0' value in xo)
inewstarts[i] = inomatch;
j++; // newlen will be 1 for xo=NA and 0 for xo=0 .. but we need to increment by 1 for both
} else {
inewstarts[i] = tmp+1;
Expand Down

0 comments on commit 30b6f0e

Please sign in to comment.