Skip to content

Commit

Permalink
fsetequal handle last col a character (#3219)
Browse files Browse the repository at this point in the history
  • Loading branch information
jangorecki authored and mattdowle committed Dec 14, 2018
1 parent 550b8a7 commit af601da
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 8 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Expand Up @@ -53,6 +53,8 @@
attr(as.data.table(datasets::BOD)[2],"reference") # was NULL now "A1.4, p. 270"
```

9. `fsetequal` now properly handles datasets whose last column is a character column, closes [#2318](https://github.com/Rdatatable/data.table/issues/2318). Thanks to @pschil and @franknarf1 for reporting.

#### NOTES

1. When data.table first loads it now checks the DLL's MD5. This is to detect installation issues on Windows when you upgrade and i) the DLL is in use by another R session and ii) the CRAN source version > CRAN binary version, which happens just after a new release (R prompts users to install from source until the CRAN binary is available). This situation can lead to a state where the package's new R code calls old C code in the old DLL; [R#17478](https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478), [#3056](https://github.com/Rdatatable/data.table/issues/3056). This broken state can persist until, hopefully, you experience a strange error caused by the mismatch. Otherwise, wrong results may occur silently. This situation applies to any R package with compiled code, not just data.table, is Windows-only, and is long-standing. It has only recently been understood as it typically only occurs during the few days after each new release until binaries are available on CRAN. Thanks to Gabor Csardi for the suggestion to use `tools::checkMD5sums()`.
Expand Down
25 changes: 17 additions & 8 deletions R/setops.R
Expand Up @@ -154,9 +154,7 @@ all.equal.data.table <- function(target, current, trim.levels=TRUE, check.attrib
paste0(names(targetTypes)[w],"(",paste(targetTypes[w],currentTypes[w],sep="!="),")")
,collapse=" ")))
}
}

if (check.attributes) {

# check key
k1 = key(target)
k2 = key(current)
Expand Down Expand Up @@ -216,18 +214,28 @@ all.equal.data.table <- function(target, current, trim.levels=TRUE, check.attrib
return(sprintf("Dataset 'current' has duplicate rows while 'target' doesn't%s", tolerance.msg))
}
}
jn.on = if (target_dup && current_dup) {
target = shallow(target)[, ".seqn" := rowidv(target)]
current = shallow(current)[, ".seqn" := rowidv(current)]
c(".seqn", setdiff(names(target), ".seqn"))
} else names(target)
# handling 'tolerance' for factor cols - those `msg` will be returned only when equality with tolerance will fail
if (any(vapply_1b(target,is.factor)) && !identical(tolerance, 0)) {
if (!identical(tolerance, sqrt(.Machine$double.eps))) # non-default will raise error
stop("Factor columns and ignore.row.order cannot be used with non 0 tolerance argument")
msg = c(msg, "Using factor columns together together with ignore.row.order, this force 'tolerance' argument to 0")
tolerance = 0
}
jn.on = copy(names(target)) # default, possible altered later on
char.cols = vapply_1c(target,typeof)=="character"
if (!identical(tolerance, 0)) { # handling character columns only for tolerance!=0
if (all(char.cols)) {
msg = c(msg, "Both datasets have character columns only, together with ignore.row.order this force 'tolerance' argument to 0, for character columns it does not have effect")
tolerance = 0
} else if (any(char.cols)) { # character col cannot be the last one during rolling join
jn.on = jn.on[c(which(char.cols), which(!char.cols))]
}
}
if (target_dup && current_dup) {
target = shallow(target)[, ".seqn" := rowidv(target)]
current = shallow(current)[, ".seqn" := rowidv(current)]
jn.on = c(".seqn", jn.on)
}
# roll join to support 'tolerance' argument, conditional to retain support for factor when tolerance=0
ans = if (identical(tolerance, 0)) target[current, nomatch=NA, which=TRUE, on=jn.on] else {
ans1 = target[current, roll=tolerance, rollends=TRUE, which=TRUE, on=jn.on]
Expand All @@ -238,6 +246,7 @@ all.equal.data.table <- function(target, current, trim.levels=TRUE, check.attrib
msg = c(msg, sprintf("Dataset 'current' has rows not present in 'target'%s%s", if (target_dup || current_dup) " or present in different quantity" else "", tolerance.msg))
return(msg)
}
# rolling join other way around
ans = if (identical(tolerance, 0)) current[target, nomatch=NA, which=TRUE, on=jn.on] else {
ans1 = current[target, roll=tolerance, rollends=TRUE, which=TRUE, on=jn.on]
ans2 = current[target, roll=-tolerance, rollends=TRUE, which=TRUE, on=jn.on]
Expand Down
20 changes: 20 additions & 0 deletions inst/tests/tests.Rraw
Expand Up @@ -13091,6 +13091,26 @@ test(1968.6, DT[, sum(C,na.rm=TRUE), by=A%%2L], data.table(A=c(1L,0L), V1=c(4294
DT[2,C:=NA]
test(1968.7, DT[, sum(C,na.rm=TRUE), by=A%%2L], data.table(A=c(1L,0L), V1=c(4294967294, 0)), warning="coerced to 'numeric'")

# fsetequal and last col a character #2318
# Each case below builds two tables that differ and expects fsetequal() to return FALSE.
# 1969.1: single column built from 1:10; dt.2 differs from dt.1 only in its first row
dt.1 <- data.table(Id=(1:10))
dt.2 <- data.table(Id=(1:10))
dt.2[1, Id:=99]
test(1969.1, fsetequal(dt.1, dt.2), FALSE)
# 1969.2: same tables after converting the only column to character in place
dt.1[, Id := as.character(Id)]
dt.2[, Id := as.character(Id)]
test(1969.2, fsetequal(dt.1, dt.2), FALSE)
# 1969.3: two character columns, one row each; the tables differ only in the last column 'a'
x = data.table(v = "foo", a = "my string")
y = data.table(v = "foo", a = "not my string")
test(1969.3, fsetequal(x, y), FALSE)
# 1969.4: same one-row tables, each stacked on itself so both inputs contain duplicate rows
x = data.table(v = "foo", a = "my string")
y = data.table(v = "foo", a = "not my string")
x = rbind(x, x)
y = rbind(y, y)
test(1969.4, fsetequal(x, y), FALSE)
# 1969.5: x becomes 4 rows (x then y); y is then bound with the already-updated x, giving 6 rows,
# so both tables hold the same two distinct rows but in different quantities
x = rbind(x, y)
y = rbind(y, x)
test(1969.5, fsetequal(x, y), FALSE)


###################################
# Add new tests above this line #
Expand Down

0 comments on commit af601da

Please sign in to comment.