fsetequal handle last col a character (#3219)

Rdatatable · Dec 14, 2018 · af601da · af601da
1 parent 550b8a7
commit af601da
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 8 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -53,6 +53,8 @@
     attr(as.data.table(datasets::BOD)[2],"reference")   # was NULL now "A1.4, p. 270"
     ```
 
+9. `fsetequal` now properly handles datasets whose last column is a character column, closes [#2318](https://github.com/Rdatatable/data.table/issues/2318). Thanks to @pschil and @franknarf1 for reporting.
+
 #### NOTES
 
 1. When data.table first loads it now checks the DLL's MD5. This is to detect installation issues on Windows when you upgrade and i) the DLL is in use by another R session and ii) the CRAN source version > CRAN binary version, which happens just after a new release (R prompts users to install from source until the CRAN binary is available). This situation can lead to a state where the package's new R code calls old C code in the old DLL; [R#17478](https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478), [#3056](https://github.com/Rdatatable/data.table/issues/3056). This broken state can persist until, hopefully, you experience a strange error caused by the mismatch. Otherwise, wrong results may occur silently. This situation applies to any R package with compiled code not just data.table, is Windows-only, and is long-standing. It has only recently been understood as it typically only occurs during the few days after each new release until binaries are available on CRAN. Thanks to Gabor Csardi for the suggestion to use `tools::checkMD5sums()`.

diff --git a/R/setops.R b/R/setops.R
@@ -154,9 +154,7 @@ all.equal.data.table <- function(target, current, trim.levels=TRUE, check.attrib
      paste0(names(targetTypes)[w],"(",paste(targetTypes[w],currentTypes[w],sep="!="),")")
             ,collapse=" ")))
     }
-  }
-
-  if (check.attributes) {
+
     # check key
     k1 = key(target)
     k2 = key(current)
@@ -216,18 +214,28 @@ all.equal.data.table <- function(target, current, trim.levels=TRUE, check.attrib
           return(sprintf("Dataset 'current' has duplicate rows while 'target' doesn't%s", tolerance.msg))
       }
     }
-    jn.on = if (target_dup && current_dup) {
-      target = shallow(target)[, ".seqn" := rowidv(target)]
-      current = shallow(current)[, ".seqn" := rowidv(current)]
-      c(".seqn", setdiff(names(target), ".seqn"))
-    } else names(target)
     # handling 'tolerance' for factor cols - those `msg` will be returned only when equality with tolerance will fail
     if (any(vapply_1b(target,is.factor)) && !identical(tolerance, 0)) {
       if (!identical(tolerance, sqrt(.Machine$double.eps))) # non-default will raise error
         stop("Factor columns and ignore.row.order cannot be used with non 0 tolerance argument")
       msg = c(msg, "Using factor columns together together with ignore.row.order, this force 'tolerance' argument to 0")
       tolerance = 0
     }
+    jn.on = copy(names(target)) # default, possible altered later on
+    char.cols = vapply_1c(target,typeof)=="character"
+    if (!identical(tolerance, 0)) { # handling character columns only for tolerance!=0
+      if (all(char.cols)) {
+        msg = c(msg, "Both datasets have character columns only, together with ignore.row.order this force 'tolerance' argument to 0, for character columns it does not have effect")
+        tolerance = 0
+      } else if (any(char.cols)) { # character col cannot be the last one during rolling join
+        jn.on = jn.on[c(which(char.cols), which(!char.cols))]
+      }
+    }
+    if (target_dup && current_dup) {
+      target = shallow(target)[, ".seqn" := rowidv(target)]
+      current = shallow(current)[, ".seqn" := rowidv(current)]
+      jn.on = c(".seqn", jn.on)
+    }
     # roll join to support 'tolerance' argument, conditional to retain support for factor when tolerance=0
     ans = if (identical(tolerance, 0)) target[current, nomatch=NA, which=TRUE, on=jn.on] else {
       ans1 = target[current, roll=tolerance, rollends=TRUE, which=TRUE, on=jn.on]
@@ -238,6 +246,7 @@ all.equal.data.table <- function(target, current, trim.levels=TRUE, check.attrib
       msg = c(msg, sprintf("Dataset 'current' has rows not present in 'target'%s%s", if (target_dup || current_dup) " or present in different quantity" else "", tolerance.msg))
       return(msg)
     }
+    # rolling join other way around
     ans = if (identical(tolerance, 0)) current[target, nomatch=NA, which=TRUE, on=jn.on] else {
       ans1 = current[target, roll=tolerance, rollends=TRUE, which=TRUE, on=jn.on]
       ans2 = current[target, roll=-tolerance, rollends=TRUE, which=TRUE, on=jn.on]

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -13091,6 +13091,26 @@ test(1968.6, DT[, sum(C,na.rm=TRUE), by=A%%2L], data.table(A=c(1L,0L), V1=c(4294
 DT[2,C:=NA]
 test(1968.7, DT[, sum(C,na.rm=TRUE), by=A%%2L], data.table(A=c(1L,0L), V1=c(4294967294, 0)), warning="coerced to 'numeric'")
 
+# fsetequal and last col a character #2318
+dt.1 <- data.table(Id=(1:10))
+dt.2 <- data.table(Id=(1:10))
+dt.2[1, Id:=99]
+test(1969.1, fsetequal(dt.1, dt.2), FALSE)
+dt.1[, Id := as.character(Id)]
+dt.2[, Id := as.character(Id)]
+test(1969.2, fsetequal(dt.1, dt.2), FALSE)
+x = data.table(v = "foo", a = "my string")
+y = data.table(v = "foo", a = "not my string")
+test(1969.3, fsetequal(x, y), FALSE)
+x = data.table(v = "foo", a = "my string")
+y = data.table(v = "foo", a = "not my string")
+x = rbind(x, x)
+y = rbind(y, y)
+test(1969.4, fsetequal(x, y), FALSE)
+x = rbind(x, y)
+y = rbind(y, x)
+test(1969.5, fsetequal(x, y), FALSE)
+
 
 ###################################
 #  Add new tests above this line  #