
Commit

Merge branch 'master' into fread-date-idate
MichaelChirico authored Jun 1, 2024
2 parents 78b9066 + 35f2979 commit c8febb9
Showing 4 changed files with 64 additions and 41 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/R-CMD-check-occasional.yaml
@@ -1,6 +1,6 @@
on:
schedule:
- cron: '17 13 23 * *' # 22nd of month at 13:17 UTC
- cron: '17 13 28 * *' # 28th of month at 13:17 UTC

# A more complete suite of checks to run monthly; each PR/merge need not pass all these, but they should pass before CRAN release
name: R-CMD-check-occasional
95 changes: 57 additions & 38 deletions inst/tests/tests.Rraw
@@ -207,6 +207,11 @@ with_c_collate = function(expr) {
expr
}

# strrep is used many times in tests, but is from R 3.3.0, so use this equivalent if it is missing.
if (!exists("strrep", "package:base")) {
strrep = function(x, times) mapply(function(x, times) paste(rep(x, times), collapse=""), rep_len(x, length(times)), times, USE.NAMES=FALSE)
}
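
# A minimal sanity sketch of the fallback's behaviour (it should match base::strrep,
# which exists from R 3.3.0; the inputs here are illustrative only):
#   x = c("ab", "cd")
#   strrep(x, c(2L, 3L))   # "abab" "cdcdcd"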

##########################
.do_not_rm = ls() # objects that exist at this point should not be removed by rm_all(); e.g. test_*, base_messages, Ctest_dt_win_snprintf, prevtest, etc
##########################
@@ -18428,44 +18433,58 @@ test(2252.2, dt[, let(b=2L)], error = "\\[ was called on a data.table.*not data.
rm(.datatable.aware)

# tests for trunc.char handling wide characters # 5096
accented_a = "\u0061\u0301"
ja_ichi = "\u4E00"
ja_ni = "\u4E8C"
ja_ko = "\u3053"
ja_n = "\u3093"
dots = "..."
clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output
# Tests for combining character latin a and acute accent, single row
DT = data.table(strrep(accented_a, 4L))
test(2253.01, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(accented_a, 4L))
test(2253.02, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(accented_a, 3L), dots))
test(2253.03, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(accented_a, 1L), dots))
# Tests for full-width japanese character ichi, single row
DT = data.table(strrep(ja_ichi, 4L))
test(2253.04, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(ja_ichi, 4L))
test(2253.05, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(ja_ichi, 3L), dots))
test(2253.06, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(ja_ichi, 1L), dots))
# Tests for multiple, different length combining character rows
DT = data.table(strrep(accented_a, 1L:4L))
test(2253.07, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "áááá"))
test(2253.08, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "ááá..."))
test(2253.09, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "á...", "á...", "á..."))
# Tests for multiple, different length full-width characters
DT = data.table(strrep(ja_ichi, 1L:4L))
test(2253.10, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一一"))
test(2253.11, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一..."))
test(2253.12, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一...", "一...", "一..."))
# Tests for combined characters, multiple columns
DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")
test(2253.13, options=list(datatable.prettyprint.char = 4L), capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa")
test(2253.14, options=list(datatable.prettyprint.char = 3L), capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa")
test(2253.15, options=list(datatable.prettyprint.char = 2L), capture.output(print(DT))[-1L], "1: 一 二二 ここ... áá aa...")
test(2253.16, options=list(datatable.prettyprint.char = 1L), capture.output(print(DT))[-1L], "1: 一 二... こ... á... a...")
# Tests for multiple columns, multiple rows
DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3))
test(2253.17, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんんん ááá"))
test(2253.18, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんん... ááá"))
test(2253.19, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ ん... á...", "こ... ん... á...", "こ... ん... á..."))
local({
lc_ctype = Sys.getlocale('LC_CTYPE')
Sys.setlocale('LC_CTYPE', "en_US.UTF-8") # Japanese multibyte characters require utf8
on.exit({Sys.setlocale('LC_CTYPE', lc_ctype)})
accented_a = "\u0061\u0301"
ja_ichi = "\u4E00"
ja_ni = "\u4E8C"
ja_ko = "\u3053"
ja_n = "\u3093"
dots = "..."
clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output
# Tests for combining character latin a and acute accent, single row
DT = data.table(strrep(accented_a, 4L))
test(2253.01, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(accented_a, 4L))
test(2253.02, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(accented_a, 3L), dots))
test(2253.03, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(accented_a, 1L), dots))
# Tests for full-width japanese character ichi, single row
DT = data.table(strrep(ja_ichi, 4L))
test(2253.04, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(ja_ichi, 4L))
test(2253.05, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(ja_ichi, 3L), dots))
test(2253.06, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(ja_ichi, 1L), dots))
# Tests for multiple, different length combining character rows
DT = data.table(strrep(accented_a, 1L:4L))
test(2253.07, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(accented_a, 1:4L))
test(2253.08, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(accented_a, 1:3), paste0(strrep(accented_a, 3L), dots)))
test(2253.09, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(accented_a, rep(paste0(accented_a, dots), 3L)))
# Tests for multiple, different length full-width characters
DT = data.table(strrep(ja_ichi, 1L:4L))
test(2253.10, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(ja_ichi, 1:4L))
test(2253.11, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(ja_ichi, 1:3), paste0(strrep(ja_ichi, 3L), dots)))
test(2253.12, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(ja_ichi, rep(paste0(ja_ichi, dots), 3L)))
# Tests for combined characters, multiple columns
DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")
test(2253.13, options=list(datatable.prettyprint.char = 4L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa"))
test(2253.14, options=list(datatable.prettyprint.char = 3L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa"))
test(2253.15, options=list(datatable.prettyprint.char = 2L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2), paste0(strrep(ja_ko, 2), dots) , strrep(accented_a, 2), "aa..."))
test(2253.16, options=list(datatable.prettyprint.char = 1L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, paste0(ja_ni, dots), paste0(ja_ko, dots), paste0(accented_a, dots), "a..."))
# Tests for multiple columns, multiple rows
DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3))
test(2253.17, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]),
c(paste0(ja_ko, " ", strrep(ja_n, 2L), " ", strrep(accented_a, 3L)),
paste0(strrep(ja_ko, 2L), " ", strrep(ja_n, 3L), " ", strrep(accented_a, 3L)),
paste(strrep(ja_ko, 3L), strrep(ja_n, 4L), strrep(accented_a, 3L))))
test(2253.18, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]),
c(paste0(ja_ko, " ", strrep(ja_n, 2L), " ", strrep(accented_a, 3L)),
paste0(strrep(ja_ko, 2L), " ", strrep(ja_n, 3L), " ", strrep(accented_a, 3L)),
paste(strrep(ja_ko, 3L), paste0(strrep(ja_n, 3L), dots), strrep(accented_a, 3L))))
test(2253.19, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]),
c(paste0(ja_ko, " ", paste0(ja_n, dots), " ", paste0(accented_a, dots)),
paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" "),
paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" ")))
})
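
A minimal sketch of the behaviour these tests exercise, assuming an interactive session (the option truncates printed character cells by character count and appends "..."):

library(data.table)
DT = data.table(x = c("abcdef", strrep("\u4E00", 4L)))
options(datatable.prettyprint.char = 3L)
print(DT)   # each cell truncated to 3 characters plus "...": "abc...", "一一一..."
options(datatable.prettyprint.char = NULL)   # unset to restore the default (no truncation)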

# allow 1-D matrix in j for consistency, #783
DT=data.table(a = rep(1:2, 3), b = 1:6)
2 changes: 1 addition & 1 deletion man/fread.Rd
@@ -52,7 +52,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC"
\item{check.names}{default is \code{FALSE}. If \code{TRUE} then the names of the variables in the \code{data.table} are checked to ensure that they are syntactically valid variable names. If necessary they are adjusted (by \code{\link{make.names}}) so that they are, and also to ensure that there are no duplicates.}
\item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}. Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. }
\item{quote}{ By default (\code{"\""}), if a field starts with a double quote, \code{fread} handles embedded quotes robustly as explained under \code{Details}. If it fails, then another attempt is made to read the field \emph{as is}, i.e., as if quotes are disabled. By setting \code{quote=""}, the field is always read as if quotes are disabled. It is not expected to ever need to pass anything other than \"\" to quote; i.e., to turn it off. }
\item{strip.white}{ default is \code{TRUE}. Strips leading and trailing whitespaces of unquoted fields. If \code{FALSE}, only header trailing spaces are removed. }
\item{strip.white}{ Logical, default \code{TRUE}, in which case leading and trailing whitespace is stripped from unquoted \code{"character"} fields. \code{"numeric"} fields are always stripped of leading and trailing whitespace.}
\item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. }
\item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.}
\item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
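
A minimal usage sketch of several of the arguments documented above; the inline data is illustrative only:

library(data.table)
# Ragged input: fill=TRUE pads the short row with NA, blank.lines.skip=TRUE drops
# the empty line, and strip.white=TRUE trims the unquoted " x " field to "x".
txt = "a,b,c\n1, x ,3\n4,y\n\n7,z,9\n"
fread(txt, fill=TRUE, blank.lines.skip=TRUE, strip.white=TRUE)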
6 changes: 5 additions & 1 deletion man/special-symbols.Rd
@@ -8,10 +8,11 @@
\alias{.N}
\alias{.EACHI}
\alias{.NGRP}
\alias{.NATURAL}
\title{ Special symbols }
\description{
\code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. \code{.I} can be used in \code{by} as well. See the vignettes, Details and Examples here and in \code{\link{data.table}}.
\code{.EACHI} is a symbol passed to \code{by}; i.e. \code{by=.EACHI}.
\code{.EACHI} is a symbol passed to \code{by}; i.e. \code{by=.EACHI}. \code{.NATURAL} is a symbol passed to \code{on}; i.e. \code{on=.NATURAL}.
}
\details{
The bindings of these variables are locked and attempting to assign to them will generate an error. If you wish to manipulate \code{.SD} before returning it, take a \code{copy(.SD)} first (see FAQ 4.5). Using \code{:=} in the \code{j} of \code{.SD} is reserved for future use as a (tortuously) flexible way to update \code{DT} by reference by group (even when groups are not contiguous in an ad hoc by).
@@ -29,6 +30,8 @@
\code{.EACHI} is defined as \code{NULL} but its value is not used. Its usage is \code{by=.EACHI} (or \code{keyby=.EACHI}) which invokes grouping-by-each-row-of-i; see \code{\link{data.table}}'s \code{by} argument for more details.

\code{.NATURAL} is defined as \code{NULL} but its value is not used. Its usage is \code{on=.NATURAL} (alternative to \code{X[on=Y]}) which joins two tables on their common column names, performing a natural join; see \code{\link{data.table}}'s \code{on} argument for more details.
Note that \code{.N} in \code{i} is computed up-front, while that in \code{j} applies \emph{after filtering in \code{i}}. That means that even absent grouping, \code{.N} in \code{i} can be different from \code{.N} in \code{j}. See Examples.
}
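
A minimal sketch of that distinction, using illustrative data:

library(data.table)
DT = data.table(a = 1:10)
DT[.N]            # .N in i is the total row count up-front, so this selects the last row
DT[a > 8, .N]     # .N in j counts rows remaining after the filter in i: 2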
\seealso{
Expand All @@ -54,6 +57,7 @@ DT[, c(.(y=max(y)), lapply(.SD, min)),
DT[, grp := .GRP, by=x] # add a group counter
DT[, grp_pct := .GRP/.NGRP, by=x] # add a group "progress" counter
X[, DT[.BY, y, on="x"], by=x] # join within each group
DT[X, on=.NATURAL] # join X and DT on common column similar to X[on=Y]
# .N can be different in i and j
DT[{cat(sprintf('in i, .N is \%d\n', .N)); a < .N/2},
