
Commit

Merge branch 'master' into fread-date-idate
MichaelChirico authored Jun 1, 2024
2 parents 78b9066 + 35f2979 commit c8febb9
Showing 4 changed files with 64 additions and 41 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/R-CMD-check-occasional.yaml
@@ -1,6 +1,6 @@
on:
schedule:
- cron: '17 13 23 * *' # 22nd of month at 13:17 UTC
- cron: '17 13 28 * *' # 28th of month at 13:17 UTC

# A more complete suite of checks to run monthly; each PR/merge need not pass all these, but they should pass before CRAN release
name: R-CMD-check-occasional
95 changes: 57 additions & 38 deletions inst/tests/tests.Rraw
@@ -207,6 +207,11 @@ with_c_collate = function(expr) {
expr
}

# strrep is used many times in tests, but is from R 3.3.0, so use this equivalent if it is missing.
if (!exists("strrep", "package:base")) {
strrep = function(x, times) mapply(function(x, times) paste(rep(x, times), collapse=""), rep_len(x, length(times)), times, USE.NAMES=FALSE)
}
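
# A minimal sanity sketch of the fallback's behaviour (it should match base::strrep,
# which exists from R 3.3.0; the inputs here are illustrative only):
#   x = c("ab", "cd")
#   strrep(x, c(2L, 3L))   # "abab" "cdcdcd"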

##########################
.do_not_rm = ls() # objects that exist at this point should not be removed by rm_all(); e.g. test_*, base_messages, Ctest_dt_win_snprintf, prevtest, etc
##########################
@@ -18428,44 +18433,58 @@ test(2252.2, dt[, let(b=2L)], error = "\\[ was called on a data.table.*not data.
rm(.datatable.aware)

# tests for trunc.char handling wide characters # 5096
accented_a = "\u0061\u0301"
ja_ichi = "\u4E00"
ja_ni = "\u4E8C"
ja_ko = "\u3053"
ja_n = "\u3093"
dots = "..."
clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output
# Tests for combining character latin a and acute accent, single row
DT = data.table(strrep(accented_a, 4L))
test(2253.01, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(accented_a, 4L))
test(2253.02, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(accented_a, 3L), dots))
test(2253.03, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(accented_a, 1L), dots))
# Tests for full-width japanese character ichi, single row
DT = data.table(strrep(ja_ichi, 4L))
test(2253.04, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(ja_ichi, 4L))
test(2253.05, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(ja_ichi, 3L), dots))
test(2253.06, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(ja_ichi, 1L), dots))
# Tests for multiple, different length combining character rows
DT = data.table(strrep(accented_a, 1L:4L))
test(2253.07, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "áááá"))
test(2253.08, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "ááá..."))
test(2253.09, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "á...", "á...", "á..."))
# Tests for multiple, different length full-width characters
DT = data.table(strrep(ja_ichi, 1L:4L))
test(2253.10, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一一"))
test(2253.11, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一..."))
test(2253.12, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一...", "一...", "一..."))
# Tests for combined characters, multiple columns
DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")
test(2253.13, options=list(datatable.prettyprint.char = 4L), capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa")
test(2253.14, options=list(datatable.prettyprint.char = 3L), capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa")
test(2253.15, options=list(datatable.prettyprint.char = 2L), capture.output(print(DT))[-1L], "1: 一 二二 ここ... áá aa...")
test(2253.16, options=list(datatable.prettyprint.char = 1L), capture.output(print(DT))[-1L], "1: 一 二... こ... á... a...")
# Tests for multiple columns, multiple rows
DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3))
test(2253.17, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんんん ááá"))
test(2253.18, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんん... ááá"))
test(2253.19, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ ん... á...", "こ... ん... á...", "こ... ん... á..."))
local({
lc_ctype = Sys.getlocale('LC_CTYPE')
Sys.setlocale('LC_CTYPE', "en_US.UTF-8") # Japanese multibyte characters require utf8
on.exit({Sys.setlocale('LC_CTYPE', lc_ctype)})
accented_a = "\u0061\u0301"
ja_ichi = "\u4E00"
ja_ni = "\u4E8C"
ja_ko = "\u3053"
ja_n = "\u3093"
dots = "..."
clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output
# Tests for combining character latin a and acute accent, single row
DT = data.table(strrep(accented_a, 4L))
test(2253.01, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(accented_a, 4L))
test(2253.02, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(accented_a, 3L), dots))
test(2253.03, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(accented_a, 1L), dots))
# Tests for full-width japanese character ichi, single row
DT = data.table(strrep(ja_ichi, 4L))
test(2253.04, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(ja_ichi, 4L))
test(2253.05, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(ja_ichi, 3L), dots))
test(2253.06, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(ja_ichi, 1L), dots))
# Tests for multiple, different length combining character rows
DT = data.table(strrep(accented_a, 1L:4L))
test(2253.07, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(accented_a, 1:4L))
test(2253.08, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(accented_a, 1:3), paste0(strrep(accented_a, 3L), dots)))
test(2253.09, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(accented_a, rep(paste0(accented_a, dots), 3L)))
# Tests for multiple, different length full-width characters
DT = data.table(strrep(ja_ichi, 1L:4L))
test(2253.10, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(ja_ichi, 1:4L))
test(2253.11, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(ja_ichi, 1:3), paste0(strrep(ja_ichi, 3L), dots)))
test(2253.12, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(ja_ichi, rep(paste0(ja_ichi, dots), 3L)))
# Tests for combined characters, multiple columns
DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")
test(2253.13, options=list(datatable.prettyprint.char = 4L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa"))
test(2253.14, options=list(datatable.prettyprint.char = 3L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa"))
test(2253.15, options=list(datatable.prettyprint.char = 2L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2), paste0(strrep(ja_ko, 2), dots) , strrep(accented_a, 2), "aa..."))
test(2253.16, options=list(datatable.prettyprint.char = 1L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, paste0(ja_ni, dots), paste0(ja_ko, dots), paste0(accented_a, dots), "a..."))
# Tests for multiple columns, multiple rows
DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3))
test(2253.17, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]),
c(paste0(ja_ko, " ", strrep(ja_n, 2L), " ", strrep(accented_a, 3L)),
paste0(strrep(ja_ko, 2L), " ", strrep(ja_n, 3L), " ", strrep(accented_a, 3L)),
paste(strrep(ja_ko, 3L), strrep(ja_n, 4L), strrep(accented_a, 3L))))
test(2253.18, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]),
c(paste0(ja_ko, " ", strrep(ja_n, 2L), " ", strrep(accented_a, 3L)),
paste0(strrep(ja_ko, 2L), " ", strrep(ja_n, 3L), " ", strrep(accented_a, 3L)),
paste(strrep(ja_ko, 3L), paste0(strrep(ja_n, 3L), dots), strrep(accented_a, 3L))))
test(2253.19, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]),
c(paste0(ja_ko, " ", paste0(ja_n, dots), " ", paste0(accented_a, dots)),
paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" "),
paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" ")))
})
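
A minimal sketch of the behaviour these tests exercise, assuming an interactive session (the option truncates printed character cells by character count and appends "..."):

library(data.table)
DT = data.table(x = c("abcdef", strrep("\u4E00", 4L)))
options(datatable.prettyprint.char = 3L)
print(DT)   # each cell truncated to 3 characters plus "...": "abc...", "一一一..."
options(datatable.prettyprint.char = NULL)   # unset to restore the default (no truncation)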

# allow 1-D matrix in j for consistency, #783
DT=data.table(a = rep(1:2, 3), b = 1:6)
2 changes: 1 addition & 1 deletion man/fread.Rd
@@ -52,7 +52,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC"
\item{check.names}{default is \code{FALSE}. If \code{TRUE} then the names of the variables in the \code{data.table} are checked to ensure that they are syntactically valid variable names. If necessary they are adjusted (by \code{\link{make.names}}) so that they are, and also to ensure that there are no duplicates.}
\item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}. Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. }
\item{quote}{ By default (\code{"\""}), if a field starts with a double quote, \code{fread} handles embedded quotes robustly as explained under \code{Details}. If it fails, then another attempt is made to read the field \emph{as is}, i.e., as if quotes are disabled. By setting \code{quote=""}, the field is always read as if quotes are disabled. It is not expected to ever need to pass anything other than \"\" to quote; i.e., to turn it off. }
\item{strip.white}{ default is \code{TRUE}. Strips leading and trailing whitespaces of unquoted fields. If \code{FALSE}, only header trailing spaces are removed. }
\item{strip.white}{ Logical, default \code{TRUE}, in which case leading and trailing whitespace is stripped from unquoted \code{"character"} fields. \code{"numeric"} fields are always stripped of leading and trailing whitespace.}
\item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. }
\item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.}
\item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
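
A minimal usage sketch of several of the arguments documented above; the inline data is illustrative only:

library(data.table)
# Ragged input: fill=TRUE pads the short row with NA, blank.lines.skip=TRUE drops
# the empty line, and strip.white=TRUE trims the unquoted " x " field to "x".
txt = "a,b,c\n1, x ,3\n4,y\n\n7,z,9\n"
fread(txt, fill=TRUE, blank.lines.skip=TRUE, strip.white=TRUE)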
6 changes: 5 additions & 1 deletion man/special-symbols.Rd
@@ -8,10 +8,11 @@
\alias{.N}
\alias{.EACHI}
\alias{.NGRP}
\alias{.NATURAL}
\title{ Special symbols }
\description{
\code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. \code{.I} can be used in \code{by} as well. See the vignettes, Details and Examples here and in \code{\link{data.table}}.
\code{.EACHI} is a symbol passed to \code{by}; i.e. \code{by=.EACHI}.
\code{.EACHI} is a symbol passed to \code{by}; i.e. \code{by=.EACHI}. \code{.NATURAL} is a symbol passed to \code{on}; i.e. \code{on=.NATURAL}.
}
\details{
The bindings of these variables are locked and attempting to assign to them will generate an error. If you wish to manipulate \code{.SD} before returning it, take a \code{copy(.SD)} first (see FAQ 4.5). Using \code{:=} in the \code{j} of \code{.SD} is reserved for future use as a (tortuously) flexible way to update \code{DT} by reference by group (even when groups are not contiguous in an ad hoc by).
@@ -29,6 +30,8 @@
\code{.EACHI} is defined as \code{NULL} but its value is not used. Its usage is \code{by=.EACHI} (or \code{keyby=.EACHI}) which invokes grouping-by-each-row-of-i; see \code{\link{data.table}}'s \code{by} argument for more details.

\code{.NATURAL} is defined as \code{NULL} but its value is not used. Its usage is \code{on=.NATURAL} (alternative to \code{X[on=Y]}) which joins two tables on their common column names, performing a natural join; see \code{\link{data.table}}'s \code{on} argument for more details.
Note that \code{.N} in \code{i} is computed up-front, while that in \code{j} applies \emph{after filtering in \code{i}}. That means that even absent grouping, \code{.N} in \code{i} can be different from \code{.N} in \code{j}. See Examples.
}
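
A minimal sketch of that distinction, using illustrative data:

library(data.table)
DT = data.table(a = 1:10)
DT[.N]            # .N in i is the total row count up-front, so this selects the last row
DT[a > 8, .N]     # .N in j counts rows remaining after the filter in i: 2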
\seealso{
Expand All @@ -54,6 +57,7 @@ DT[, c(.(y=max(y)), lapply(.SD, min)),
DT[, grp := .GRP, by=x] # add a group counter
DT[, grp_pct := .GRP/.NGRP, by=x] # add a group "progress" counter
X[, DT[.BY, y, on="x"], by=x] # join within each group
DT[X, on=.NATURAL] # join X and DT on common column similar to X[on=Y]
# .N can be different in i and j
DT[{cat(sprintf('in i, .N is \%d\n', .N)); a < .N/2},
