Merge branch 'master' into translation

Rdatatable · Nov 2, 2019 · b063414 · b063414
2 parents f554b79 + 92abb70
commit b063414
Show file tree

Hide file tree

Showing 10 changed files with 37 additions and 33 deletions.
diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd
@@ -127,7 +127,7 @@ install.packages("xml2")   # to check the 150 URLs in NEWS.md under --as-cran be
 q("no")
 R CMD build .
 R CMD check data.table_1.12.7.tar.gz --as-cran
-R CMD INSTALL data.table_1.12.7.tar.gz
+R CMD INSTALL data.table_1.12.7.tar.gz --html
 
 # Test C locale doesn't break test suite (#2771)
 echo LC_ALL=C > ~/.Renviron

diff --git a/.dev/revdep.R b/.dev/revdep.R
@@ -1,14 +1,14 @@
 # Run by package maintainer via these entries in ~/.bash_aliases :
-#   alias revdepr='cd ~/build/revdeplib/ && R_LIBS_SITE=none R_LIBS=~/build/revdeplib/ _R_CHECK_FORCE_SUGGESTS_=false R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R R'
 #   alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=false'
+#   alias revdepr='revdepsh; R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R ~/build/R-devel/bin/R'
 # revdep = reverse first-order dependency; i.e. the CRAN and Bioconductor packages which directly use data.table (765 at the time of writing)
 
 # Check that env variables have been set correctly:
 #   export R_LIBS_SITE=none
 #   export R_LIBS=~/build/revdeplib/
 #   export _R_CHECK_FORCE_SUGGESTS_=false
 stopifnot(identical(length(.libPaths()), 2L))     # revdeplib (writeable by me) and the pre-installed recommended R library (sudo writeable)
-stopifnot(identical(file.info(.libPaths())[,"uname"], c(as.vector(Sys.info()["user"]), "root")))
+stopifnot(identical(file.info(.libPaths())[,"uname"], rep(as.vector(Sys.info()["user"]), 2)))  # 2nd one is root when using default R rather than Rdevel
 stopifnot(identical(.libPaths()[1], getwd()))
 stopifnot(identical(Sys.getenv("_R_CHECK_FORCE_SUGGESTS_"),"false"))
 options(repos = c("CRAN"=c("http://cloud.r-project.org")))
@@ -18,10 +18,10 @@ options(repos = c("CRAN"=c("http://cloud.r-project.org")))
 # and BiocManager::install()) will call this script again recursively.
 Sys.unsetenv("R_PROFILE_USER")
 
-system("sudo R -e \"utils::update.packages('/usr/lib/R/library', ask=FALSE, checkBuilt=TRUE)\"")
+system(paste0("~/build/R-devel/bin/R -e \"utils::update.packages('",.libPaths()[2],"', ask=FALSE, checkBuilt=TRUE)\""))
 
 require(utils)  # only base is loaded when R_PROFILE_USER runs
-update.packages(ask=FALSE, checkBuilt=TRUE)
+update.packages(ask=FALSE, checkBuilt=FALSE)
 # if package not found on mirror, try manually a different one:
 #   install.packages("<pkg>", repos="http://cran.stat.ucla.edu/")
 #   update.packages(ask=FALSE)   # a repeat sometimes does more; keep repeating until nothing further is updated

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -309,7 +309,7 @@ integration: # merging all artifacts to produce single R repository and summarie
     # web/checks/check_results_$pkg.html
     - Rscript -e 'check.index("data.table", names(test.jobs))'
     # pkgdown merge
-    - Rscript -e 'common_files<-function(path1, path2) intersect(list.files(path1, all.files=TRUE, no..=TRUE), list.files(path2, all.files=TRUE, no..=TRUE)); if (length(f<-common_files("pkgdown","bus/integration/cran"))) message(paste(c("Following artifacts will be overwritten by pkgdown artifacts:", paste0("  ", f)), collapse="\n")); q("no")'
+    - Rscript -e 'common_files<-function(path1, path2) intersect(list.files(path1, all.files=TRUE, no..=TRUE), list.files(path2, all.files=TRUE, no..=TRUE)); msg = if (length(f<-common_files("pkgdown","bus/integration/cran"))) paste(c("Following artifacts will be overwritten by pkgdown artifacts:", paste0("  ", f)), collapse="\n") else "No overlapping files from pkgdown artifacts"; message(msg); q("no")'
     - mv pkgdown/* bus/integration/cran/
     # cleanup artifacts from other jobs
     - mkdir tmpbus

diff --git a/NEWS.md b/NEWS.md
@@ -10,6 +10,8 @@
 
 ## NOTES
 
+1. Links in the manual were creating warnings when installing HTML, [#4000](https://github.com/Rdatatable/data.table/issues/4000). Thanks to Morgan Jacob.
+
 
 # data.table [v1.12.6](https://github.com/Rdatatable/data.table/milestone/18?closed=1)  (18 Oct 2019)
 

diff --git a/README.md b/README.md
@@ -18,11 +18,9 @@
 
 ---
 
-**Tuesday 22nd October 2019<br>
+**26 December 2019<br>
+Efficiency in data processing: data.table basics - Jan Gorecki, [Mumbai R@IISA 2019](https://r-iisa2019.rbind.io/)**
 <br>
-Matt Dowle will be in New York for [H2O World](https://www.h2o.ai/h2oworldnewyork/).<br>
-Please Ask-Me-Anything starting now: click http://sli.do and enter event code  "askmattdowle".<br>
-I'll answer the most voted questions during my session: https://h2o.ai/h2oworldny-livestream-reg**
 
 ---
 
@@ -50,18 +48,10 @@ I'll answer the most voted questions during my session: https://h2o.ai/h2oworldn
 
 ## Installation
 
-``` r
-install.packages("data.table")
-```
-
-### Development version
-
 ```r
-install.packages("data.table", repos="https://Rdatatable.gitlab.io/data.table")
-```
+install.packages("data.table")
 
-or update only if newer revision is available
-```r
+# latest development version:
 data.table::update.dev.pkg()
 ```
 
@@ -92,6 +82,7 @@ DT[Petal.Width > 1.0, mean(Petal.Length), by = Species]
 
 * [Introduction to data.table](https://cloud.r-project.org/web/packages/data.table/vignettes/datatable-intro.html) vignette
 * [Getting started](https://github.com/Rdatatable/data.table/wiki/Getting-started) wiki page
+* [Examples](https://rdatatable.gitlab.io/data.table/reference/data.table.html#examples) produced by `example(data.table)`
 
 ### Cheatsheets
 

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -7,9 +7,14 @@ template:
 development:
   version_tooltip: "Development version"
 
+home:
+  links:
+  - text: CRAN-like website
+    href: web/packages/data.table/index.html
+
 navbar:
   structure:
-    left:  [home, introduction, reference, articles, news, benchmarks]
+    left:  [home, introduction, articles, news, benchmarks, presentations, communityarticles, reference]
     right: [github]
   components:
     home:
@@ -18,9 +23,6 @@ navbar:
     introduction:
       text: Introduction
       href: articles/datatable-intro.html
-    reference:
-      text: Manual
-      href: reference/index.html
     articles:
       text: Vignettes
       menu:
@@ -43,11 +45,20 @@ navbar:
       - text: "Benchmarking data.table"
         href: articles/datatable-benchmarking.html
     news:
-      text: Changelog
+      text: News
       href: news/index.html
     benchmarks:
       text: Benchmarks
       href: https://h2oai.github.io/db-benchmark
+    presentations:
+      text: Presentations
+      href: https://github.com/Rdatatable/data.table/wiki/Presentations
+    communityarticles:
+      text: Articles
+      href: https://github.com/Rdatatable/data.table/wiki/Articles
+    reference:
+      text: Manual
+      href: reference/index.html
     github:
       icon: fab fa-github fa-lg
       href: https://github.com/Rdatatable/data.table
diff --git a/man/data.table.Rd b/man/data.table.Rd
@@ -231,7 +231,7 @@ column called \code{"keep"} containing \code{TRUE} and this is correct behaviour
 \seealso{ \code{\link{special-symbols}}, \code{\link{data.frame}}, \code{\link{[.data.frame}}, \code{\link{as.data.table}}, \code{\link{setkey}}, \code{\link{setorder}}, \code{\link{setDT}}, \code{\link{setDF}}, \code{\link{J}}, \code{\link{SJ}}, \code{\link{CJ}}, \code{\link{merge.data.table}}, \code{\link{tables}}, \code{\link{test.data.table}}, \code{\link{IDateTime}}, \code{\link{unique.data.table}}, \code{\link{copy}}, \code{\link{:=}}, \code{\link{setalloccol}}, \code{\link{truelength}}, \code{\link{rbindlist}}, \code{\link{setNumericRounding}}, \code{\link{datatable-optimize}}, \code{\link{fsetdiff}}, \code{\link{funion}}, \code{\link{fintersect}}, \code{\link{fsetequal}}, \code{\link{anyDuplicated}}, \code{\link{uniqueN}}, \code{\link{rowid}}, \code{\link{rleid}}, \code{\link{na.omit}}, \code{\link{frank}} }
 \examples{
 \dontrun{
-example(data.table)  # to run these examples at the prompt
+example(data.table)  # to run these examples yourself
 }
 DF = data.frame(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9)
 DT = data.table(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9)

diff --git a/man/fread.Rd b/man/fread.Rd
@@ -42,11 +42,11 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir()
   \item{skip}{ If 0 (default), starts on the first line and from there finds the first row with a consistent number of columns. This automatically avoids irregular header information before the column names row. \code{skip>0} means ignore the first \code{skip} rows manually. \code{skip="string"} searches for \code{"string"} in the file (e.g. a substring of the column names row) and starts on that line (inspired by read.xls in package gdata). }
   \item{select}{ A vector of column names or numbers to keep, drop the rest. \code{select} may specify types too in the same way as \code{colClasses}; i.e., a vector of \code{colname=type} pairs, or a \code{list} of \code{type=col(s)} pairs. In all forms of \code{select}, the order that the columns are specified determines the order of the columns in the result. }
   \item{drop}{ Vector of column names or numbers to drop, keep the rest. }
-  \item{colClasses}{ As in \code{\link[utils]{read.csv}}; i.e., an unnamed vector of types corresponding to the columns in the file, or a named vector specifying types for a subset of the columns by name. The default, \code{NULL} means types are inferred from the data in the file. Further, \code{data.table} supports a named \code{list} of vectors of column names \emph{or numbers} where the \code{list} names are the class names; see examples. The \code{list} form makes it easier to set a batch of columns to be a particular class. When column numbers are used in the \code{list} form, they refer to the column number in the file not the column number after \code{select} or \code{drop} has been applied.
+  \item{colClasses}{ As in \code{\link[utils:read.table]{utils::read.csv}}; i.e., an unnamed vector of types corresponding to the columns in the file, or a named vector specifying types for a subset of the columns by name. The default, \code{NULL} means types are inferred from the data in the file. Further, \code{data.table} supports a named \code{list} of vectors of column names \emph{or numbers} where the \code{list} names are the class names; see examples. The \code{list} form makes it easier to set a batch of columns to be a particular class. When column numbers are used in the \code{list} form, they refer to the column number in the file not the column number after \code{select} or \code{drop} has been applied.
     If type coercion results in an error, introduces \code{NA}s, or would result in loss of accuracy, the coercion attempt is aborted for that column with warning and the column's type is left unchanged. If you really desire data loss (e.g. reading \code{3.14} as \code{integer}) you have to truncate such columns afterwards yourself explicitly so that this is clear to future readers of your code.
   }
-  \item{integer64}{ "integer64" (default) reads columns detected as containing integers larger than 2^31 as type \code{bit64::integer64}. Alternatively, \code{"double"|"numeric"} reads as \code{base::read.csv} does; i.e., possibly with loss of precision and if so silently. Or, "character". }
-  \item{dec}{ The decimal separator as in \code{base::read.csv}. If not "." (default) then usually ",". See details. }
+  \item{integer64}{ "integer64" (default) reads columns detected as containing integers larger than 2^31 as type \code{bit64::integer64}. Alternatively, \code{"double"|"numeric"} reads as \code{utils::read.csv} does; i.e., possibly with loss of precision and if so silently. Or, "character". }
+  \item{dec}{ The decimal separator as in \code{utils::read.csv}. If not "." (default) then usually ",". See details. }
   \item{col.names}{ A vector of optional names for the variables (columns). The default is to use the header column if present or detected, or if not "V" followed by the column number. This is applied after \code{check.names} and before \code{key} and \code{index}. }
   \item{check.names}{default is \code{FALSE}. If \code{TRUE} then the names of the variables in the \code{data.table} are checked to ensure that they are syntactically valid variable names. If necessary they are adjusted (by \code{\link{make.names}}) so that they are, and also to ensure that there are no duplicates.}
   \item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}.  Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. }
@@ -63,7 +63,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir()
   \item{keepLeadingZeros}{If TRUE a column containing numeric data with leading zeros will be read as character, otherwise leading zeros will be removed and converted to numeric.}
   \item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{\link[yaml]{yaml.load}}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. }
   \item{autostart}{ Deprecated and ignored with warning. Please use \code{skip} instead. }
-  \item{tmpdir}{ Directory to use as the \code{tmpdir} argument for any \code{tempfile} calls, e.g. when the input is a URL or a shell command. The default is \code{tempdir()} which can be controlled by setting \code{TMPDIR} before starting the R session; see \code{\link[base]{tempdir}}. }
+  \item{tmpdir}{ Directory to use as the \code{tmpdir} argument for any \code{tempfile} calls, e.g. when the input is a URL or a shell command. The default is \code{tempdir()} which can be controlled by setting \code{TMPDIR} before starting the R session; see \code{\link[base:tempfile]{base::tempdir}}. }
 }
 \details{
 

diff --git a/man/like.Rd b/man/like.Rd
@@ -22,13 +22,13 @@ vector \%flike\% pattern
    \item{fixed}{ \code{logical}; should \code{pattern} be interpreted as a literal string (i.e., ignoring regular expressions)? }
 }
 \details{
-  Internally, \code{like} is essentially a wrapper around \code{\link[base]{grepl}}, except that it is smarter about handling \code{factor} input (\code{base::grep} uses slow \code{as.character} conversion).
+  Internally, \code{like} is essentially a wrapper around \code{\link[base:grep]{base::grepl}}, except that it is smarter about handling \code{factor} input (\code{base::grep} uses slow \code{as.character} conversion).
 }
 \value{
     Logical vector, \code{TRUE} for items that match \code{pattern}.
 }
 \note{ Current implementation does not make use of sorted keys. }
-\seealso{ \code{\link[base]{grepl}} }
+\seealso{ \code{\link[base:grep]{base::grepl}} }
 \examples{
 DT = data.table(Name=c("Mary","George","Martha"), Salary=c(2,3,4))
 DT[Name \%like\% "^Mar"]

diff --git a/man/openmp-utils.Rd b/man/openmp-utils.Rd
@@ -19,7 +19,7 @@
   A length 1 \code{integer}. The old value is returned by \code{setDTthreads} so you can store that prior value and pass it to \code{setDTthreads()} again after the section of your code where you control the number of threads.
 }
 \details{
-  \code{data.table} automatically switches to single threaded mode upon fork (the mechanism used by \code{\link[parallel]{mclapply}} and the foreach package). Otherwise, nested parallelism would very likely overload your CPUs and result in much slower execution. As \code{data.table} becomes more parallel internally, we expect explicit user parallelism to be needed less often. The \code{restore_after_fork} option controls what happens after the explicit fork parallelism completes. It needs to be at C level so it is not a regular R option using \code{options()}. By default \code{data.table} will be multi-threaded again; restoring the prior setting of \code{getDTthreads()}. But problems have been reported in the past on Mac with Intel OpenMP libraries whereas success has been reported on Linux. If you experience problems after fork, start a new R session and change the default behaviour by calling \code{setDTthreads(restore_after_fork=FALSE)} before retrying. Please raise issues on the data.table GitHub issues page.
+  \code{data.table} automatically switches to single threaded mode upon fork (the mechanism used by \code{parallel::mclapply} and the foreach package). Otherwise, nested parallelism would very likely overload your CPUs and result in much slower execution. As \code{data.table} becomes more parallel internally, we expect explicit user parallelism to be needed less often. The \code{restore_after_fork} option controls what happens after the explicit fork parallelism completes. It needs to be at C level so it is not a regular R option using \code{options()}. By default \code{data.table} will be multi-threaded again; restoring the prior setting of \code{getDTthreads()}. But problems have been reported in the past on Mac with Intel OpenMP libraries whereas success has been reported on Linux. If you experience problems after fork, start a new R session and change the default behaviour by calling \code{setDTthreads(restore_after_fork=FALSE)} before retrying. Please raise issues on the data.table GitHub issues page.
 
   The number of logical CPUs is determined by the OpenMP function \code{omp_get_num_procs()} whose meaning may vary across platforms and OpenMP implementations. \code{setDTthreads()} will not allow more than this limit. Neither will it allow more than \code{omp_get_thread_limit()} nor the current value of \code{Sys.getenv("OMP_THREAD_LIMIT")}. Note that CRAN's daily test system (results for data.table \href{https://cran.r-project.org/web/checks/check_results_data.table.html}{here}) sets \code{OMP_THREAD_LIMIT} to 2, a limit which should always be respected; e.g., if you have written a package that uses data.table and your package is to be released on CRAN, you should not change \code{OMP_THREAD_LIMIT} in your package to a value greater than 2.