diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 9e40ec223..8239c0295 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -127,7 +127,7 @@ install.packages("xml2") # to check the 150 URLs in NEWS.md under --as-cran be q("no") R CMD build . R CMD check data.table_1.12.7.tar.gz --as-cran -R CMD INSTALL data.table_1.12.7.tar.gz +R CMD INSTALL data.table_1.12.7.tar.gz --html # Test C locale doesn't break test suite (#2771) echo LC_ALL=C > ~/.Renviron diff --git a/.dev/revdep.R b/.dev/revdep.R index e7eb821a9..dd1546a1e 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -1,6 +1,6 @@ # Run by package maintainer via these entries in ~/.bash_aliases : -# alias revdepr='cd ~/build/revdeplib/ && R_LIBS_SITE=none R_LIBS=~/build/revdeplib/ _R_CHECK_FORCE_SUGGESTS_=false R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R R' # alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=false' +# alias revdepr='revdepsh; R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R ~/build/R-devel/bin/R' # revdep = reverse first-order dependency; i.e. the CRAN and Bioconductor packages which directly use data.table (765 at the time of writing) # Check that env variables have been set correctly: @@ -8,7 +8,7 @@ # export R_LIBS=~/build/revdeplib/ # export _R_CHECK_FORCE_SUGGESTS_=false stopifnot(identical(length(.libPaths()), 2L)) # revdeplib (writeable by me) and the pre-installed recommended R library (sudo writeable) -stopifnot(identical(file.info(.libPaths())[,"uname"], c(as.vector(Sys.info()["user"]), "root"))) +stopifnot(identical(file.info(.libPaths())[,"uname"], rep(as.vector(Sys.info()["user"]), 2))) # 2nd one is root when using default R rather than Rdevel stopifnot(identical(.libPaths()[1], getwd())) stopifnot(identical(Sys.getenv("_R_CHECK_FORCE_SUGGESTS_"),"false")) options(repos = c("CRAN"=c("http://cloud.r-project.org"))) @@ -18,10 +18,10 @@ options(repos = c("CRAN"=c("http://cloud.r-project.org"))) # and BiocManager::install()) will call this script again recursively. Sys.unsetenv("R_PROFILE_USER") -system("sudo R -e \"utils::update.packages('/usr/lib/R/library', ask=FALSE, checkBuilt=TRUE)\"") +system(paste0("~/build/R-devel/bin/R -e \"utils::update.packages('",.libPaths()[2],"', ask=FALSE, checkBuilt=TRUE)\"")) require(utils) # only base is loaded when R_PROFILE_USER runs -update.packages(ask=FALSE, checkBuilt=TRUE) +update.packages(ask=FALSE, checkBuilt=FALSE) # if package not found on mirror, try manually a different one: # install.packages("", repos="http://cran.stat.ucla.edu/") # update.packages(ask=FALSE) # a repeat sometimes does more, keep repeating until none diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c7698a51a..43ddf537a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -309,7 +309,7 @@ integration: # merging all artifacts to produce single R repository and summarie # web/checks/check_results_$pkg.html - Rscript -e 'check.index("data.table", names(test.jobs))' # pkgdown merge - - Rscript -e 'common_files<-function(path1, path2) intersect(list.files(path1, all.files=TRUE, no..=TRUE), list.files(path2, all.files=TRUE, no..=TRUE)); if (length(f<-common_files("pkgdown","bus/integration/cran"))) message(paste(c("Following artifacts will be overwritten by pkgdown artifacts:", paste0(" ", f)), collapse="\n")); q("no")' + - Rscript -e 'common_files<-function(path1, path2) intersect(list.files(path1, all.files=TRUE, no..=TRUE), list.files(path2, all.files=TRUE, no..=TRUE)); msg = if (length(f<-common_files("pkgdown","bus/integration/cran"))) paste(c("Following artifacts will be overwritten by pkgdown artifacts:", paste0(" ", f)), collapse="\n") else "No overlapping files from pkgdown artifacts"; message(msg); q("no")' - mv pkgdown/* bus/integration/cran/ # cleanup artifacts from other jobs - mkdir tmpbus diff --git a/NEWS.md b/NEWS.md index f53b8fa6e..adc1c4836 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,6 +10,8 @@ ## NOTES +1. Links in the manual were creating warnings when installing HTML, [#4000](https://github.com/Rdatatable/data.table/issues/4000). Thanks to Morgan Jacob. + # data.table [v1.12.6](https://github.com/Rdatatable/data.table/milestone/18?closed=1) (18 Oct 2019) diff --git a/README.md b/README.md index ab4cf7f56..d4510ac6c 100644 --- a/README.md +++ b/README.md @@ -18,11 +18,9 @@ --- -**Tuesday 22nd October 2019
+**26 December 2019
+Efficiency in data processing: data.table basics - Jan Gorecki, [Mumbai R@IISA 2019](https://r-iisa2019.rbind.io/)**
-Matt Dowle will be in New York for [H2O World](https://www.h2o.ai/h2oworldnewyork/).
-Please Ask-Me-Anything starting now: click http://sli.do and enter event code "askmattdowle".
-I'll answer the most voted questions during my session: https://h2o.ai/h2oworldny-livestream-reg** --- @@ -50,18 +48,10 @@ I'll answer the most voted questions during my session: https://h2o.ai/h2oworldn ## Installation -``` r -install.packages("data.table") -``` - -### Development version - ```r -install.packages("data.table", repos="https://Rdatatable.gitlab.io/data.table") -``` +install.packages("data.table") -or update only if newer revision is available -```r +# latest development version: data.table::update.dev.pkg() ``` @@ -92,6 +82,7 @@ DT[Petal.Width > 1.0, mean(Petal.Length), by = Species] * [Introduction to data.table](https://cloud.r-project.org/web/packages/data.table/vignettes/datatable-intro.html) vignette * [Getting started](https://github.com/Rdatatable/data.table/wiki/Getting-started) wiki page +* [Examples](https://rdatatable.gitlab.io/data.table/reference/data.table.html#examples) produced by `example(data.table)` ### Cheatsheets diff --git a/_pkgdown.yml b/_pkgdown.yml index 150d94a77..6d2ef397d 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -7,9 +7,14 @@ template: development: version_tooltip: "Development version" +home: + links: + - text: CRAN-like website + href: web/packages/data.table/index.html + navbar: structure: - left: [home, introduction, reference, articles, news, benchmarks] + left: [home, introduction, articles, news, benchmarks, presentations, communityarticles, reference] right: [github] components: home: @@ -18,9 +23,6 @@ navbar: introduction: text: Introduction href: articles/datatable-intro.html - reference: - text: Manual - href: reference/index.html articles: text: Vignettes menu: @@ -43,11 +45,20 @@ navbar: - text: "Benchmarking data.table" href: articles/datatable-benchmarking.html news: - text: Changelog + text: News href: news/index.html benchmarks: text: Benchmarks href: https://h2oai.github.io/db-benchmark + presentations: + text: Presentations + href: https://github.com/Rdatatable/data.table/wiki/Presentations + communityarticles: + text: Articles + href: https://github.com/Rdatatable/data.table/wiki/Articles + reference: + text: Manual + href: reference/index.html github: icon: fab fa-github fa-lg href: https://github.com/Rdatatable/data.table diff --git a/man/data.table.Rd b/man/data.table.Rd index 1170813b6..b0a4037d3 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -231,7 +231,7 @@ column called \code{"keep"} containing \code{TRUE} and this is correct behaviour \seealso{ \code{\link{special-symbols}}, \code{\link{data.frame}}, \code{\link{[.data.frame}}, \code{\link{as.data.table}}, \code{\link{setkey}}, \code{\link{setorder}}, \code{\link{setDT}}, \code{\link{setDF}}, \code{\link{J}}, \code{\link{SJ}}, \code{\link{CJ}}, \code{\link{merge.data.table}}, \code{\link{tables}}, \code{\link{test.data.table}}, \code{\link{IDateTime}}, \code{\link{unique.data.table}}, \code{\link{copy}}, \code{\link{:=}}, \code{\link{setalloccol}}, \code{\link{truelength}}, \code{\link{rbindlist}}, \code{\link{setNumericRounding}}, \code{\link{datatable-optimize}}, \code{\link{fsetdiff}}, \code{\link{funion}}, \code{\link{fintersect}}, \code{\link{fsetequal}}, \code{\link{anyDuplicated}}, \code{\link{uniqueN}}, \code{\link{rowid}}, \code{\link{rleid}}, \code{\link{na.omit}}, \code{\link{frank}} } \examples{ \dontrun{ -example(data.table) # to run these examples at the prompt +example(data.table) # to run these examples yourself } DF = data.frame(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9) DT = data.table(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9) diff --git a/man/fread.Rd b/man/fread.Rd index fc147fa0e..03b288ebc 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -42,11 +42,11 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir() \item{skip}{ If 0 (default) start on the first line and from there finds the first row with a consistent number of columns. This automatically avoids irregular header information before the column names row. \code{skip>0} means ignore the first \code{skip} rows manually. \code{skip="string"} searches for \code{"string"} in the file (e.g. a substring of the column names row) and starts on that line (inspired by read.xls in package gdata). } \item{select}{ A vector of column names or numbers to keep, drop the rest. \code{select} may specify types too in the same way as \code{colClasses}; i.e., a vector of \code{colname=type} pairs, or a \code{list} of \code{type=col(s)} pairs. In all forms of \code{select}, the order that the columns are specified determines the order of the columns in the result. } \item{drop}{ Vector of column names or numbers to drop, keep the rest. } - \item{colClasses}{ As in \code{\link[utils]{read.csv}}; i.e., an unnamed vector of types corresponding to the columns in the file, or a named vector specifying types for a subset of the columns by name. The default, \code{NULL} means types are inferred from the data in the file. Further, \code{data.table} supports a named \code{list} of vectors of column names \emph{or numbers} where the \code{list} names are the class names; see examples. The \code{list} form makes it easier to set a batch of columns to be a particular class. When column numbers are used in the \code{list} form, they refer to the column number in the file not the column number after \code{select} or \code{drop} has been applied. + \item{colClasses}{ As in \code{\link[utils:read.table]{utils::read.csv}}; i.e., an unnamed vector of types corresponding to the columns in the file, or a named vector specifying types for a subset of the columns by name. The default, \code{NULL} means types are inferred from the data in the file. Further, \code{data.table} supports a named \code{list} of vectors of column names \emph{or numbers} where the \code{list} names are the class names; see examples. The \code{list} form makes it easier to set a batch of columns to be a particular class. When column numbers are used in the \code{list} form, they refer to the column number in the file not the column number after \code{select} or \code{drop} has been applied. If type coercion results in an error, introduces \code{NA}s, or would result in loss of accuracy, the coercion attempt is aborted for that column with warning and the column's type is left unchanged. If you really desire data loss (e.g. reading \code{3.14} as \code{integer}) you have to truncate such columns afterwards yourself explicitly so that this is clear to future readers of your code. } - \item{integer64}{ "integer64" (default) reads columns detected as containing integers larger than 2^31 as type \code{bit64::integer64}. Alternatively, \code{"double"|"numeric"} reads as \code{base::read.csv} does; i.e., possibly with loss of precision and if so silently. Or, "character". } - \item{dec}{ The decimal separator as in \code{base::read.csv}. If not "." (default) then usually ",". See details. } + \item{integer64}{ "integer64" (default) reads columns detected as containing integers larger than 2^31 as type \code{bit64::integer64}. Alternatively, \code{"double"|"numeric"} reads as \code{utils::read.csv} does; i.e., possibly with loss of precision and if so silently. Or, "character". } + \item{dec}{ The decimal separator as in \code{utils::read.csv}. If not "." (default) then usually ",". See details. } \item{col.names}{ A vector of optional names for the variables (columns). The default is to use the header column if present or detected, or if not "V" followed by the column number. This is applied after \code{check.names} and before \code{key} and \code{index}. } \item{check.names}{default is \code{FALSE}. If \code{TRUE} then the names of the variables in the \code{data.table} are checked to ensure that they are syntactically valid variable names. If necessary they are adjusted (by \code{\link{make.names}}) so that they are, and also to ensure that there are no duplicates.} \item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}. Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. } @@ -63,7 +63,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir() \item{keepLeadingZeros}{If TRUE a column containing numeric data with leading zeros will be read as character, otherwise leading zeros will be removed and converted to numeric.} \item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{\link[yaml]{yaml.load}}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. } \item{autostart}{ Deprecated and ignored with warning. Please use \code{skip} instead. } - \item{tmpdir}{ Directory to use as the \code{tmpdir} argument for any \code{tempfile} calls, e.g. when the input is a URL or a shell command. The default is \code{tempdir()} which can be controlled by setting \code{TMPDIR} before starting the R session; see \code{\link[base]{tempdir}}. } + \item{tmpdir}{ Directory to use as the \code{tmpdir} argument for any \code{tempfile} calls, e.g. when the input is a URL or a shell command. The default is \code{tempdir()} which can be controlled by setting \code{TMPDIR} before starting the R session; see \code{\link[base:tempfile]{base::tempdir}}. } } \details{ diff --git a/man/like.Rd b/man/like.Rd index de6edae0f..4eadb98a8 100644 --- a/man/like.Rd +++ b/man/like.Rd @@ -22,13 +22,13 @@ vector \%flike\% pattern \item{fixed}{ \code{logical}; should \code{pattern} be interpreted as a literal string (i.e., ignoring regular expressions)? } } \details{ - Internally, \code{like} is essentially a wrapper around \code{\link[base]{grepl}}, except that it is smarter about handling \code{factor} input (\code{base::grep} uses slow \code{as.character} conversion). + Internally, \code{like} is essentially a wrapper around \code{\link[base:grep]{base::grepl}}, except that it is smarter about handling \code{factor} input (\code{base::grep} uses slow \code{as.character} conversion). } \value{ Logical vector, \code{TRUE} for items that match \code{pattern}. } \note{ Current implementation does not make use of sorted keys. } -\seealso{ \code{\link[base]{grepl}} } +\seealso{ \code{\link[base:grep]{base::grepl}} } \examples{ DT = data.table(Name=c("Mary","George","Martha"), Salary=c(2,3,4)) DT[Name \%like\% "^Mar"] diff --git a/man/openmp-utils.Rd b/man/openmp-utils.Rd index c057f0433..14564fb54 100644 --- a/man/openmp-utils.Rd +++ b/man/openmp-utils.Rd @@ -19,7 +19,7 @@ A length 1 \code{integer}. The old value is returned by \code{setDTthreads} so you can store that prior value and pass it to \code{setDTthreads()} again after the section of your code where you control the number of threads. } \details{ - \code{data.table} automatically switches to single threaded mode upon fork (the mechanism used by \code{\link[parallel]{mclapply}} and the foreach package). Otherwise, nested parallelism would very likely overload your CPUs and result in much slower execution. As \code{data.table} becomes more parallel internally, we expect explicit user parallelism to be needed less often. The \code{restore_after_fork} option controls what happens after the explicit fork parallelism completes. It needs to be at C level so it is not a regular R option using \code{options()}. By default \code{data.table} will be multi-threaded again; restoring the prior setting of \code{getDTthreads()}. But problems have been reported in the past on Mac with Intel OpenMP libraries whereas success has been reported on Linux. If you experience problems after fork, start a new R session and change the default behaviour by calling \code{setDTthreads(restore_after_fork=FALSE)} before retrying. Please raise issues on the data.table GitHub issues page. + \code{data.table} automatically switches to single threaded mode upon fork (the mechanism used by \code{parallel::mclapply} and the foreach package). Otherwise, nested parallelism would very likely overload your CPUs and result in much slower execution. As \code{data.table} becomes more parallel internally, we expect explicit user parallelism to be needed less often. The \code{restore_after_fork} option controls what happens after the explicit fork parallelism completes. It needs to be at C level so it is not a regular R option using \code{options()}. By default \code{data.table} will be multi-threaded again; restoring the prior setting of \code{getDTthreads()}. But problems have been reported in the past on Mac with Intel OpenMP libraries whereas success has been reported on Linux. If you experience problems after fork, start a new R session and change the default behaviour by calling \code{setDTthreads(restore_after_fork=FALSE)} before retrying. Please raise issues on the data.table GitHub issues page. The number of logical CPUs is determined by the OpenMP function \code{omp_get_num_procs()} whose meaning may vary across platforms and OpenMP implementations. \code{setDTthreads()} will not allow more than this limit. Neither will it allow more than \code{omp_get_thread_limit()} nor the current value of \code{Sys.getenv("OMP_THREAD_LIMIT")}. Note that CRAN's daily test system (results for data.table \href{https://cran.r-project.org/web/checks/check_results_data.table.html}{here}) sets \code{OMP_THREAD_LIMIT} to 2 and should always be respected; e.g., if you have written a package that uses data.table and your package is to be released on CRAN, you should not change \code{OMP_THREAD_LIMIT} in your package to a value greater than 2.