From 3ba1c24aec5fe60bd1612dab4a550903aa2d0d8e Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 8 Nov 2022 17:41:25 -0500 Subject: [PATCH] test.data.table(memtest=TRUE) (#5515) --- NEWS.md | 2 + R/test.data.table.R | 103 ++++++++++++++++++----------------------- inst/tests/tests.Rraw | 9 ++-- man/test.data.table.Rd | 4 +- 4 files changed, 55 insertions(+), 63 deletions(-) diff --git a/NEWS.md b/NEWS.md index 57214ade8..15bf7e8ea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -599,6 +599,8 @@ 15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463). +16. `test.data.table()` no longer creates `DT` in `.GlobalEnv` and gains `memtest=` for use on Linux to report which tests use the most memory. + # data.table [v1.14.4](https://github.com/Rdatatable/data.table/milestone/26?closed=1) (17 Oct 2022) diff --git a/R/test.data.table.R b/R/test.data.table.R index 298fc34c1..1c212f58b 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -1,8 +1,12 @@ -test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent) { +test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent, + memtest=Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0)) { stopifnot(isTRUEorFALSE(verbose), isTRUEorFALSE(silent), isTRUEorFALSE(showProgress)) + memtest = as.integer(memtest) + stopifnot(length(memtest)==1L, memtest %in% 0:2) if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { # package developer # nocov start + dev = TRUE if ("package:data.table" %chin% search()) stopf("data.table package is loaded. Unload or start a fresh R session.") rootdir = if (pkg!="." && pkg %chin% dir()) file.path(getwd(), pkg) else Sys.getenv("PROJ_PATH") subdir = file.path("inst","tests") @@ -10,6 +14,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # nocov end } else { # i) R CMD check and ii) user running test.data.table() + dev = FALSE rootdir = getNamespaceInfo("data.table","path") subdir = "tests" env = new.env(parent=parent.env(.GlobalEnv)) # when user runs test.data.table() we don't want their variables in .GlobalEnv affecting tests, #3705 @@ -112,14 +117,18 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F assign("whichfail", NULL, envir=env) assign("started.at", proc.time(), envir=env) assign("lasttime", proc.time()[3L], envir=env) # used by test() to attribute time inbetween tests to the next test - assign("timings", data.table( ID = seq_len(9999L), time=0.0, nTest=0L ), envir=env) # test timings aggregated to integer id - assign("memtest", as.logical(Sys.getenv("TEST_DATA_TABLE_MEMTEST", "FALSE")), envir=env) + assign("timings", data.table( ID = seq_len(9999L), time=0.0, nTest=0L, RSS=0.0 ), envir=env) # test timings aggregated to integer id + assign("memtest", memtest, envir=env) assign("filename", fn, envir=env) - assign("inittime", as.integer(Sys.time()), envir=env) # keep measures from various test.data.table runs assign("showProgress", showProgress, envir=env) owd = setwd(tempdir()) # ensure writeable directory; e.g. tests that plot may write .pdf here depending on device option and/or batch mode; #5190 on.exit(setwd(owd)) + + if (memtest) { + catf("\n***\n*** memtest=%d. This should be the first task in a fresh R session for best results. Ctrl-C now if not.\n***\n\n", memtest) + if (is.na(ps_mem())) stopf("memtest intended for Linux. Step through ps_mem() to see what went wrong.") + } err = try(sys.source(fn, envir=env), silent=silent) @@ -174,42 +183,27 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F } # There aren't any errors, so we can use up 11 lines for the timings table - timings = env$timings - DT = head(timings[-1L][order(-time)], 10L) # exclude id 1 as in dev that includes JIT - if ((x<-sum(timings[["nTest"]])) != ntest) { - warningf("Timings count mismatch: %d vs %d", x, ntest) # nocov + nTest = RSS = NULL # to avoid 'no visible binding' note + timings = env$timings[nTest>0] + if (!memtest) { + ans = head(timings[if (dev) -1L else TRUE][order(-time)], 10L)[,RSS:=NULL] # exclude id 1 in dev as that includes JIT + if ((x<-sum(timings[["nTest"]])) != ntest) { + warningf("Timings count mismatch: %d vs %d", x, ntest) # nocov + } + catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-ans[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss)) + print(ans, class=FALSE) + } else { + y = head(order(-diff(timings$RSS)), 10L) + ans = timings[, diff:=c(NA,round(diff(RSS),1))][y+1L][,time:=NULL] # time is distracting and influenced by gc() calls; just focus on RAM usage here + catf("10 largest RAM increases (MB); see plot for cumulative effect (if any)\n") + print(ans, class=FALSE) + plot(timings$RSS, main=basename(fn), ylab="RSS (MB)") } - catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-DT[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss)) - print(DT, class=FALSE) catf("All %d tests (last %.8g) in %s completed ok in %s\n", ntest, env$prevtest, names(fn), timetaken(env$started.at)) - - ## this chunk requires to include new suggested deps: graphics, grDevices - #memtest.plot = function(.inittime) { - # if (!all(requireNamespace(c("graphics","grDevices"), quietly=TRUE))) return(invisible()) - # inittime=PS_rss=GC_used=GC_max_used=NULL - # m = fread("memtest.csv")[inittime==.inittime] - # if (nrow(m)) { - # ps_na = allNA(m[["PS_rss"]]) # OS with no 'ps -o rss R' support - # grDevices::png("memtest.png") - # p = graphics::par(mfrow=c(if (ps_na) 2 else 3, 2)) - # if (!ps_na) { - # m[, graphics::plot(test, PS_rss, pch=18, xlab="test num", ylab="mem MB", main="ps -o rss R")] - # m[, graphics::plot(timestamp, PS_rss, type="l", xlab="timestamp", ylab="mem MB", main="ps -o rss R")] - # } - # m[, graphics::plot(test, GC_used, pch=18, xlab="test num", ylab="mem MB", main="gc used")] - # m[, graphics::plot(timestamp, GC_used, type="l", xlab="timestamp", ylab="mem MB", main="gc used")] - # m[, graphics::plot(test, GC_max_used, pch=18, xlab="test num", ylab="mem MB", main="gc max used")] - # m[, graphics::plot(timestamp, GC_max_used, type="l", xlab="timestamp", ylab="mem MB", main="gc max used")] - # graphics::par(p) - # grDevices::dev.off() - # } else { - # warningf("test.data.table runs with memory testing but did not collect any memory statistics.") - # } - #} - #if (memtest<-get("memtest", envir=env)) memtest.plot(get("inittime", envir=env)) - - invisible(nfail==0L) + ans = nfail==0L + attr(ans, "timings") = timings # as attr to not upset callers who expect a TRUE/FALSE result + invisible(ans) } # nocov start @@ -235,17 +229,16 @@ INT = function(...) { as.integer(c(...)) } # utility used in tests.Rraw ps_mem = function() { # nocov start - cmd = sprintf("ps -o rss %s | tail -1", Sys.getpid()) - ans = tryCatch(as.numeric(system(cmd, intern=TRUE, ignore.stderr=TRUE)), warning=function(w) NA_real_, error=function(e) NA_real_) - stopifnot(length(ans)==1L) # extra check if other OSes would not handle 'tail -1' properly for some reason - # returns RSS memory occupied by current R process in MB rounded to 1 decimal places (as in gc), ps already returns KB - c("PS_rss"=round(ans / 1024, 1L)) + cmd = paste0("ps -o rss --no-headers ", Sys.getpid()) # ps returns KB + ans = tryCatch(as.numeric(system(cmd, intern=TRUE)), warning=function(w) NA_real_, error=function(e) NA_real_) + if (length(ans)!=1L || !is.numeric(ans)) ans=NA_real_ # just in case + round(ans / 1024, 1L) # return MB # nocov end } gc_mem = function() { # nocov start - # gc reported memory in MB + # gc reports memory in MB m = apply(gc()[, c(2L, 4L, 6L)], 2L, sum) names(m) = c("GC_used", "GC_gc_trigger", "GC_max_used") m @@ -278,16 +271,19 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no lasttime = get("lasttime", parent.frame()) timings = get("timings", parent.frame()) memtest = get("memtest", parent.frame()) - inittime = get("inittime", parent.frame()) filename = get("filename", parent.frame()) foreign = get("foreign", parent.frame()) showProgress = get("showProgress", parent.frame()) - time = nTest = NULL # to avoid 'no visible binding' note + time = nTest = RSS = NULL # to avoid 'no visible binding' note if (num>0) on.exit( { - now = proc.time()[3L] - took = now-lasttime # so that prep time between tests is attributed to the following test - assign("lasttime", now, parent.frame(), inherits=TRUE) - timings[ as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE ] + took = proc.time()[3L]-lasttime # so that prep time between tests is attributed to the following test + timings[as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE] + if (memtest) { + if (memtest==1L) gc() # see #5515 for before/after + timings[as.integer(num), RSS:=max(ps_mem(),RSS), verbose=FALSE] + if (memtest==2L) gc() + } + assign("lasttime", proc.time()[3L], parent.frame(), inherits=TRUE) # after gc() to exclude gc() time from next test when memtest } ) if (showProgress) # \r can't be in gettextf msg @@ -300,7 +296,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no # not be flushed to the output upon segfault, depending on OS). } else { # not `test.data.table` but developer running tests manually; i.e. `cc(F); test(...)` - memtest = FALSE # nocov + memtest = 0L # nocov filename = NA_character_ # nocov foreign = FALSE # nocov ; assumes users of 'cc(F); test(...)' has LANGUAGE=en showProgress = FALSE # nocov @@ -330,9 +326,6 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no actual$message <<- c(actual$message, conditionMessage(m)) m } - if (memtest) { - timestamp = as.numeric(Sys.time()) # nocov - } if (is.null(output) && is.null(notOutput)) { x = suppressMessages(withCallingHandlers(tryCatch(x, error=eHandler), warning=wHandler, message=mHandler)) # save the overhead of capture.output() since there are a lot of tests, often called in loops @@ -340,10 +333,6 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no } else { out = capture.output(print(x <- suppressMessages(withCallingHandlers(tryCatch(x, error=eHandler), warning=wHandler, message=mHandler)))) } - if (memtest) { - mem = as.list(c(inittime=inittime, filename=basename(filename), timestamp=timestamp, test=num, ps_mem(), gc_mem())) # nocov - fwrite(mem, "memtest.csv", append=TRUE, verbose=FALSE) # nocov - } fail = FALSE if (.test.data.table && num>0) { if (num0) cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") - DTfun = DT # just in dev-mode, DT() gets overwritten in .GlobalEnv by DT objects here in tests.Rraw; we restore DT() in test 2212 } else { require(data.table) # Make symbols to the installed version's ::: so that we can i) test internal-only not-exposed R functions @@ -163,7 +162,8 @@ base_messages = list( ########################## test(1.1, tables(env=new.env()), null.data.table(), output = "No objects of class") -test(1.2, tables(silent=TRUE), data.table(NAME="timings", NROW=9999L, NCOL=3L, MB=0, COLS=list(c("ID","time","nTest")), KEY=list(NULL))) +test(1.2, tables(silent=TRUE)[,.(NAME,NROW,MB)], # memtest=TRUE adds some columns so exclude NCOL and COLS here + data.table(NAME="timings", NROW=9999L, MB=0)) TESTDT = data.table(a=as.integer(c(1,3,4,4,4,4,7)), b=as.integer(c(5,5,6,6,9,9,2)), v=1:7) setkey(TESTDT,a,b) @@ -15325,10 +15325,10 @@ test(2035.3, fread('A,B\n"foo","ba"r"', quote=""), ans) # source() printing edge case; #2369 setup = c('DT = data.table(a = 1)') writeLines(c(setup, 'DT[ , a := 1]'), tmp<-tempfile()) -test(2036.1, !any(grepl("1: 1", capture.output(source(tmp, echo = TRUE)), fixed = TRUE))) +test(2036.1, !any(grepl("1: 1", capture.output(source(tmp, echo=TRUE, local=TRUE)), fixed=TRUE))) # local= #5514 ## test force-printing still works writeLines(c(setup, 'DT[ , a := 1][]'), tmp) -test(2036.2, source(tmp, echo = TRUE), output = "1:\\s+1") +test(2036.2, source(tmp, echo=TRUE, local=TRUE), output="1:\\s+1") # more helpful guidance when assigning before setDT() after readRDS(); #1729 DT = data.table(a = 1:3) @@ -18317,7 +18317,6 @@ for (col in c("a","b","c")) { # DT() functional form, #4872 #5106 #5107 #5129 if (base::getRversion() >= "4.1.0") { # we have to EVAL "|>" here too otherwise this tests.Rraw file won't parse in R<4.1.0 - if (exists("DTfun")) DT=DTfun # just in dev-mode restore DT() in .GlobalEnv as DT object overwrote it in tests above droprn = function(df) { rownames(df)=NULL; df } # TODO: could retain rownames where droprn is currently used below test(2212.011, EVAL("mtcars |> DT(mpg>20, .(mean_hp=round(mean(hp),2)), by=cyl)"), data.frame(cyl=c(6,4), mean_hp=c(110.0, 82.64))) diff --git a/man/test.data.table.Rd b/man/test.data.table.Rd index ba0fe25f9..2df2a3284 100644 --- a/man/test.data.table.Rd +++ b/man/test.data.table.Rd @@ -7,7 +7,8 @@ \usage{ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", silent = FALSE, - showProgress = interactive() && !silent) + showProgress = interactive() && !silent, + memtest = Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0)) } \arguments{ \item{script}{ Run arbitrary R test script. } @@ -15,6 +16,7 @@ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", \item{pkg}{ Root directory name under which all package content (ex: DESCRIPTION, src/, R/, inst/ etc..) resides. Used only in \emph{dev-mode}. } \item{silent}{ Controls what happens if a test fails. Like \code{silent} in \code{\link{try}}, \code{TRUE} causes the error message to be suppressed and \code{FALSE} to be returned, otherwise the error is returned. } \item{showProgress}{ Output 'Running test ...\\r' at the start of each test? } +\item{memtest}{ Measure and report memory usage of tests (1:gc before ps, 2:gc after ps) rather than time taken (0) by default. Intended for and tested on Linux. See PR #5515 for more details. } } \details{ Runs a series of tests. These can be used to see features and examples of usage, too. Running test.data.table will tell you the full location of the test file(s) to open.