require(methods)
if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
if (!identical(suppressWarnings(packageDescription("data.table")), NA)) {
remove.packages("data.table")
stop("This is dev mode but data.table was installed. Uninstalled it. Please q() this R session and try cc() again. The installed namespace causes problems in dev mode for the S4 tests.\n")
}
if ((tt<-compiler::enableJIT(-1))>0)
cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="")
} else {
require(data.table)
# Map symbols to the installed version's ::: so that i) we can test internal-only, non-exported R functions
# in the test suite when the user runs test.data.table() from the installed package AND ii) in dev the same
# tests can be used, but there they test the package in .GlobalEnv. If we used ::: throughout the tests, that
# would pick up the installed version, and in dev you'd have to reinstall every time, which slows down dev.
# NB: The string "data.table::" (which covers "data.table:::" too) should exist nowhere else in this file
# other than here inside this branch.
all.equal.data.table = data.table:::all.equal.data.table
allNA = data.table:::allNA
any_na = data.table:::any_na
as.data.table.array = data.table:::as.data.table.array
as.IDate.default = data.table:::as.IDate.default
as.ITime.default = data.table:::as.ITime.default
binary = data.table:::binary
bmerge = data.table:::bmerge
brackify = data.table:::brackify
Ctest_dt_win_snprintf = data.table:::Ctest_dt_win_snprintf
chmatchdup = data.table:::chmatchdup
compactprint = data.table:::compactprint
cube.data.table = data.table:::cube.data.table
dcast.data.table = data.table:::dcast.data.table
forder = data.table:::forder
forderv = data.table:::forderv
format.data.table = data.table:::format.data.table
getdots = data.table:::getdots
groupingsets.data.table = data.table:::groupingsets.data.table
guess = data.table:::guess
INT = data.table:::INT
is_na = data.table:::is_na
is.sorted = data.table:::is.sorted
isReallyReal = data.table:::isReallyReal
is_utc = data.table:::is_utc
melt.data.table = data.table:::melt.data.table # for test 1953.4
null.data.table = data.table:::null.data.table
print.data.table = data.table:::print.data.table
replace_dot_alias = data.table:::replace_dot_alias
rollup.data.table = data.table:::rollup.data.table
selfrefok = data.table:::selfrefok
setcoalesce = data.table:::setcoalesce
setdiff_ = data.table:::setdiff_
setreordervec = data.table:::setreordervec
shallow = data.table:::shallow # until exported
.shallow = data.table:::.shallow
split.data.table = data.table:::split.data.table
test = data.table:::test
uniqlengths = data.table:::uniqlengths
uniqlist = data.table:::uniqlist
which_ = data.table:::which_
which.first = data.table:::which.first
which.last = data.table:::which.last
`-.IDate` = data.table:::`-.IDate`
# Also, for functions that are masked by other packages, we need to map the data.table one. Or else,
# the other package's function would be picked up. As above, we only need to do this because we desire
# to develop in .GlobalEnv with cc().
# This should be retained even if these packages are removed from Suggests, because the test() calls in this file
# check against a data.table result, which needs the data.table function to run. Outside this file, users can make
# sure themselves by using :: explicitly.
# masked by which package?
# =================================
setattr = data.table::setattr # bit
shift = data.table::shift # IRanges, GenomicRanges
between = data.table::between # plm
second = data.table::second # S4Vectors
dcast = data.table::dcast # reshape2
melt = data.table::melt # reshape2
last = data.table::last # xts
first = data.table::first # xts, S4Vectors
copy = data.table::copy # bit64 v4; bit64 offered to rename though so this is just in case bit64 unoffers
}
# Load optional Suggests packages, which are tested by Travis for code coverage, and on CRAN
# The reason for inclusion here is stated next to each package
sugg = c(
"bit64", # if big integers are detected in file, fread reads them as bit64::integer64 if installed (warning if not)
"xts", # we have xts methods in R/xts.R
"nanotime", # fwrite looks for the 'nanotime' class name at C level (but we have our own writer in C, though)
"R.utils", # for fread to accept .gz and .bz2 files directly
"yaml" # for fread's yaml argument (csvy capability)
# zoo # In DESCRIPTION:Suggests otherwise R CMD check warning: '::' or ':::' import not declared from: 'zoo'; it is tested in other.Rraw though
)
for (s in sugg) {
assign(paste0("test_",s), loaded<-suppressWarnings(suppressMessages(require(s, character.only=TRUE))))
if (!loaded) cat("\n**** Suggested package",s,"is not installed. Tests using it will be skipped.\n\n")
}
test_longdouble = isTRUE(capabilities()["long.double"]) && identical(as.integer(.Machine$longdouble.digits), 64L)
if (!test_longdouble) {
cat("\n**** Full long double accuracy is not available. Tests using this will be skipped.\n\n")
# e.g. under valgrind, longdouble.digits==53; causing these to fail: 1262, 1729.04, 1729.08, 1729.09, 1729.11, 1729.13, 1830.7; #4639
}
##########################
test(1.1, tables(env=new.env()), null.data.table(), output = "No objects of class")
test(1.2, tables(silent=TRUE), data.table(NAME="timings", NROW=9999L, NCOL=3L, MB=0, COLS=list(c("ID","time","nTest")), KEY=list(NULL)))
TESTDT = data.table(a=as.integer(c(1,3,4,4,4,4,7)), b=as.integer(c(5,5,6,6,9,9,2)), v=1:7)
setkey(TESTDT,a,b)
# i.e. a b v
# [1,] 1 5 1
# [2,] 3 5 2
# [3,] 4 6 3
# [4,] 4 6 4
# [5,] 4 9 5
# [6,] 4 9 6
# [7,] 7 2 7
test(2.1, TESTDT[SJ(4,6),v,mult="first"], 3L)
test(2.2, TESTDT[SJ(4,6),v,mult="last"], 4L)
test(3, TESTDT[SJ(c(4,4,4),c(6,6,7)),v,mult="last",roll=TRUE], INT(4,4,4))
test(4, TESTDT[SJ(c(4,4,4),c(9,9,10)),v,mult="last",roll=TRUE], INT(6,6,6))
test(5, TESTDT[SJ(c(4,4,4),c(6,6,7)),v,mult="last",roll=TRUE,rollends=FALSE], INT(4,4,4))
test(6, TESTDT[SJ(c(4,4,4),c(9,9,10)),v,mult="last",roll=TRUE,rollends=FALSE], INT(6,6,NA))
test(7, TESTDT[SJ(c(4,4,4),c(9,9,10)),v,mult="first",roll=TRUE,rollends=FALSE], INT(5,5,NA))
test(8, TESTDT[SJ(c(-9,1,4,4,8),c(1,4,4,10,1)),v], INT(NA,NA,NA,NA,NA))
test(9, TESTDT[SJ(c(-9,1,4,4,8),c(1,4,4,10,1)),v,roll=TRUE], INT(NA,NA,NA,6,NA))
test(10, TESTDT[SJ(c(-9,1,4,4,8),c(1,4,4,10,1)),v,roll=TRUE,rollends=FALSE], INT(NA,NA,NA,NA,NA))
test(11, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="first"], INT(NA,NA,3,3,NA,7,NA))
test(12, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="first",roll=TRUE], INT(NA,1,3,3,6,7,7))
test(13, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="last"], INT(NA,NA,6,6,NA,7,NA))
test(14, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="last",roll=TRUE], INT(NA,1,6,6,6,7,7))
test(15, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="last",nomatch=0], INT(6,6,7))
test(16, TESTDT[SJ(c(4)),v], INT(3,4,5,6))
#test(17, suppressWarnings(TESTDT[SJ(c(4,4)),v,mult="all",incbycols=FALSE][[1]]), INT(3:6,3:6))
test(18.1, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",nomatch=0,by=.EACHI][[2]], INT(3:6))
test(18.2, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",nomatch=NA], INT(NA,NA,3:6,NA))
test(19.1, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",roll=TRUE,nomatch=0], INT(1,3:6,7))
test(19.2, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",roll=TRUE,nomatch=NA], INT(NA,1,3:6,7))
test(20.1, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=0], INT(1,3:6))
test(20.2, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=NA], INT(NA,1,3:6,NA))
test(21.1, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",nomatch=0], INT(1,3:4))
test(21.2, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",nomatch=NA, allow.cartesian=TRUE], INT(NA,1,NA,3:4,NA,NA,NA))
test(22.1, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,nomatch=0], INT(1,3:4,4,6))
test(22.2, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,nomatch=NA, allow.cartesian=TRUE], INT(NA,1,NA,3:4,4,6,NA))
test(23.1, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=0], INT(1,3:4,4))
test(23.2, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=NA,allow.cartesian=TRUE], INT(NA,1,NA,3:4,4,NA,NA))
test(24.1, TESTDT[SJ(c(1,NA,4,NA,NA,4,4),c(5,5,6,6,7,9,10)),v,mult="all",roll=TRUE,nomatch=0], INT(1,3:4,5:6,6))
test(24.2, TESTDT[SJ(c(1,NA,4,NA,NA,4,4),c(5,5,6,6,7,9,10)),v,mult="all",roll=TRUE,nomatch=NA,allow.cartesian=TRUE], INT(NA,NA,NA,1,3:4,5:6,6))
# Note that the NAs get sorted to the beginning by the SJ().
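# A minimal illustration (not a test; assuming the usual J/SJ semantics):
# > SJ(c(4, NA, 1))   # sorted and keyed, NA first: NA, 1, 4
# > J(c(4, NA, 1))    # order preserved as supplied: 4, NA, 1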
# i.e. a b v (same test matrix, repeating here for easier reading of the test cases below)
# [1,] 1 5 1
# [2,] 3 5 2
# [3,] 4 6 3
# [4,] 4 6 4
# [5,] 4 9 5
# [6,] 4 9 6
# [7,] 7 2 7
test(25, TESTDT[SJ(4,6),v,mult="first"], 3L)
test(26, TESTDT[SJ(4,6),v,mult="last"], 4L)
test(27, TESTDT[J(c(4,4,4),c(7,6,6)),v,mult="last",roll=TRUE], INT(4,4,4))
test(28, TESTDT[J(c(4,4,4),c(10,9,9)),v,mult="last",roll=TRUE], INT(6,6,6))
test(29, TESTDT[J(c(4,4,4),c(7,6,6)),v,mult="last",roll=TRUE,rollends=FALSE], INT(4,4,4))
test(30, TESTDT[J(c(4,4,4),c(10,9,9)),v,mult="last",roll=TRUE,rollends=FALSE], INT(NA,6,6))
test(31, TESTDT[J(c(4,4,4),c(10,9,9)),v,mult="first",roll=TRUE,rollends=FALSE], INT(NA,5,5))
test(32, TESTDT[J(c(8,1,4,4,-9),c(1,4,4,10,1)),v], INT(NA,NA,NA,NA,NA))
test(33, TESTDT[J(c(8,1,4,4,-9),c(1,4,4,10,1)),v,roll=TRUE], INT(NA,NA,NA,6,NA))
test(34, TESTDT[J(c(8,1,4,4,-9),c(1,4,7,10,1)),v,roll=TRUE,rollends=FALSE], INT(NA,NA,4,NA,NA))
test(35, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="first"], INT(NA,3,NA,NA,3,7,NA))
test(36, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="first",roll=TRUE], INT(6,3,NA,7,3,7,1))
test(37, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="last"], INT(NA,6,NA,NA,6,7,NA))
test(38, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="last",roll=TRUE], INT(6,6,NA,7,6,7,1))
test(39, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="last",nomatch=0], INT(6,6,7))
test(40, TESTDT[J(c(4)),v,mult="all"], INT(3,4,5,6))
test(41, TESTDT[J(c(4,4)),v,mult="all", allow.cartesian=TRUE], INT(3:6,3:6))
test(42.1, TESTDT[J(c(8,2,4,-3)),v,mult="all",nomatch=0], INT(3:6))
test(42.2, TESTDT[J(c(8,2,4,-3)),v,mult="all",nomatch=NA], INT(NA,NA,3:6,NA))
test(43.1, TESTDT[J(c(8,2,4,-3)),v,mult="all",roll=TRUE,nomatch=0], INT(7,1,3:6))
test(43.2, TESTDT[J(c(8,2,4,-3)),v,mult="all",roll=TRUE,nomatch=NA], INT(7,1,3:6,NA))
#test(44, suppressWarnings(TESTDT[J(c(8,4,2,-3)),v,mult="all",roll=TRUE,rollends=FALSE,incbycols=FALSE]), INT(3:6,1))
test(45.1, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",nomatch=0], INT(1,3:4))
test(45.2, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",nomatch=NA,allow.cartesian=TRUE], INT(NA,1,NA,3:4,NA,NA,NA))
test(46.1, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,nomatch=0], INT(1,3:4,4,6))
test(46.2, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,nomatch=NA,allow.cartesian=TRUE], INT(NA,1,NA,3:4,4,6,NA))
test(47.1, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=0], INT(1,3:4,4))
test(47.2, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=NA,allow.cartesian=TRUE], INT(NA,1,NA,3:4,4,NA,NA))
test(48.1, TESTDT[J(c(-9,NA,4,NA,1,4,4),c(1,5,9,6,5,9,10)),v,mult="all",roll=TRUE,nomatch=0], INT(5:6,1,5:6,6)) # this time the NAs stay where they are. Compare to test 24 above.
test(48.2, TESTDT[J(c(-9,NA,4,NA,1,4,4),c(1,5,9,6,5,9,10)),v,mult="all",roll=TRUE,nomatch=NA,allow.cartesian=TRUE], INT(NA,NA,5:6,NA,1,5:6,6))
test(49.1, TESTDT[J(c(4,1,0,5,3,7,NA,4,1),c(6,5,1,10,5,2,1,6,NA)),v,nomatch=0], INT(3,4,1,2,7,3,4))
test(49.2, TESTDT[J(c(4,1,0,5,3,7,NA,4,1),c(6,5,1,10,5,2,1,6,NA)),v,nomatch=NA,allow.cartesian=TRUE], INT(3,4,1,NA,NA,2,7,NA,3,4,NA))
test(50.1, TESTDT[J(c(4,1,0,5,3,7,NA,4,1),c(6,5,1,10,5,2,1,6,NA)),v,mult="last",nomatch=0], INT(4,1,2,7,4))
test(50.2, TESTDT[J(c(4,1,0,5,3,7,NA,4,1),c(6,5,1,10,5,2,1,6,NA)),v,mult="last",nomatch=NA], INT(4,1,NA,NA,2,7,NA,4,NA))
TESTDT[, a:=letters[a]]
setkey(TESTDT,a,b)
# i.e. a b v
# [1,] a 5 1
# [2,] c 5 2
# [3,] d 6 3
# [4,] d 6 4
# [5,] d 9 5
# [6,] d 9 6
# [7,] g 2 7
test(51.1, TESTDT[SJ(c("d","d","e","g"),c(6,7,1,2)),v,mult="all",roll=TRUE,nomatch=0], INT(3:4,4,7))
test(51.2, TESTDT[SJ(c("d","d","e","g"),c(6,7,1,2)),v,mult="all",roll=TRUE,nomatch=NA], INT(3:4,4,NA,7))
test(52.1, TESTDT[J(c("g","d","e","d"),c(6,6,1,2)),v,mult="all",roll=TRUE,nomatch=0], INT(7,3:4))
test(52.2, TESTDT[J(c("g","d","e","d"),c(6,6,1,2)),v,mult="all",roll=TRUE,nomatch=NA], INT(7,3:4,NA,NA))
TESTDT[, b:=letters[b]]
setkey(TESTDT,a,b)
# i.e.
# a b v
# [1,] a e 1
# [2,] c e 2
# [3,] d f 3
# [4,] d f 4
# [5,] d i 5
# [6,] d i 6
# [7,] g b 7
test(53, TESTDT[SJ(c("d","d","e","g"),c("f","g","a","b")),v,mult="last"], INT(4,NA,NA,7))
test(54, TESTDT[J(c("g","d","e","d"),c("b","g","a","f")),v,mult="last"], INT(7,NA,NA,4)) # this tests (d,g) ok even though there is an NA in last match in the roll.
test(55, TESTDT[SJ(c("d","d","e","g"),c("f","g","a","b")),v,mult="first"], INT(3,NA,NA,7))
test(56, TESTDT[J(c("g","d","e","d"),c("b","g","a","f")),v,mult="first"], INT(7,NA,NA,3))
test(57, TESTDT[J(c("g","d","d","d","e","d"),c("b","g","k","b","a","f")),v,roll=TRUE], INT(7,4,6,NA,NA,3,4))
# test 58 removed. It tested that this failed (rolling join on factors) before character columns were supported; it now works.
test(59, TESTDT[J(c("g","d","d","d","e","d"),c("b","g","k","b","a","f")),v,roll=TRUE,rollends=FALSE], INT(7,4,NA,NA,NA,3,4))
# test 60 removed. It tested that this failed (rolling join on factors) before character columns were supported; it now works.
# Tests 61-66 were testing sortedmatch which is now replaced by chmatch for characters, and removed
# for integers until needed.
# Test 67 removed. No longer use factors so debate/problem avoided.
# [.factor and c.factor are no longer present in data.table, not even hidden away
# X = factor(letters[1:10])
# test(67, levels(X[4:6]), letters[4:6])
test(68, "TESTDT" %in% tables(silent=TRUE)[,NAME]) # NAME is returned as a column in which we look for the string
test(69.1, "TESTDT" %in% tables(silent=TRUE)[,as.character(NAME)]) # an old test (from when NAME was factor) but no harm in keeping it
test(69.2, names(tables(silent=TRUE)), c("NAME","NROW","NCOL","MB","COLS","KEY"))
test(69.3, names(tables(silent=TRUE, mb=FALSE)), c("NAME","NROW","NCOL","COLS","KEY"))
test(69.4, names(tables(silent=TRUE, mb=FALSE, index=TRUE)),
c("NAME", "NROW", "NCOL", "COLS", "KEY", "INDICES"))
xenv = new.env() # to control testing tables()
xenv$DT = data.table(a = 1)
test(69.5, nrow(tables(env=xenv)), 1L, output="NAME NROW NCOL MB COLS KEY\n1: DT 1 1 0 a.*Total: 0MB")
xenv$DT = data.table(A=1:2, B=3:4, C=5:6, D=7:8, E=9:10, F=11:12, G=13:14, H=15:16, key="A,D,F,G")
test(69.6, nrow(tables(env=xenv)), 1L, output="NAME NROW NCOL MB COLS KEY\n1: DT 2 8 0 A,B,C,D,E,F,... A,D,F,G.*Total: 0MB")
rm(xenv)
test(69.7, tables(order.col='asdf'), error="not a column name of info")
a = "d"
# Variable Twister. a in this scope has same name as a inside DT scope.
# Aug 2010 : As a result of bug 1005, and consistency with 'j' and 'by' we now allow self joins (test 183) in 'i'.
test(70, TESTDT[eval(J(a)),v,by=.EACHI], data.table(a="d",v=3:6,key="a")) # the eval() enabled you to use the 'a' in the calling scope, not 'a' in the TESTDT. TO DO: document this.
test(71, TESTDT[eval(SJ(a)),v,by=.EACHI], data.table(a="d",v=3:6,key="a"))
test(72, TESTDT[eval(CJ(a)),v,by=.EACHI], data.table(a="d",v=3:6,key="a"))
test(73, TESTDT[,v], 1:7) # still old behaviour for 1 year. WhenJsymbol option was set to FALSE at the top of this file
test(74, TESTDT[,3], data.table(v=1:7))
test(74.1, TESTDT[,4], error="outside the column number range.*1,ncol=3")
test(74.2, TESTDT[,3L], data.table(v=1:7))
test(74.3, TESTDT[,0], null.data.table())
test(75, TESTDT[,"v"], data.table(v=1:7))
test(76, TESTDT[,2:3], TESTDT[,2:3,with=FALSE])
test(77, TESTDT[,2:3,with=FALSE], data.table(b=c("e","e","f","f","i","i","b"),v=1:7))
test(78, TESTDT[,c("b","v")], data.table(b=c("e","e","f","f","i","i","b"),v=1:7))
colsVar = c("b","v")
test(79.1, TESTDT[,colsVar], error="column name 'colsVar' is not found")
test(79.2, TESTDT[,colsVar,with=FALSE], ans<-data.table(b=c("e","e","f","f","i","i","b"),v=1:7))
test(79.3, TESTDT[, ..colsVar], ans)
# works in test.data.table, but not eval(body(test.data.table)) when in R CMD check ... test(81, TESTDT[1:2,c(a,b)], factor(c("a","c","e","e")))
# The above is expected to be a common source of confusion. c(a,b) is evaluated within
# the frame of TESTDT, and c() creates one vector, not a 2-column subset as in data.frame.
# If 2 columns are required, use list(a,b). c() can be useful too, but it is different.
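# A minimal sketch of the difference (not a test; DTX is a hypothetical table, e.g. DTX = data.table(a=1:2, b=3:4)):
# > DTX[, c(a, b)]      # one vector: 1 2 3 4
# > DTX[, list(a, b)]   # a two-column data.table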
test(82, TESTDT[,c("a","b")], data.table(a=TESTDT[[1]], b=TESTDT[[2]], key=c("a","b")))
test(83, TESTDT[,list("a","b")], data.table(V1="a",V2="b"))
test(83.1, TESTDT[,list("sum(a),sum(b)")], data.table("sum(a),sum(b)"))
test(83.2, TESTDT[,list("sum(a),sum(b)"),by=a], {tt=data.table(a=c("a","c","d","g"),V1="sum(a),sum(b)",key="a");tt$V1=as.character(tt$V1);tt})
test(84, TESTDT[1:2,list(a,b)], data.table(a=c("a","c"), b=c("e","e"), key = 'a,b'))
# test(85, TESTDT[1:2,DT(a,b)], data.table(a=c("a","c"), b=c("e","e"))) #DT() now deprecated
test(86, TESTDT[,sum(v),by="b"], data.table(b=c("e","f","i","b"),V1=INT(3,7,11,7))) # TESTDT is key'd by a,b, so correct that grouping by b should not be key'd in the result by default
test(87, TESTDT[,list(MySum=sum(v)),by="b"], data.table(b=c("e","f","i","b"),MySum=INT(3,7,11,7)))
test(88, TESTDT[,list(MySum=sum(v),Sq=v*v),by="b"][1:3], data.table(b=c("e","e","f"),MySum=INT(3,3,7),Sq=INT(1,4,9))) # silent repetition of MySum to match the v*v vector
# Test 89 dropped. Simplify argument no longer exists. by is now fast and always returns a data.table ... test(89, TESTDT[,sum(v),by="b",simplify=FALSE], list(7L,3L,7L,11L))
# Test 88.5 contributed by Johann Hibschman (for bug fix #1294) :
test(88.5, TESTDT[a=="d",list(MySum=sum(v)),by=list(b)], data.table(b=c("f","i"), MySum=INT(7,11))) # should not retain key because by= is not on head(key(x))
setkey(TESTDT,b)
test(90, TESTDT[J(c("f","i")),sum(v),by=.EACHI], data.table(b=c("f","i"),V1=c(7L,11L),key="b"))
test(90.5, TESTDT[J(c("i","f")),sum(v),by=.EACHI], data.table(b=c("i","f"),V1=c(11L,7L))) # test not keyed
test(91, TESTDT[SJ(c("f","i")),sum(v),by=.EACHI], data.table(b=c("f","i"),V1=c(7L,11L),key="b"))
# Test 92 dropped same reason as 89 ... test(TESTDT[92, J(c("f","i")),sum(v),mult="all",simplify=FALSE], list(7L,11L))
test(93, TESTDT[c("f","i"), which=TRUE], 4:7)
test(94, TESTDT[c("i","f"), mult="last", which=TRUE], INT(7,5))
test(95, TESTDT["f",v], 3:4)
test(96, TESTDT["f",v,by=.EACHI], data.table(b="f",v=3:4,key="b"))
test(97, TESTDT[c("f","i","b"),list(GroupSum=sum(v)),by=.EACHI], data.table(b=c("f","i","b"), GroupSum=c(7L,11L,7L)))
# That line above doesn't create a key on the result, so that the order f,i,b is preserved.
test(98, TESTDT[SJ(c("f","i","b")),list(GroupSum=sum(v)),by=.EACHI], data.table(b=c("b","f","i"), GroupSum=c(7L,7L,11L), key="b"))
# line above is the way to group, sort by group and setkey on the result by group.
dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = "A,B")
test(99, unique(dt, by=key(dt)), data.table(dt[c(1L, 4L, 5L, 7L, 9L, 10L)], key="A,B"))
# test [<- for column assignment
dt1 <- dt2 <- dt
test(100, {dt1[,"A"] <- 3L; dt1}, {dt2$A <- 3L; dt2})
# test transform and within
test(101, within(dt, {D <- B^2}), transform(dt, D = B^2))
test(102, within(dt, {A <- B^2}), transform(dt, A = B^2))
# test .SD object
test(103, dt[, sum(.SD$B), by = "A"], dt[, sum(B), by = "A"])
test(104.1, dt[, transform(.SD, D=min(B)), by="A"], dt[, list(B,C,D=min(B)), by="A"])
# and test transform of existing column to mimic .SD example in transform.Rd mentioned in commit to #1641
test(104.2, dt[, transform(.SD, B=sum(C)), by="A"], data.table(A=rep(1:3,each=4), B=86L, C=21:22, key="A"))
# test numeric and comparison operations on a data table
test(105, all(dt + dt > dt))
test(106, all(dt + dt > 1))
test(107, dt + dt, dt * 2L)
# test a few other generics:
test(108, dt, data.table(t(t(dt)),key="A,B"))
test(109, all(!is.na(dt)))
dt2 <- dt
dt2$A[1] <- NA # removes key
test(110, sum(is.na(dt2)), 1L)
test(111, {setkey(dt,NULL);dt}, na.omit(dt))
test(112, dt2[2:nrow(dt2),A], na.omit(dt2)$A)
# test [<- assignment:
dt2[is.na(dt2)] <- 1L
test(113, {setkey(dt,NULL);dt}, dt2) # key should be dropped because we assigned to a key column
# want to discourage this going forward (inefficient to create RHS like this)
# dt2[, c("A", "B")] <- dt1[, c("A", "B"), with = FALSE]
# test(114, dt1, dt2)
## doesn't work, yet:
## dt2[rep(TRUE, nrow(dt)), c("A", "B")] <- dt1[, c("A", "B"), with = FALSE]
## dt2[rep(TRUE, nrow(dt)), c("A")] <- dt1[, c("A"), with = FALSE]
## test(dt, dt2)) stop("Test 112 failed")
# test the alternate form of setkey:
dt1 = copy(dt)
dt2 = copy(dt)
setkeyv(dt1, "A")
setkey(dt2, A)
test(115, dt1, dt2)
# Test dogroups works correctly for character/factor columns
test(116, TESTDT[,a[1],by="b"], data.table(b=c("b","e","f","i"), V1=c("g","a","d","d"), key="b"))
test(117, TESTDT[,list(a[1],v[1]),by="b"], data.table(b=c("b","e","f","i"), V1=c("g","a","d","d"), V2=INT(7,1,3,5), key="b"))
# We no longer check i for out of bounds, for consistency with data.frame and e.g. cbind(DT[w],DT[w+1]). NA rows should be returned for i>nrow
test(118, TESTDT[8], data.table(a=as.character(NA), b=as.character(NA), v=as.integer(NA), key="b"))
test(119, TESTDT[6:9], data.table(a=c("d","d",NA,NA), b=c("i","i",NA,NA), v=c(5L,6L,NA,NA)))
# Tests of 0 and 1 row tables
TESTDT = data.table(NULL)
test(122, TESTDT[1], TESTDT)
test(123, TESTDT[0], TESTDT)
test(124, TESTDT[1:10], TESTDT)
test(125, TESTDT["k"], error="the columns to join by must be specified using")
# test 126 no longer needed now that test() has 'error' argument
TESTDT = data.table(a=3L,v=2L,key="a") # testing 1-row table
test(127, TESTDT[J(3)], TESTDT)
test(128, TESTDT[J(4)], data.table(a=4L,v=NA_integer_,key="a")) # see tests 206-207 too re the [NA]
test(129, TESTDT[J(4),roll=TRUE], data.table(a=4L,v=2L,key="a")) # the i values are in the result now (which make more sense for rolling joins, the x.a can still be accessed if need be)
test(130, TESTDT[J(4),roll=TRUE,rollends=FALSE], data.table(a=4L,v=NA_integer_,key="a"))
test(131, TESTDT[J(-4),roll=TRUE], data.table(a=-4L,v=NA_integer_,key="a"))
test(132, ncol(TESTDT[0]), 2L)
test(133, TESTDT[0][J(3)], data.table(a=3L,v=NA_integer_,key="a")) # These need to retain key for consistency (edge cases of larger sorted i)
# tests on data table names, make.names is now FALSE by default from v1.8.0
x = 2L; `1x` = 4L
dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x)
test(134, names(dt), c("a.1", "b_1", "1b", "a 1", "x", "V6", "V7"))
dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x, check.names=TRUE)
test(134.5, names(dt), c("a.1", "b_1", "X1b", "a.1.1", "x", "V6", "V7"))
dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x, check.names = FALSE)
test(135, names(dt), c("a.1", "b_1", "1b", "a 1", "x", "V6", "V7")) # the last two terms differ from data.frame()
test(136, dt[,b_1, by="a.1"], data.table(a.1=1L,"b_1"=2L))
test(137, dt[,`a 1`, by="a.1"], data.table(a.1=1L,"a 1"=4L, check.names=FALSE))
test(138, dt[,a.1, by="`a 1`"], data.table(`a 1`=4L,a.1=1L, check.names=FALSE))
# tests with NA's in factors
dt = data.table(a = c(NA, letters[1:5]), b = 1:6)
test(139, dt[,sum(b), by="a"], data.table(a = c(NA, letters[1:5]), V1 = 1:6))
# tests to make sure rbind and grouping keep classes
dt = data.table(a = rep(as.Date("2010-01-01"), 4), b = rep("a",4))
test(140, rbind(dt,dt), data.table(a = rep(as.Date("2010-01-01"), 8), b = rep("a",8)))
test(141, dt[,list(a=a), by="b"], dt[,2:1, with = FALSE])
dt$a <- structure(as.integer(dt$a), class = "Date")
test(142, dt[,list(b=b), by="a"], dt)
dt = data.table(x=1:5,y=6:10)
test(143, tail(dt), dt) # tail was failing if a column name was called x.
dt <- data.table(a = rep(1:3, each = 4), b = LETTERS[1:4], b2 = LETTERS[1:4])
test(144, dt[, .SD[3,], by=b], data.table(b=LETTERS[1:4],a=3L,b2=LETTERS[1:4]))
DT = data.table(x=rep(c("a","b"),c(2,3)),y=1:5)
xx = capture.output(ans <- DT[,{print(x);sum(y)},by=x,verbose=FALSE])
test(145, xx, c("[1] \"a\"","[1] \"b\""))
test(146, ans, data.table(x=c("a","b"),V1=c(3L,12L)))
test(147, DT[,MySum=sum(v)], error="unused argument") # user meant DT[,list(MySum=sum(v))]. FR#204 done.
dt = data.table(a=c(1L,4L,5L), b=1:3, key="a")
test(148, dt[CJ(2:3),roll=TRUE], data.table(a=c(2L,3L),b=c(1L,1L),key="a"))
test(149, dt[J(2:3),roll=TRUE], data.table(a=c(2L,3L),b=c(1L,1L))) # in future this will detect the subset is ordered and retain the key
# 150:158 test out of order factor levels in key columns (now allowed from v1.8.0)
dt = data.table(x=factor(c("c","b","a"),levels=c("b","a","c")), y=1:3)
setkey(dt,x)
test(150, dt["b", y, verbose=TRUE], 2L,
output="Matching character column i.V1 to factor levels in x.x") # changed i.V1 to i.x as per FR #2693
a = data.table(a=rep(1:5, 2), b=factor(letters[rep(1:5, each =2)], levels=letters[5:1]), key="b")
test(151, a[J("b"), a, verbose=TRUE], 3:4,
output="Matching character column i.V1 to factor levels in x.b") # message back to `i.V1` now. 'b' still accessible to satisfy FR #2693, checked on next line
# stretch tests further, two out of order levels, one gets key'd the other not :
a = data.table(x=factor(letters[rep(1:5, each =2)], levels=letters[5:1]),
y=factor(letters[rep(c(6,9,7,10,8), each =2)], levels=letters[10:6]),
z=1:10)
test(152, is.sorted(levels(a$x)), FALSE)
test(153, is.sorted(levels(a$y)), FALSE)
test(154, a[,sum(z),by=x][1,paste(x,V1)], "a 3") # ad hoc by doesn't sort the groups so 'a' (5th level) should be first
setkey(a,x) # 'e' (level 1) should come first now.
test(155, is.sorted(levels(a$x)), FALSE)
test(156, is.sorted(levels(a$y)), FALSE)
test(157, a[,sum(z),by=x][1,paste(x,V1)], "e 19") # 1st level is now first
test(158, a[,sum(z),by=y][1,paste(y,V1)], "h 19") # not 'f'
test(158.5, a[,sum(z),keyby=y][1,paste(y,V1)], "j 15") # not 'f' either
# tests of by expression variables
DT = data.table( a=1:5, b=11:50, d=c("A","B","C","D"), f=1:5, grp=1:5 )
f = quote( list(d) )
test(159, DT[,mean(b),by=eval(f)], DT[,mean(b),by=list(d)]) # column f doesn't get in the way of expression f
foo = function( grp ) {
DT[,mean(b),by=eval(grp)]
}
test(160, foo(quote(list(d))), DT[,mean(b),by=list(d)])
test(161, foo(quote(list(d,a))), DT[,mean(b),by=list(d,a)])
test(162, foo(quote(list(f))), DT[,mean(b),by=list(f)])
test(163, foo(quote(list(grp))), DT[,mean(b),by=list(grp)]) # grp local variable in foo doesn't conflict with column grp
test(164, foo(f), DT[,mean(b),by=d])
# checks that data.table inherits methods from data.frame in base ok
test(165, subset(DT,a>2), DT[a>2])
test(166, suppressWarnings(split(DT,DT$grp)[[2]]), DT[grp==2])
# and that plotting works
test(167.1, DT[,plot(b,f)], NULL)
test(167.2, as.integer(DT[,hist(b)]$breaks), seq.int(10L,50L,by=5L)) # as.integer needed for R 3.1.0
test(167.3, DT[,plot(b,f),by=.(grp)], data.table(grp=integer()))
try(graphics.off(),silent=TRUE)
# IDateTime conversion methods that ggplot2 uses (it calls as.data.frame method)
# Since %b is e.g. "nov." in LC_TIME=fr_FR.UTF-8 locale, we need to
# have the target/y value in these tests depend on the locale as well, #3450.
NOV = format(strptime("2000-11-01", "%Y-%m-%d"), "%b")
x = c("09:29:16","10:42:40","23:47:12","01:06:01","11:35:34","11:51:09")
datetimes = paste0("2011 ", NOV, c(18,18,18,19,19,19), " ", x)
DT = IDateTime(strptime(datetimes,"%Y %b%d %H:%M:%S"))
test(168.1, DT[,as.data.frame(itime)], data.frame(V1=as.ITime(x)))
test(168.2, as.character(DT[,as.POSIXct(itime,tz="UTC")]), paste(Sys.Date(), x))
test(168.3, as.character(DT[,as.POSIXct(idate,tz="UTC")]), c("2011-11-18","2011-11-18","2011-11-18","2011-11-19","2011-11-19","2011-11-19"))
# test of . in formula, using inheritance
DT = data.table(y=1:100,x=101:200,y=201:300,grp=1:5)
test(169,DT[,as.list(lm(y~0+.,.SD)$coefficients),by=grp][2,x]-2<1e-10, TRUE)
DT <- data.table( a=1:4, d=c("A","B","C","D") )
g <- quote( list( d ) )
test(170, DT[,list(d)], DT[,eval(g)])
DT = data.table(A=c(25L,85L,25L,25L,85L), B=c("a","a","b","c","c"), C=c(2,65,9,82,823))
test(171.1, DT[B=="b"][A==85], output="Empty data.table (0 rows and 3 cols): A,B,C")
test(171.2, DT[B=="b"][A==85,C], numeric())
test(171.3, DT[ , data.table( A, C )[ A==25, C ] + data.table( A, C )[ A==85, C ], by=B ], data.table(B=c("a","c"),V1=c(67,905)))
test(172, DT[ , list(3,data.table( A, C )[ A==25, C ] + data.table( A, C )[ A==85, C ]), by=B ], data.table(B=c("a","b","c"),V1=3,V2=c(67,NA,905)))
# Test growing result in memory (growVector). Usually the guess is good though.
DT = data.table(A=c(1L,1L,2L,2L,3L,3L), B=1:6)
# no rows for first group so guess for up-front allocate needs a reallocate ...
test(173.1, DT[,B[B>3],by=A][,V1], c(4L,5L,6L))
# cover raw type with the 3rd group result being unexpectedly larger ...
test(173.2, DT[, if(.GRP==3) as.raw(3:5) else as.raw(.GRP), by=A],
data.table(A=INT(1,2,3,3,3), V1=as.raw(1:5)))
# 2nd group returns more than the number of the rows in that group, and cover list type too ...
test(173.3, DT[, .(if(.GRP==2) list(3:5, "b", 0:1, mean) else list("a",1:2)), by=A],
data.table(A=INT(1,1,2,2,2,2,3,3), V1=list("a",1:2,3:5,"b",0:1,mean,"a",1:2)))
# Example taken from Harish post to datatable-help on 11 July
DT <- data.table(
A=c("a","a","b","b","d","c","a","d"),
B=c("x1","x2","x2","x1","x2","x1","x1","x2"),
C=c(5,2,3,4,9,5,1,9)
)
test(174, DT[,C[C-min(C)<3],by=list(A,B)][,V1], c(1,2,3,4,9,9,5))
test(175, DT[,C[C-min(C)<5],by=list(A,B)][,V1], c(5,1,2,3,4,9,9,5))
# Tests of data.table sub-assignments: $<-.data.table & [<-.data.table
DT = data.table(a = c("A", "Z"), b = 1:10, key = "a")
DT[J("A"),2] <- 100L # without L generates nice warning :-)
DT[J("A"),"b"] <- 1:5
DT[1:3,"b"] <- 33L
test(176, DT, data.table(a = rep(c("A", "Z"), each = 5),
b = as.integer(c(rep(33, 3), 4:5, seq(2, 10, by = 2))),
key = "a"))
DT[J("A"),"a"] <- "Z"
test(177, DT, data.table(a="Z", b=as.integer(c(rep(33, 3), 4:5, seq(2, 10, by = 2))))) # i.e. key dropped and column a still factor
DT <- data.table(a = c("A", "Z"), b = 1:10, key = "a")
DT$b[1:5] <- 1:5
DT$b[1:3] <- 33
test(178, DT, data.table(a = rep(c("A", "Z"), each = 5),
b = c(rep(33, 3), 4:5, seq(2, 10, by = 2)),
key = "a"))
DT$a <- 10:1
test(179, key(DT), NULL )
# Test logical in a key
DT = data.table(a=rep(1:3,each=2),b=c(TRUE,FALSE),v=1:6)
setkey(DT,a,b)
test(180, DT[J(2,FALSE),v], 4L)
test(181, DT[,sum(v),by=b][,V1], c(12L,9L))
# Test fix for bug 1026 reported by Harish V
# this test needed a unique var name to generate error 'object 'b' not found'.
# Otherwise it finds 'b' in local scope.
setnames(DT,2,"buniquename314")
bar = function( data, fcn ) {
q = substitute( fcn )
xx = data[,eval(q),by=a]
yy = data[,eval(substitute(fcn)),by=a]
identical(xx,yy)
}
test(182, bar( DT, sum(buniquename314) ), TRUE)
# Test bug 1005 reported by Branson Owen
DT = data.table(A = c("o", "x"), B = 1:10, key = "A")
test(183, DT[J(unique(A)), B], DT$B)
# Test bug 709 which returned an error here. And return type now empty table, #1945 in 1.8.1.
xx = data.table(a=1:5,b=6:10)
test(184, xx[a>6,sum(b),by=a], data.table(a=integer(),V1=integer()))
# Tests of bug 1015 highlight by Harish
# See thread "'by without by' now heeds nomatch=NA"
# Tests 185-201 were added above, next to the originals
x <- data.table(a=c("a","b","d","e"),b=c("A","A","B","B"),d=c(1,2,3,4), key="a,b")
y <- data.table(g=c("a","b","c","d"),h=c("A","A","A","A"))
test(202, x[y], x[y,mult="all"])
test(203, x[y,d], c(1,2,NA,NA))
test(204, x[y,list(d)]$d, x[y,d])
test(205, x[y,list(d),mult="all"][,d], c(1,2,NA,NA))
# Test [NA] returns one NA row. NA is type *logical* so prior to
# change in v1.5, NA would get silently recycled and the whole table would
# be returned all NA (rarely useful and often confusing, but consistent
# with data.frame).
TESTDT = data.table(a=1:3,v=1:3,key="a")
test(206, TESTDT[NA], data.table(a=NA_integer_,v=NA_integer_,key="a")) # NA are now allowed in keys, so retains key
# TESTDT[NA] is expected to return a row of NA since nobody remembers that NA is different to NA_integer_
# Then user tries TESTDT[c(1,NA,2)] and it feels consistent to them since they see that row of NA in the middle
# But only the NA symbol is caught and replaced with NA_integer_, for this convenience.
# Otherwise logical expressions returning a single NA logical will still return empty, for consistency, #1252.
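# A minimal illustration of the convenience described above (not a test):
# > TESTDT[NA]           # one row of NAs (the NA symbol is caught and treated as NA_integer_)
# > TESTDT[c(1,NA,2)]    # rows 1 and 2 with a row of NAs in the middle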
setkey(TESTDT,NULL)
test(207, TESTDT[NA], data.table(a=NA_integer_,v=NA_integer_))
# With inheritance, NROW and NCOL in base work nicely. No need for them in data.table.
test(208, NROW(TESTDT), 3L)
test(209, nrow(TESTDT), 3L)
test(210, NCOL(TESTDT), 2L)
test(211, ncol(TESTDT), 2L)
# Test infinite recursion error is trapped when a pre-1.5 data.table
# is used with 1.5 (bug #1008)
DT = data.table(a=1:6,key="a")
test(212, DT[J(3)]$a, 3L) # correct class c("data.table","data.frame")
class(DT) = "data.table" # incorrect class, but as from 1.8.1 it works. By accident when moving from colnames() to names(), it was dimnames() doing the check, but rather than add a check that identical(class(DT),c("data.frame","data.table")) at the top of [.data.table, we'll leave it flexible to user (user might not want to inherit from data.frame for some reason).
test(213, DT[J(3)]$a, 3L)
# setkey now auto coerces double and character for convenience, and
# to solve bug #953
DF = data.frame(a=LETTERS[1:10], b=1:10, stringsAsFactors=FALSE)
DT = data.table(DF)
setkey(DT,a) # used to complain about character
test(215, DT["C",b], 3L)
DT = data.table(DF,key="a")
test(216, DT["C",b], 3L)
DT = data.table(a=c(1,2,3),v=1:3,key="a")
test(217, DT[J(2),v], 2L)
DT = data.table(a=c(1,2.1,3),v=1:3,key="a")
test(218, DT[J(2.1),v], 2L)
# tests of quote()-ed expressions in i. Bug #1058
DT = data.table(a=1:5,b=6:10,key="a")
q = quote(a>3)
test(220, DT[eval(q),b], 9:10)
test(221, DT[eval(parse(text="a>4")),b], 10L)
test(222, DT[eval(parse(text="J(2)")),b], 7L)
# lists in calling scope should be ok as single names passed to by, bug #1060
DT = data.table(a=1:2,b=rnorm(10))
byfact = DT[,a] # vector, ok before fix but check anyway
test(223, DT[,mean(b),by=byfact], DT[,mean(b),by=list(byfact)])
byfact = DT[,list(a)] # this caused next line to fail before fix
test(224, DT[,mean(b),by=byfact], DT[,mean(b),by=as.list(byfact)])
test(225, DT[,mean(b),by=byfact], DT[,mean(b),by={byfact}])
# tests for building expressions via parse, bug #1243
dt1key<-data.table(A1=1:100,onekey=rep(1:2,each=50))
setkey(dt1key,onekey)
ASumExpr<-parse(text="quote(sum(A1))") # no need for quote but we test it anyway because that was work around when test 227 failed
ASumExprNoQ<-parse(text="sum(A1)")
ans = dt1key[,sum(A1),by=onekey]
test(226,ans,dt1key[,eval(eval(ASumExpr)),by=onekey])
test(227,ans,dt1key[,eval(ASumExprNoQ),by=onekey])
# test for uncommon grouping pattern on 1-row data.table, bug #1245
DT = data.table(a=1L,b=2L)
test(228,DT[,list(1:2),by=a],data.table(a=c(1L,1L),V1=1:2))
# special case j=.SD, bug #1247
DT = data.table(a=rep(1:2,each=2),b=1:4)
test(229,DT[,.SD,by=a],DT)
setkey(DT,a)
test(229.1,DT[,.SD,by=key(DT)],DT)
# merge bug with column 'x', bug #1229
d1 <- data.table(x=c(1,3,8),y1=rnorm(3), key="x")
d2 <- data.table(x=c(3,8,10),y2=rnorm(3), key="x")
ans1=merge(d1, d2, by="x")
ans2=cbind(d1[2:3],y2=d2[1:2]$y2);setkey(ans2,x)
test(230, ans1, ans2)
# one column merge, bug #1241
DT = data.table(a=rep(1:2,each=3),b=1:6,key="a")
y = data.table(a=c(0,1),bb=c(10,11),key="a")
test(231,merge(y,DT),data.table(a=1L,bb=11,b=1:3,key="a"))
test(232,merge(y,DT,all=TRUE),data.table(a=rep(c(0L,1L,2L),c(1,3,3)),bb=rep(c(10,11,NA_real_),c(1,3,3)),b=c(NA_integer_,1:6),key="a"))
y = data.table(a=c(0,1),key="a") # y with only a key column
test(233,merge(y,DT),data.table(a=1L,b=1:3,key="a"))
test(234,merge(y,DT,all=TRUE),data.table(a=rep(c(0L,1L,2L),c(1,3,3)),b=c(NA_integer_,1:6),key="a"))
# 'by' when DT contains list columns
DT = data.table(a=c(1,1,2,3,3),key="a")
DT$b=list(1:2,1:3,1:4,1:5,1:6)
test(235,DT[,mean(unlist(b)),by=a],data.table(a=c(1,2,3),V1=c(1.8,2.5,mean(c(1:5,1:6))),key="a"))
test(236,DT[,sapply(b,mean),by=a],data.table(a=c(1,1,2,3,3),V1=c(1.5,2.0,2.5,3.0,3.5),key="a"))
# when i is a single name, it no longer evaluates within data.table scope
DT = data.table(a=1:5,b=rnorm(5),key="a")
a = list(4)
test(237,DT[a],DT[J(4)])
# repeat earlier test with xkey instead of x. xkey is internal to merge; the bigger problem Tom mentioned.
d1 <- data.table(xkey=c(1,3,8),y1=rnorm(3), key="xkey")
d2 <- data.table(xkey=c(3,8,10),y2=rnorm(3), key="xkey")
ans2=cbind(d1[2:3],y2=d2[1:2]$y2);setkey(ans2,xkey)
test(238, merge(d1, d2, by="xkey"), ans2)
# Join Inherited Scope, and X[Y] including Y's non-join columns
X=data.table(a=rep(1:3,c(3,3,2)),foo=1:8,key="a")
Y=data.table(a=2:3,bar=6:7)
test(239, X[Y,sum(foo),by=.EACHI], data.table(a=2:3,V1=c(15L,15L),key="a"))
test(240, X[Y,sum(foo*bar),by=.EACHI], data.table(a=2:3,V1=c(90L,105L),key="a"))
test(241, X[Y], data.table(a=rep(2:3,3:2),foo=4:8,bar=rep(6:7,3:2),key="a"))
test(242, X[Y,list(foo,bar),by=.EACHI][,sum(foo*bar)], 195L)
test(243, X[Y][,sum(foo*bar)], 195L)
# not sure about these yet :
# test(244, X[Y,sum(foo*bar),mult="first"], data.table(a=2:3,V1=c(24L,49L)))
# test(245, X[Y,sum(foo*bar),mult="last"], data.table(a=2:3,V1=c(36L,56L)))
# joining to fewer than all of X's key columns (in examples but can't see a formal test)
X=data.table(a=rep(LETTERS[1:2],2:3),b=1:5,v=10:14,key="a,b")
test(246.1, X["A"], X[1:2]) # checks that X[1:2] retains key, too
test(246.2, key(X["A"]), c("a","b"))
test(247, X["C"]$v, NA_integer_)
test(248, nrow(X["C",nomatch=0]), 0L)
x=data.table( a=c("a","b","c"), b=1:3, key="a" )
y=data.table( a=c("b","d","e"), d=c(8,9,10) )
test(249, x[y], data.table(a=c("b","d","e"),b=c(2L,NA,NA),d=c(8,9,10))) # keeps i join cols
test(250, x[y,mult="first"], data.table(a=c("b","d","e"),b=c(2L,NA,NA),d=c(8,9,10))) # same
x=data.table( a=c("a","b","b","c"), b=1:4, key="a" )
y=data.table(a=c("b","d","b"), d=c(8,9,10))
test(251, x[y, allow.cartesian=TRUE], data.table(a=c("b","b","d","b","b"),b=c(2:3,NA,2:3),d=c(8,8,9,10,10)))
# auto coerce float to int in ad hoc by (just like setkey), FR#1051
DT = data.table(a=INT(1,1,1,2,2),v=1:5)
test(252, DT[,sum(v),by=a], data.table(a=1:2,V1=c(6L,9L)))
# check that by retains factor columns, since character is now default
DT = data.table(a=factor(c("A","A","A","B","B")),v=1:5)
test(253, DT[,sum(v),by=a], data.table(a=factor(c("A","B")),V1=c(6L,9L)))
# fix for bug #1298 with by=key(DT) and divisibility error.
DT=data.table(a=c(1,1,1,2,2),b=1:5,key="a")
test(254, DT[,sum(b),by=key(DT)]$V1, c(6L,9L))
# fix for bug #1294 (combining scanning i and by)
# also see test 88.5 contributed by Johann Hibschman above.
DT = data.table(a=1:12,b=1:2,c=1:4)
test(255, DT[a>5,sum(c),by=b]$V1, c(12L, 7L))
# fix for bug #1301 (all.vars() doesn't appear to find fn in fns[[fn]] usage)
DT = data.table(a=1:6,b=1:2,c=letters[1:2],d=1:6)
fns = list(a=max,b=min)
test(256, DT[,fns[[b[1]]](d),by=c]$V1, c(5L,2L))
test(257, DT[,fns[[c[1]]](d),by=c]$V1, c(5L,2L))
fns=c(max,min)
DT = data.table(ID=1:10, SCORE_1=1:10, SCORE_2=11:20, SCORE_3=30:21, fn=c(rep(1, 5), rep(2, 5)))
test(258, DT[,fns[[fn]](SCORE_1,SCORE_2,SCORE_3),by=ID]$V1, c(30:26,6:10))
test(259, DT[,as.list(fns[[fn]](SCORE_1,SCORE_2,SCORE_3)),by=ID]$V1, c(30:26,6:10))
test(260, DT[,list(fns[[fn]](SCORE_1,SCORE_2,SCORE_3)),by=ID]$V1, c(30:26,6:10))
# fix for bug #1340 - Duplicate column names in self-joins (but print ok)
DT <- data.table(id=1:4, x1=c("a","a","b","c"), x2=c(1L,2L,3L,3L), key="x1")
test(261, DT[DT, allow.cartesian=TRUE][id < i.id]$i.x2, 2L)
# "<-" within j now assigns in the same environment for 1st group, as the rest
# Thanks to Andreas Borg for highlighting on 11 May
dt <- data.table(x=c(0,0,1,0,1,1), y=c(0,1,0,1,0,1), z=1:6)
groupInd = 0
test(262, dt[,list(z,groupInd<-groupInd+1),by=list(x,y)]$V2, c(1,2,2,3,3,4))
test(263, groupInd, 0)
test(264, dt[,list(z,groupInd<<-groupInd+1),by=list(x,y)]$V2, c(1,2,2,3,3,4))
test(265, groupInd, 4)
# Tests for passing 'by' expressions that evaluate to character column
# names in the edge case of 1 row; the character 'by' vector could
# feasibly be intended to be grouping values. Bug 1404; thanks to Andreas Borg
# for the detailed report, suggested fix and tests.
DT = data.frame(x=1,y="a",stringsAsFactors=FALSE)
DT = as.data.table(DT)
test(266,class(DT$y),"character") # just to check we setup the test correctly
test(267,DT[,sum(x),by=y]$V1,1)
test(268,DT[,sum(x),by="y"]$V1,1)
colvars="y"
test(269,DT[,sum(x),by=colvars]$V1,1)
setkey(DT,y)
test(270,DT[,sum(x),by=key(DT)]$V1,1)
DT = data.table(x=1,y=2)
setkeyv(DT,names(DT))
test(271, DT[,length(x),by=key(DT)]$V1, 1L)
DT = data.table(x=c(1,2,1), y=c(2,3,2), z=1:3)
setkeyv(DT,names(DT))
test(272, DT[,sum(z),by=key(DT)]$V1, c(1L,3L,2L))
# Tests for .BY and implicit .BY
# .BY is a single row, and by variables are now, too. FAQ 2.10 has been changed accordingly.
DT = data.table(a=1:6,b=1:2)
test(273, DT[,sum(a)*b,by=b]$V1, c(9L,24L))
test(274, DT[,sum(a)*.BY[[1]],by=b], data.table(b=1:2,V1=c(9L,24L)))
test(275, DT[,sum(a)*bcalc,by=list(bcalc=b+1L)], data.table(bcalc=2:3,V1=c(18L,36L)))
test(276, DT[,sapply(.SD,sum)*b,by=b], data.table(b=1:2,V1=c(9L,24L))) # .SD should no longer include b, unlike v1.6 and before
test(277, DT[,sapply(.SD,sum)*bcalc,by=list(bcalc=b+1L)], data.table(bcalc=2:3,V1=c(18L,36L))) # cols used in by expressions are excluded from .SD, but can still be used in j (by name only and may vary within the group, e.g. DT[,max(diff(date)),by=month(date)])
test(278, DT[,sum(a*b),by=list(bcalc=b+1L)], data.table(bcalc=2:3,V1=c(9L,24L)))
# Test x==y where either column contain NA.
DT = data.table(x=c(1,2,NA,3,4),y=c(0,2,3,NA,4),z=1:5)
test(279, DT[x==y,sum(z)], 7L)
# In data.frame the equivalent is :
# > DF = as.data.frame(DT)
# > DF[DF$x==DF$y,]
# x y z
# 2 2 2 2
# NA NA NA NA
# NA.1 NA NA NA
# 5 4 4 5
# > DF[!is.na(DF$x) & !is.na(DF$y) & DF$x==DF$y,]
# x y z
# 2 2 2 2
# 5 4 4 5
# Test that 0 length columns are expanded with NA to match non-0 length columns, bug fix #1431
DT = data.table(pool = c(1L, 1L, 2L), bal = c(10, 20, 30))
test(280, DT[, list(bal[0], bal[1]), by=pool], data.table(pool=1:2, V1=NA_real_, V2=c(10,30)))
test(281, DT[, list(bal[1], bal[0]), by=pool], data.table(pool=1:2, V1=c(10,30), V2=NA_real_))
# Test 2nd group too (the 1st is special) ...
test(282, DT[, list(bal[ifelse(pool==1,1,0)], bal[1]), by=pool], data.table(pool=1:2, V1=c(10,NA), V2=c(10,30)))
# More tests based on Andreas Borg's post of 11 May 2011.
DT = data.table(x=INT(0,0,1,0,1,1), y=INT(1,1,0,1,1,1), z=1:6)
ans = data.table(x=c(0L,1L,1L),y=c(1L,0L,1L),V1=c(1L,1L,2L),V2=c(7L,3L,11L))
test(283, DT[,list(sum(x[1], y[1]),sum(z)), by=list(x,y)], ans)
test(284, DT[,list(sum(unlist(.BY)),sum(z)),by=list(x,y)], ans)
groupCols = c("x", "y")
test(285, DT[,list(sum(unlist(.BY)),sum(z)),by=groupCols], ans)
groupExpr = quote(list(x,y))
test(286, DT[,list(sum(unlist(.BY)),sum(z)),by=groupExpr], ans)
# Bug fix from Damian B on 25 June 2011 :
DT = data.table(X=c(NA,1,2,3), Y=c(NA,2,1,3))
setkeyv(DT,c("X","Y"))
test(287, unique(DT, by=key(DT)), DT)
# Bug fix #1421: using vars in calling scope in j when i is logical or integer.
DT = data.table(A=c("a","b","b"),B=c(4,5,NA))
myvar = 6
test(288, DT[A=="b",B*myvar], c(30,NA))
# Test new feature in 1.6.1 that i can be plain list (such as .BY)
DT = data.table(grp=c("a","a","a","a","b","b","b"),v=1:7)
mysinglelookup = data.table(grp=c("a","b"),s=c(42,84),grpname=c("California","New York"),key="grp")
setkey(mysinglelookup,grp)
test(289, DT[,sum(v*mysinglelookup[.BY]$s),by=grp], data.table(grp=c("a","b"),V1=c(420,1512)))
# In v1.6.2 we will change this so that a single-name j returns a vector, regardless of grouping
test(290, DT[,list(mysinglelookup[.BY]$grpname,sum(v)),by=grp], data.table(grp=c("a","b"),V1=c("California","New York"),V2=c(10L,18L)))
# Test user defined attributes are retained, see comment in FR#1006
DT = data.table(a=as.numeric(1:2),b=3:4)
setattr(DT,"myuserattr",42)
setkey(DT,a) # a is numeric so a change of type to integer occurs, too, via := which checks selfref is ok
test(291, attr(DT,"myuserattr"), 42)
# Test new .N symbol
DT = data.table(a=INT(1,1,1,1,2,2,2),b=INT(3,3,3,4,4,4,4))
test(292, DT[,.N,by=list(a,b)], data.table(a=c(1L,1L,2L),b=c(3L,4L,4L),N=c(3L,1L,3L)))
test(293, DT[,list(a+b,.N),by=list(a,b)], data.table(a=c(1L,1L,2L),b=c(3L,4L,4L),V1=4:6,N=c(3L,1L,3L)))
# Test that setkey and := syntax really are by reference, even within functions. You
# really do need to take a copy first to a new name; force(x) isn't enough.
DT = data.table(a=1:3,b=4:6)
f = function(x){ force(x)
setkey(x) }
f(DT)
test(294,key(DT),c("a","b")) # The setkey didn't copy to a local variable. Need to copy first to local variable (with a new name) if required.
f = function(x){ force(x)
x[,a:=42L] }
f(DT)
test(295,DT,data.table(a=42L,b=4:6)) # := was by reference (fast) and dropped the key, too, because assigned to key column
DT = data.table(a=1:3,b=4:6)
f = function(x){ x = copy(x)
setkey(x) }
f(DT)
test(295.1,key(DT),NULL)
setkey(DT,a)
f = function(x){ x = copy(x)
x[,b:=10:12][J(2),b] } # test copy retains key
test(295.2,f(DT),11L)
test(295.3,DT,data.table(a=1:3,b=4:6,key="a")) # The := was on the local copy
# new feature added 1.6.3, that key can be vector.
test(296,data.table(a=1:3,b=4:6,key="a,b"),data.table(a=1:3,b=4:6,key=c("a","b")))
# test .SDcols (not speed, just operation)
DT = data.table(grp=1:3,A1=1:9,A2=10:18,A3=19:27,B1=101:109,B2=110:118,B3=119:127,key="grp")
test(297,DT[,list(A1=sum(A1),A2=sum(A2),A3=sum(A3)),by=grp], DT[,lapply(.SD,sum),by=grp,.SDcols=2:4])
DT = data.table(a=1:3,b=4:6)
test(298, {DT$b<-NULL;DT}, data.table(a=1:3)) # delete column
test(299.01, {DT$c<-as.character(DT$c);DT}, data.table(a=1:3, c=NA_character_)) # Column c is missing, so DT$c is NULL.
test(299.02, DT[,c:=""], data.table(a=1:3,c=""))
test(299.03, truelength(DT)>length(DT)) # the := over-allocated, by 100 by default, but user may have changed default so just check '>'
# FR #2551 - old 299.3 and 299.5 are changed to include length(RHS) > 1 to issue the warning
DT[,c:=rep(42L,.N)] # plonk
test(299.04, DT, data.table(a=1:3, c=42L))
test(299.05, DT[2:3,c:=c(43, 44)], data.table(a=1:3,c=42:44))
# FR #2551 - length(RHS) = 1 - no warning for type conversion
test(299.06, DT[2,c:=42], data.table(a=1:3,c=INT(42,42,44)))
# also see tests 302 and 303. (Ok, new test file for fast assign would be tidier).
test(299.07, DT[,c:=rep(FALSE,nrow(DT))], data.table(a=1:3,c=FALSE)) # replace c column with logical
test(299.08, DT[2:3,c:=c(3.14,0)], data.table(a=1:3, c=c(FALSE,TRUE,FALSE)), warning="3.14.*double.*at RHS position 1 taken as TRUE.*logical.*column 2 named 'c'")
test(299.09, DT[2:3,c:=c(0,1)], data.table(a=1:3,c=c(FALSE,FALSE,TRUE))) # no warning
# FR #2551 is now changed to fit in / fix bug #35. Stricter warnings are in place now. Check tests 1294.1-34 below.
test(299.10, DT[2,c:=42], data.table(a=1:3, c=c(FALSE,TRUE,TRUE)), warning="42.0.*double.*at RHS position 1.*TRUE")
test(299.11, DT[1,c:=42L], data.table(a=1:3, c=TRUE), warning="42.*integer.*at RHS position 1.*TRUE.*logical.*column 2 named 'c'")
test(299.12, DT[2:3,c:=c(0L, 0L)], data.table(a=1:3,c=c(TRUE,FALSE,FALSE)))
# Test bug fix #1468, combining i and by.
DT = data.table(a=1:3,b=1:9,v=1:9,key="a,b")
test(300, DT[J(1),sum(v),by=b], data.table(b=c(1L,4L,7L),V1=c(1L,4L,7L))) # should not retain key because by= is not on head(key(DT))
test(300.1, DT[J(1:2),sum(v),by=b], data.table(b=c(1L,4L,7L,2L,5L,8L),V1=c(1L,4L,7L,2L,5L,8L)))
# Test ad hoc by of more than 100,000 levels, see 2nd part of bug #1387 (100,000 from the limit of base::sort.list radix)
# This does need to be this large, even in CRAN checks, because sort.list(method="radix") has this limit, which
# is what this tests. But it's well under 10 seconds.
DT = data.table(A=1:10,B=rnorm(10),C=factor(paste("a",1:100010,sep="")))
test(301, nrow(DT[,sum(B),by=C])==100010)
DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep=""))
test(301.1, nrow(DT[,sum(B),by=C])==100010)
# Test fast assign
DT = data.table(a=c(1L,2L,2L,3L),b=4:7,key="a")
DT[2,b:=42L] # needs to be on its own line to test DT symbol is changed by reference
test(302, DT, data.table(a=c(1L,2L,2L,3L),b=c(4L,42L,6L,7L),key="a"))
DT[J(2),b:=84L]
test(303, DT, data.table(a=c(1L,2L,2L,3L),b=c(4L,84L,84L,7L),key="a"))
# Test 304 was testing compatibility with package:plyr. Moved to the ggplot2 block above, which will be moved to a separate test package.
# Test that changing colnames keep key in sync.
# TO DO: will have to do this for secondary keys, too, when implemented.
DT = data.table(x=1:10,y=1:10,key="x")
setnames(DT,c("a","b"))
test(305, key(DT), "a")
setnames(DT,"a","R")
test(306, key(DT), "R")
setnames(DT,"b","S")
test(307, key(DT), "R")
setnames(DT,c("a","b"))
test(308, key(DT), "a")
setnames(DT,1,"R")
test(309, key(DT), "R")
# Test :=NULL
DT = data.table(x=1:5,y=6:10,z=11:15,key="y")
test(310, DT[,x:=NULL], data.table(y=6:10,z=11:15,key="y")) # delete first
test(311, DT[,y:=NULL], data.table(z=11:15)) # deleting key column also removes key
test(312, DT[,z:=NULL], data.table(NULL)) # deleting all
test(313, DT[,a:=1:3], data.table(a=1:3)) # test changed in 1.12.2; can now add a column to a null (0-column) data.table
DT = data.table(a=20:22)
test(314, {DT[,b:=23:25];DT[,c:=26:28]}, data.table(a=20:22,b=23:25,c=26:28)) # add in series
test(315, DT[,c:=NULL], data.table(a=20:22,b=23:25)) # delete last
test(316, DT[,c:=NULL], data.table(a=20:22,b=23:25), warning="Column 'c' does not exist to remove")
# Test adding, removing and updating columns via [<- in one step
DT = data.table(a=1:6,b=1:6,c=1:6)
DT[,c("a","c","d","e")] <- list(NULL,11:16,42L,21:26)
test(317, DT, data.table(b=1:6,c=11:16,d=42L,e=21:26))
# Other assignments (covers DT[x==2, y:=5] too, #1502)
DT[e<24,"b"] <- 99L
test(318, DT, data.table(b=c(99L,99L,99L,4L,5L,6L),c=11:16,d=42L,e=21:26))
test(319, DT[b!=99L,b:=99L], data.table(b=99L,c=11:16,d=42L,e=21:26))
# previous within functionality restored, #1498
DT = data.table(a=1:10)
test(320, within(DT, {b <- 1:10; c <- a + b})[,list(a,b,c)], data.table(a=1:10,b=1:10,c=as.integer(seq(2,20,length.out=10))))
# not sure why within makes columns in order a,c,b, but it seems to be a data.frame thing, too.
test(321, transform(DT,b=42L,e=a), data.table(a=1:10,b=42L,e=1:10))
DT = data.table(a=1:5, b=1:5)
test(322, within(DT, rm(b)), data.table(a=1:5))
# check that cbind dispatches on first argument as expected
test(323, cbind(DT,DT), data.table(a=1:5,b=1:5,a=1:5,b=1:5)) # no check.names as from v1.8.0 (now we have :=, cbind is used far less anyway)
test(324, cbind(DT,data.frame(c=1:5)), data.table(a=1:5,b=1:5,c=1:5))
test(325, rbind(DT,DT), data.table(a=c(1:5,1:5),b=1:5))
test(326, rbind(DT,data.frame(a=6:10,b=6:10)), data.table(a=1:10,b=1:10))
# test removing multiple columns, and non-existing ones, #1510
DT = data.table(a=1:5, b=6:10, c=11:15)
test(327, within(DT,rm(a,b)), data.table(c=11:15))
test(328, within(DT,rm(b,c)), data.table(a=1:5))
test(329, within(DT,rm(b,a)), data.table(c=11:15))
test(330, within(DT,rm(b,c,d)), data.table(a=1:5), warning="object 'd' not found")
DT[,c("b","a")]=NULL
test(332, DT, data.table(c=11:15))
test(333, within(DT,rm(c)), data.table(NULL))
DT = data.table(a=1:5, b=6:10, c=11:15)
DT[,2:1]=NULL
test(334, DT, data.table(c=11:15))
test(335, DT[,2:1]<-NULL, error="Attempt to assign to column")
DT = data.table(a=1:2, b=1:6)
test(336, DT[,z:=a/b], data.table(a=1:2,b=1:6,z=(1:2)/(1:6)))
test(337, DT[3:4,z:=a*b], data.table(a=1:2,b=1:6,z=c(1,1,3,8,1/5,2/6)))
# test eval of LHS of := (using with=FALSE gives a warning here from v1.9.3)
DT = data.table(a=1:3, b=4:6)
test(338, DT[,2:=42L], data.table(a=1:3,b=42L))
test(339, DT[,2:1:=list(10:12,3L)], data.table(a=3L,b=10:12))
test(340, DT[,"a":=7:9], data.table(a=7:9,b=10:12))
test(341, DT[,c("a","b"):=1:3], data.table(a=1:3,b=1:3))
mycols = "a"
test(342, DT[,(mycols):=NULL], data.table(b=1:3))
mynewcol = "newname"
test(343, DT[,(mynewcol):=21L], data.table(b=1:3,newname=21L))
mycols = 1:2
test(344, DT[,(mycols):=NULL], data.table(NULL))
# this originally tested that IDate would come back as double from rbind(DF, DF)
# Now that #2008 is fixed in v1.12.4, it checks that it stays as an integer.
DF = data.frame(x=as.IDate(c("2010-01-01","2010-01-02")), y=1:6)
DT = as.data.table(rbind(DF,DF))
test(345, DT[,sum(y),by=x], data.table(x=as.IDate(c("2010-01-01","2010-01-02")),V1=c(18L,24L)))
test(346, setkey(DT,x)[J(as.IDate("2010-01-02"))], data.table(x=as.IDate(rep("2010-01-02",6L)), y=rep(c(2L,4L,6L),2), key="x"))
# Test .N==0 with nomatch=NA|0, # tests for #963 added as well
DT = data.table(a=1:2,b=1:6,key="a")
test(349, DT[J(2:3),.N,nomatch=NA,by=.EACHI]$N, c(3L,0L))
test(350, DT[J(2:3),.N,nomatch=0], c(3L))
# Test first .N==0 with nomatch=NA|0
test(350.1, DT[J(2:3),.N], c(4L))
test(350.2, DT[J(4),.N], 1L)
test(350.3, DT[J(4),.N,nomatch=0L], 0L)
test(350.4, DT[J(4:5),.N,nomatch=0L], 0L)
test(350.5, DT[J(0:4),.N,by=.EACHI]$N, c(0L,3L,3L,0L,0L))
test(350.6, DT[c(0,0,0), .N], 0L)
# Test recycling list() on RHS of :=
DT = data.table(a=1:3,b=4:6,c=7:9,d=10:12)
test(351.1, DT[, c("a","b"):=list(13:15), verbose=TRUE], ans<-data.table(a=13:15,b=13:15,c=7:9,d=10:12),
output="RHS_list_of_columns == true.*Recycling single RHS list item across 2 columns")
test(351.2, DT[, c("a","b"):=13:15, verbose=TRUE], ans,
notOutput="revised")
test(352.1, DT[,letters[1:4]:=list(1L,NULL)], error="Supplied 4 columns to be assigned 2 items. Please see NEWS for v1.12.2")
test(352.2, DT[,letters[1:4]:=list(1L,NULL,2L,NULL)], data.table(a=c(1L,1L,1L),c=c(2L,2L,2L)))
# Test assigning new levels into factor columns
DT = data.table(f=factor(c("a","b")),x=1:4)
test(353, DT[2,f:="c"], data.table(f=factor(c("a","c","a","b")),x=1:4))
test(354, DT[3,f:=factor("foo")], data.table(f=factor(c("a","c","foo","b")),x=1:4))
# Test growVector logic when adding levels (don't need to grow levels for character cols)
newlevels = as.character(as.hexmode(1:2000))
DT = data.table(f=factor("000"),x=1:2010)
test(355, DT[11:2010,f:=newlevels], data.table(f=factor(c(rep("000",10),newlevels)),x=1:2010))
DT = data.table(f=c("a","b"),x=1:4)
# Test coercing factor to character column
test(355.5, DT[3,f:=factor("foo")], data.table(f=c("a","b","foo","b"),x=1:4))
test(355.6, DT[4,f:=factor("bar"),verbose=TRUE], data.table(f=c("a","b","foo","bar"),x=1:4), notOutput="coerce")
# See datatable-help post and NEWS item for 1.6.7
DT = data.table(X=factor(letters[1:10]), Y=1:10)
DT$X = "Something Different"
test(356, DT, data.table(X=factor("Something Different",levels=c(letters[1:10],"Something Different")), Y=1:10))
DT = data.table(X=letters[1:10], Y=1:10)
DT$X = "Something Different"
test(356.5, DT, data.table(X="Something Different", Y=1:10))
# Bug fix 1570
DT = data.table(x=1:5,y=1:5)
test(357, DT[x==0, y:=5L], data.table(x=1:5,y=1:5))
test(358, DT[FALSE, y:=5L], data.table(x=1:5,y=1:5))
# Bug fix 1599
DT = data.table(a=1:2,b=1:6)
test(359, DT[,sum(b),by=NULL], data.table(V1=21L))
test(360, DT[,sum(b),by=character(0)], data.table(V1=21L))
# Bug fix 1576 : NULL j results in 'inconsistent types' error
DT = data.table(a=1:3,b=1:9)
ans = data.table(a=c(1L,3L),V1=c(12L,18L))
test(361, DT[,if (a==2) NULL else sum(b),by=a], ans)
test(362, DT[,if (a==2) data.table(NULL) else sum(b),by=a], ans)
test(363, DT[,if (a==2) as.list(NULL) else sum(b),by=a], ans)
test(364, DT[,if (a==2) integer(0) else sum(b),by=a], ans)
# Test that data.table() can create list() columns directly
# NB: test 235 above ('by' when DT contains list columns) created the list column in two steps, no longer necessary
DT = data.table(a=1:2,b=list("h",7:8))
test(365, DT[1,b], list("h")) # should it be a special case for 1-item results to unlist? Don't think so: in keeping with no drop=TRUE principle
test(366, DT[2,b], list(7:8))
DT = data.table(a=1:4,b=list("h",7:8),c=list(matrix(1:12,3),data.table(a=letters[1:3],b=list(1:2,3.4,"k"),key="a")))
test(367, DT[3,b], list("h"))
test(368, DT[4,b], list(7:8))
test(369, DT[3,c[[1]][2,3]], 8L)
test(370, DT[4,c[[1]]["b",b]][[1]], 3.4)
# Test returning a list() column via grouping
DT = data.table(x=INT(1,1,2,2,2),y=1:5)
test(371, DT[,list(list(unique(y))),by=x], data.table(x=1:2,V1=list(1:2,3:5)))
# Test matrix i is an error
test(372, DT[matrix(1:2,ncol=2)], error="i is invalid type (matrix)")
# Tests from bug fix #1593
DT = data.table(x=letters[1:2], y=1:4)
DT[x == "a", ]$y <- 0L
test(373, DT, data.table(x=letters[1:2], y=c(0L,2L,0L,4L)))
DT = data.table(x=letters[1:2], y=1:4, key="x")
DT["a", ]$y <- 0L
test(374, DT, data.table(x=letters[1:2], y=c(0L,2L,0L,4L), key="x"))
DT = data.table(x=letters[1:2], y=1:4)
DT[c(1,3), ]$y <- 0L
test(375, DT, data.table(x=letters[1:2], y=c(0L,2L,0L,4L)))
# Test unique on unsorted tables (and tolerance on numeric columns, too)
DT = data.table(a=c(2,1,2),b=c(1,2,1))
test(376, unique(DT), data.table(a=c(2,1),b=c(1,2)))
# From the SO thread :
M = matrix(sample(2, 120, replace = TRUE), ncol = 3)
DF = as.data.frame(M)
DT = as.data.table(M)
test(377, as.data.table(unique(DF)), unique(DT))
# Test compatibility with sqldf. sqldf() does a do.call("rbind" with empty input,
# so this tests ..1 when NULL (which was insufficiently list(...)[[1]] in 1.6.6).
# We now test this directly rather than using sqldf, because we couldn't get 'R CMD check'
# past "(converted from warning) closing unused connection 3 (/tmp/RtmpYllyW2/file55822c52)"
test(378, cbind(), NULL)
test(379, rbind(), NULL)
DT = data.table(a=rep(1:3,1:3),b=1:6)
test(380, DT[,{.SD$b[1]=10L;.SD}, by=a], error="locked binding") # .SD locked for 1st group
test(381, DT[,{if (a==2) {.SD$b[1]=10L;.SD} else .SD}, by=a], error="locked binding") # .SD locked in 2nd group onwards too
# test that direct := is trapped, but := within a copy of .SD is allowed (FAQ 4.5). See also tests 556-557.
test(382, DT[,b:=.N*2L,by=a], data.table(a=rep(1:3,1:3),b=rep(2L*(1:3),1:3)))
test(383, DT[,{z=10L;b:=z},by=a], error=":= and `:=`(...) are defined for use in j, once only and in particular ways")
test(384, DT[,{mySD=copy(.SD);mySD[1,b:=99L];mySD},by=a], data.table(a=rep(1:3,1:3),b=c(99L,99L,4L,99L,6L,6L)))
# somehow missed testing := on logical subset with mixed TRUE/FALSE, reported by Muhammad Waliji
DT = data.table(x=1:2, y=1:6)
test(385, DT[x==1, y := x], data.table(x=1:2,y=c(1L,2L,1L,4L,1L,6L)))
test(386.1, DT[c(FALSE,TRUE)], error="i evaluates to.*Recycling of logical i is no longer allowed.*use rep.*[.]N")
test(386.2, DT[rep(c(FALSE,TRUE),length.out=.N),y:=99L], data.table(x=1:2,y=c(1L,99L,1L,99L,1L,99L)))
# test that column names have the appearance of being local in j (can assign to them ok), bug #1624
DT = data.table(name=c(rep('a', 3), rep('b', 2), rep('c', 5)), flag=FALSE)
test(387, DT[,{flag[1]<-TRUE;list(flag=flag)}, by=name], DT[c(1,4,6),flag:=TRUE])
DT = data.table(score=1:10, name=c(rep('a', 4), rep('b',2), rep('c', 3), 'd'))
test(388, DT[,{ans = score[1]
score[1] <- -score[1]
ans
},by=name],
data.table(name=letters[1:4],V1=c(1L,5L,7L,10L)))
# Tests 389-394 (character grouping and sorting) are now at the start of this file, so that any
# errors elsewhere show up in the last 13 lines displayed by CRAN checks.
# Test unique.data.table for numeric columns within tolerance, for consistency
# with unique.data.frame which does this using paste.
old_rounding = getNumericRounding()
DT = data.table(a=tan(pi*(1/4 + 1:10)),b=42L)
# tan(...) from example in ?all.equal.
test(395, all.equal(DT$a, rep(1,10)))
test(396, length(unique(DT$a))>1) # 10 unique values on all CRAN machines (as of Nov 2011) other than mac (5 unique)
# Commenting these two out as they give different results on OS X and Linux.
# test(397.1, unique(DT), DT[duplicated(DT)]) # default, no rounding
# test(398.1, duplicated(DT), c(FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE))
setNumericRounding(2L)
test(397.2, unique(DT), DT[1]) # before v1.7.2 unique would return all 10 rows. For stability within tolerance, data.table has its own modified numeric sort.
test(398.2, duplicated(DT), c(FALSE,rep(TRUE,9)))
setNumericRounding(old_rounding)
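# Illustration (not a test): setNumericRounding() controls how many bytes of the double
# mantissa are ignored when data.table orders/groups numeric columns. With 0 bytes the two
# values below stay distinct; with 2 bytes they typically collapse to one. Throwaway
# objects only; the setting is restored afterwards.
ex_rnd_old = getNumericRounding()
ex_rnd = data.table(a=c(1, 1+1e-13))
setNumericRounding(0L)
nrow(unique(ex_rnd))   # 2
setNumericRounding(2L)
nrow(unique(ex_rnd))   # typically 1: the tiny difference is rounded away
setNumericRounding(ex_rnd_old)
rm(ex_rnd, ex_rnd_old)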
DT = data.table(a=c(3.142, 4.2, 4.2, 3.142, 1.223, 1.223), b=rep(1,6))
test(399, unique(DT), DT[c(1,2,5)])
test(400, duplicated(DT), c(FALSE,FALSE,TRUE,TRUE,FALSE,TRUE))
DT[c(2,4,5),a:=NA]
test(401, unique(DT), DT[c(1,2,3,6)])
test(402, duplicated(DT), c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE))
# Test NULL columns next to non-NULL, #1633
DT = data.table(a=1:3,b=4:6)
test(403, DT[,list(3,if(a==2)NULL else b),by=a], data.table(a=1:3,V1=3,V2=c(4L,NA_integer_,6L)))
test(404, DT[,list(3,if(a==1)NULL else b),by=a], error="Please use a typed empty vector instead.*such as integer.*or numeric")
test(405, DT[,list(3,if(a==1)numeric() else b),by=a], error="Column 2 of result for group.*integer.*double.*types must be consistent for each group")
test(406, DT[,list(3,if(a==1)integer() else b),by=a], data.table(a=1:3,V1=3,V2=c(NA_integer_,5:6)))
# Test that first column can be list, #1640
test(407, data.table(list(1:2,3:5)), as.data.table(list(list(1:2,3:5))))
# With over-allocation, null data.table has truelength 100. Replaced the calls to structure() in the
# code to new null.data.table(), so test internal function. User may have changed default, so this
# doesn't test "100" explicitly.
test(408, null.data.table(), data.table(NULL))
test(408.5, data.table(), data.table(NULL))
# Test that adding a column using := is fully by reference rather than a shallow copy, #1646
DT = data.table(1:2,3:4) # list vector truelength 100
DT2 = DT
DT2[,y:=10L]
test(409, DT, DT2)
test(410, DT, data.table(1:2,3:4,y=10L))
DT2[1,V1:=99L]
test(411, DT, DT2)
test(412, DT, data.table(c(99L,2L),3:4,y=10L))
# Test that cbind dispatched to data.table() and retains keys
DT = data.table(x=c("a","b"),y=1:4,key="x")
test(413.1, key(cbind(DT,DT)), NULL) # key dropped because name "x" ambiguous
DT1 = data.table(z = c(1,2), w = 1:4, key = "z")
test(413.2, key(cbind(DT,DT1)), c("x", "z"))
test(413.3, key(cbind(colA=10:13, DT)), "x") # data.table() dispatched even though 1st argument isn't data.table
test(413.4, key(cbind(colA=10:17, DT)), NULL) # DT recycled so key is dropped
test(413.5, key(cbind(colA=1, DT)), "x") # DT not recycled so key retained
test(414.1, key(cbind(DT,as.data.frame(DT1))), "x")
test(414.2, cbind(as.data.frame(DT),DT1), data.frame(DT,DT1))
# cbind(DF,...) should return a data.frame for consistency with base. Package treemap (at least) depends
# on this in the return() in treepalette().
# Use data.table(DF,DT) if a data.table result is required.
# Test friendly error when := is used in wrong place
test(415, x:=1, error="defined for use in j, once only and in particular ways")
# Somehow never tested that X[Y] is error if X is unkeyed.
DT = data.table(a=1:3,b=4:6)
test(416, DT[J(2)], error="the columns to join by must be specified using")
# Test shallow copy verbose message from := adding a column, and (TO DO) only when X is NAMED.
DT = data.table(a=1:3,b=4:6)
test(417, alloc.col(DT,3,verbose=TRUE), DT, output="Attempt to reduce allocation from.*to 5 ignored. Can only increase allocation via shallow copy")
options(datatable.alloccol=1L)
DT = data.table(a=1:3,b=4:6)
options(datatable.alloccol=1024L)
DT2 = DT
test(418, length(DT)==2 && truelength(DT)==3)
DT[,c:=7L] # uses final slot
test(419, DT, DT2)
test(420, length(DT)==3 && truelength(DT)==3 && length(DT2)==3 && truelength(DT2)==3)
test(421, DT[,d:=8L,verbose=TRUE], output="Growing vector of column pointers from")
test(422, length(DT)==4)
test(423, truelength(DT), 1028L)
# Test crash bug fixed, #1656, introduced with the 1.7.0 feature
DT = data.table(a = factor(c("A", "Z")), b = 1:4)
test(424.1, DT[1,1] <- "Z", "Z")
test(424.2, DT, data.table(a=factor(c("Z","Z","A","Z")),b=1:4))
test(425, DT[1,1] <- 1, 1)
test(426, DT, data.table(a=factor(c("A","Z")),b=1:4))
test(427.1, DT[1,1] <- 2L, 2L)
test(427.2, DT, data.table(a=factor(c("Z","Z","A","Z")),b=1:4))
test(428, DT[1,a:="A"], data.table(a=factor(c("A","Z","A","Z")),b=1:4))
test(429, DT[1,a:=2L], data.table(a=factor(c("Z","Z","A","Z")),b=1:4))
test(430.1, DT[1,1]<-3, error="Assigning factor numbers to column 1 named 'a'. But 3.0.* is outside the level range [[]1,2[]], or is not a whole number")
test(430.2, DT[1,1]<-1.3, error="Assigning factor numbers to column 1 named 'a'. But 1.3.* is outside the level range [[]1,2[]], or is not a whole number")
test(430.3, DT[1,1:=4L], error="Assigning factor numbers.*1 named 'a'. But 4 is outside.*1,2.*")
test(430.4, DT[1,a:=TRUE], error="Cannot assign 'logical' to 'factor'. Factor columns can be assigned")
test(430.5, DT[2,b:=factor("A")], error="Cannot assign 'factor' to 'integer'. Factors can only be assigned to")
DT = data.table(a=factor(c("A","B","A","C","B")), b=1:5)
test(431.1, DT[1,1:=NA], data.table(a=factor(c(NA,"B","A","C","B")), b=1:5))
test(431.2, DT[2,1:=NA_integer_], data.table(a=factor(c(NA,NA,"A","C","B"), levels=LETTERS[1:3]), b=1:5))
test(431.3, DT[3,1:=NA_real_], data.table(a=factor(c(NA,NA,NA,"C","B"), levels=LETTERS[1:3]), b=1:5))
test(431.4, DT[4,1:=NA_character_], data.table(a=factor(c(NA,NA,NA,NA,"B"), levels=LETTERS[1:3]), b=1:5))
if (test_bit64) {
test(431.5, DT[5,1:=as.integer64(NA)], data.table(a=factor(c(NA,NA,NA,NA,NA), levels=LETTERS[1:3]), b=1:5))
}
old = getOption("datatable.alloccol") # Test that unsetting datatable.alloccol is caught, #2014
options(datatable.alloccol=NULL) # In this =NULL case, options() in R 3.0.0 returned TRUE rather than the old value. This R bug was fixed in R 3.1.1.
# This is why getOption is called first rather than just using the result of option() like elsewhere in this test file.
# TODO: simplify this test if/when R dependency >= 3.1.1
err1 = try(data.table(a=1:3), silent=TRUE)
options(datatable.alloccol="1024")
err2 = try(data.table(a=1:3), silent=TRUE)
options(datatable.alloccol=c(10L,20L))
err3 = try(data.table(a=1:3), silent=TRUE)
options(datatable.alloccol=NA_integer_)
err4 = try(data.table(a=1:3), silent=TRUE)
options(datatable.alloccol=-1)
err5 = try(data.table(a=1:3), silent=TRUE)
options(datatable.alloccol=1024L) # otherwise test() itself fails in its internals with the alloc.col error
test(432.1, inherits(err1,"try-error") && grep("Has getOption[(]'datatable.alloccol'[)] somehow become unset?", err1))
test(432.2, inherits(err2,"try-error") && grep("getOption[(]'datatable.alloccol'[)] should be a number, by default 1024. But its type is 'character'.", err2))
test(432.3, inherits(err3,"try-error") && grep("is a numeric vector ok but its length is 2. Its length should be 1.", err3))
test(432.4, inherits(err4,"try-error") && grep("It must be >=0 and not NA.", err4))
test(432.5, inherits(err5,"try-error") && grep("It must be >=0 and not NA.", err5))
# Repeat the tests but this time with subsetting, to ensure the validity check on option happens for those too
DT = data.table(a=1:3, b=4:6)
options(datatable.alloccol=NULL)
err1 = try(DT[2,], silent=TRUE)
options(datatable.alloccol="1024")
err2 = try(DT[,2], silent=TRUE)
options(datatable.alloccol=c(10L,20L))
err3 = try(DT[a>1], silent=TRUE)
options(datatable.alloccol=NA_integer_)
err4 = try(DT[,"b"], silent=TRUE)
options(datatable.alloccol=-1)
err5 = try(DT[2,"b"], silent=TRUE)
options(datatable.alloccol=1024L) # otherwise test() itself fails in its internals with the alloc.col error
test(433.1, inherits(err1,"try-error") && grep("Has getOption[(]'datatable.alloccol'[)] somehow become unset?", err1))
test(433.2, inherits(err2,"try-error") && grep("getOption[(]'datatable.alloccol'[)] should be a number, by default 1024. But its type is 'character'.", err2))
test(433.3, inherits(err3,"try-error") && grep("is a numeric vector ok but its length is 2. Its length should be 1.", err3))
test(433.4, inherits(err4,"try-error") && grep("It must be >=0 and not NA.", err4))
test(433.5, inherits(err5,"try-error") && grep("It must be >=0 and not NA.", err5))
# simple realloc test
DT = data.table(a=1:3,b=4:6)
test(434.1, truelength(DT), 1026L)
alloc.col(DT,200) # should have no effect since 200<1024
test(434.2, truelength(DT), 1026L)
DT = alloc.col(DT,2000) # test the superfluous DT =
test(434.3, truelength(DT), 2002L)
DT2 = alloc.col(DT,3000) # DT changed then DT2 pointed to it
test(434.4, truelength(DT), 3002L)
test(434.5, truelength(DT2), 3002L)
# test that alloc.col assigns from within functions too (i.e. to wherever that object is)
DT = data.table(a=1:3,b=4:6) # tl 1024 now by default
test(437.1, truelength(DT), 1026L)
f = function() {
alloc.col(DT,2042) # DT isn't local so (via inherits=TRUE) it finds in frame above.
invisible()
}
f()
test(437.2, truelength(DT), 2044L)
# Quick test that [<- (not recommended) over-allocates (again) after the copy of length via *tmp*
DT = data.table(a=1:3,b=4:6)
tl = truelength(DT)
DT$foo = 7L
test(438, truelength(DT), tl)
DT[,"bar"] = 8L
test(439, truelength(DT), tl+2L)
test(440, DT, data.table(a=1:3,b=4:6,foo=7L,bar=8L))
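# Illustration (not a test): a sketch of over-allocation. length() is the number of columns
# in use; truelength() is the number of column-pointer slots allocated up front (governed
# by datatable.alloccol, 1024 at this point), which is why := can add columns by reference
# without copying the table. `ex_al` is a throwaway table.
ex_al = data.table(u=1:2)
length(ex_al)       # 1
truelength(ex_al)   # typically 1025 here = 1 column + 1024 spare slots
ex_al[, v := 3:4]   # fills a spare slot; no reallocation of the column pointer vector
rm(ex_al)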
# Test rbind works by colname now, for consistency with base, FR#1634
DT = data.table(a=1:3,b=4:6)
test(441, rbind(DT,list(a=4L,b=7L)), data.table(a=1:4,b=4:7))
test(442, rbind(DT,data.frame(a=4L,b=7L)), data.table(a=1:4,b=4:7))
test(443, rbind(DT,data.table(a=4L,b=7L)), data.table(a=1:4,b=4:7))
test(444, rbind(DT,list(b=7L,a=4L)), data.table(a=1:4,b=4:7)) # rbind should by default check row names. Don't warn here. Add clear documentation instead.
test(445, rbind(DT,data.frame(b=7L,a=4L)), data.table(a=1:4,b=4:7))
test(446, rbind(DT,data.table(b=7L,a=4L)), data.table(a=1:4,b=4:7))
test(450, rbind(DT,list(c=4L,a=7L)), error=tt<-"Column 1 ['c'] of item 2 is missing in item 1. Use fill=TRUE to fill with NA (NULL for list columns)")
test(451, rbind(DT,data.frame(c=4L,a=7L)), error=tt)
test(452, rbind(DT,data.table(c=4L,a=7L)), error=tt)
test(453, rbind(DT,list(4L,7L)), data.table(a=1:4,b=4:7))
# Test new use.names argument in 1.8.0
test(453.1, rbind(DT,list(FOO=4L,BAR=7L),use.names=FALSE), data.table(a=1:4,b=4:7))
test(453.2, rbind(DT,data.table(b=4:5,a=7:8), use.names=FALSE), data.table(a=1:5,b=4:8))
# Test the linked reported bug, #1645
A1 = data.table(b='hello', a='foo', key='a')
A2 = data.table(a=c('foo', 'bar'), key='a')
test(454, merge(A1, A2, all.y=TRUE, by='a'), data.table(a=c("bar","foo"),b=c(NA,"hello"),key="a"))
A1 = data.table(a='foo', b='hello', key='a')
test(455, merge(A1, A2, all.y=TRUE, by='a'), data.table(a=c("bar","foo"),b=c(NA,"hello"),key="a"))
# Test mixing nomatch=0 and mult="last", bug #1661
DT = data.table(id=c(1L, 2L, 2L, 3L), val=1:4, key="id")
test(456, DT[J(c(1,2,4)), mult="last", nomatch=0], data.table(id=1:2,val=c(1L,3L),key="id"))
# Test join inherited scope respects nomatch=0, #1663
DT2 = data.table(id=c(1L,2L,4L), val2=c(11,12,14),key="id")
test(457, DT[DT2, list(val, val2), nomatch=0, by=.EACHI], data.table(id=c(1L,2L,2L),val=1:3,val2=c(11,12,12),key="id"))
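# Illustration (not a test): by=.EACHI evaluates j once per group of i (per join value),
# and nomatch= controls whether unmatched i groups appear in the result. `ex_ea` is a
# throwaway keyed table used only here.
ex_ea = data.table(id=1:3, v=1:3, key="id")
ex_ea[J(c(2L,9L)), .N, by=.EACHI]               # id 9 kept with N=0
ex_ea[J(c(2L,9L)), .N, by=.EACHI, nomatch=0L]   # id 9 dropped
rm(ex_ea)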
# Test bysameorder edge cases, #1631
DT = data.table(a=1:3,v=4:9,key="a")
test(458, DT[,sum(v),by=list(a%%2L)], data.table(a=c(1L,0L),V1=c(26L,13L)))
test(459, DT[, list(sum(v)), list(ifelse(a == 2, NA, 1L))], data.table(ifelse=c(1L,NA_integer_),V1=c(26L,13L)))
test(460, DT[, list(sum(v)), list(ifelse(a == 2, 1, NA))], data.table(ifelse=c(NA_real_,1),V1=c(26L,13L)))
test(461, DT[,sum(v),by=a], data.table(a=1:3,V1=c(11L,13L,15L),key="a"))
# Test loading from file (which resets tl to 0 in R 2.14.0+, and uninitialized random number in 2.13.2-)
f = tempfile()
save(list="DT",file=f)
load(f)
test(462, DT[,foo:=10L], data.table(a=1:3,v=4:9,foo=10L,key="a"))
unlink(f)
# Test CJ problems with v1.7.4, #1689
test(463, all(sapply(CJ(1:2,1:3),length)==6L))
DT = data.table(x=1:4,y=1:2,cnt=1L,key="x,y")
test(464, DT[CJ(1:4,1:4)]$cnt, INT(1,rep(NA,4),1,NA,NA,1,rep(NA,4),1,NA,NA))
test(465, DT[CJ(1:4,1:4), sum(cnt>0), by=.EACHI]$y, rep(1:4,4))
f1 = factor(c("READING","MATHEMATICS"))
f2 = factor(c("2010_2011","2009_2010","2008_2009"), levels=paste(2006:2010,2007:2011,sep="_"))
test(466, all(sapply(CJ(f1, f2),length)==6L))
# Test list(.SD,newcol=..) gives error with guidance
DT = data.table(a=1:2,v=3:6)
test(467, DT[,list(newcol=7L,.SD),by=a], error="use := by group instead")
# Test empty list column
DT = data.table(a=1:3,b=4:6)
test(468, DT[,foo:=list()], data.table(a=1:3,b=4:6,foo=list()))
# Test plonk list
test(469, DT[,bar:=list(1,"a",3.14)], data.table(a=1:3,b=4:6,foo=list(),bar=list(1,"a",3.14)))
# Test plonk list variable (to catch deparse treating j=list() specially)
x = list(2,"b",2.718)
test(470, DT[,baz:=x], data.table(a=1:3,b=4:6,foo=list(),bar=list(1,"a",3.14),baz=list(2,"b",2.718)))
# Test recycling list
DT = data.table(a=1:4,b=5:8)
test(471.1, DT[,foo:=list("a",2:3)], error="Supplied 2 items to be assigned to 4 items of column 'foo'.*recycle")
test(471.2, names(DT), c("a","b")) # mismatch length error was caught ok before adding column not after column added
test(471.3, DT[,foo:=.(list("a"))], data.table(a=1:4,b=5:8,foo=list("a","a","a","a")))
# Test recycling singleton list
if (ncol(DT)==3L) DT[,foo:=NULL] # else don't warn here under torture with skip= such that test 471 didn't run
test(472, DT[,foo:=list(list(2:3))], data.table(a=1:4,b=5:8,foo=list(2:3,2:3,2:3,2:3)))
# Test adding new column with a recycled factor, #1691
DT = data.table(a=1:4,b=5:8)
DT[,c:=factor("a")]
test(473.1, DT, data.table(a=1:4,b=5:8,c=factor(c("a","a","a","a"))))
test(473.2, DT[,d:=factor(c("a","b"))], error="Supplied 2 items to be assigned to 4 items of column 'd'")
test(474, DT[,d:=factor(c("X"))], data.table(a=1:4,b=5:8,c=factor(c("a","a","a","a")),d=factor(c("X","X","X","X"))))
# Test scoping error introduced at 1.6.1, unique(DT) when key column is 'x'
DT=data.table(x=c("a", "a", "b", "b"), y=c("a", "a", "b", "b"), key="x")
test(475, unique(DT, by=key(DT)), data.table(x=c("a","b"),y=c("a","b"),key="x"))
# Test character and list columns in tables with many small groups
N = 100L
DT = data.table(grp=1:(2*N),char=sample(as.hexmode(1:N),4*N,replace=TRUE),int=sample(1:N,4*N,replace=TRUE))
ans = DT[,list(p=paste(unique(char),collapse=","),
i=list(unique(int))), by=grp]
test(476, nrow(as.matrix(ans)), 2L*N)
# The as.matrix triggered the "'getCharCE' must be called on a CHARSXP", or similar symptom of earlier corruption, before the fix in dogroups.c.
# Test that plonking from calling scope works, even after removing, and column copy via := is ok too.
DT = data.table(a=1:3)
foo = 4:6
DT[,foo:=foo]
rm(foo)
gc()
DT[,foo2:=foo]
DT[2,foo:=10L]
DT[3,foo2:=11L]
gc()
test(477, DT, data.table(a=1:3,foo=c(4L,10L,6L),foo2=c(4L,5L,11L)))
test(478, DT[,foo:=foo], DT) # does nothing, with no warning, consistent with base R `a<-a`.
# Old tests that recycling works have now been moved to test the error instead
DT = data.table(x=1:4)
test(479, DT[, a:=5:7], error="Supplied 3 items to be assigned to 4 items of column 'a'")
# Test that multiple columns can be added
DT = data.table(x=1:4)
test(481, DT[, c("foo","bar"):=list(10L,11:14)], data.table(x=1:4,foo=10L,bar=11:14))
# and combined with update and add in one step
test(482, DT[, c("foo","baz"):=list(12L,15:18)], data.table(x=1:4,foo=12L,bar=11:14,baz=15:18))
# Test that errors in := do not leave DT in bad state, #1711
DT = data.table(x=1:4)
test(483.1, DT[,c("foo","bar"):=list(20L,stop('user error'))], error="user error")
test(483.2, DT, data.table(x=1:4)) # i.e. DT as it was before, without foo being added as it did in v1.7.7-
# The test used to be as follows but as from v1.9.8, the empty numeric() now works and creates a NA_real_ column
test(484, DT[,c("foo","bar"):=list(20L,numeric())], data.table(x=1:4, foo=20L, bar=NA_real_))
# Test i's key longer than x's
d1 <- data.table(a=1:2, b=11:14, key="a,b")
d2 <- data.table(A=0:1, B=1:4, key="A")
test(485, d2[d1, allow.cartesian=TRUE], data.table(A=INT(1,1,1,1,2,2),B=INT(2,4,2,4,NA,NA),b=INT(11,11,13,13,12,14),key="A"))
test(486, d2[d1,sum(B),by=.EACHI], data.table(A=INT(1,1,2,2),V1=INT(6,6,NA,NA),key="A")) # no allow.cartesian needed due to by-without-by
# Test base R's reshape. There is no reshape() function in package:reshape.
DT <- data.table(ID=rep(1:3, each=3), TIME=rep(1:3, 3), X=1:9)
test(487, data.table(stats::reshape(DT, idvar="ID", timevar="TIME", direction="wide")),
data.table(ID=1:3,X.1=INT(1,4,7),X.2=INT(2,5,8),X.3=INT(3,6,9)))
# The data.table() around reshape above is to drop reshape's attributes.
DT <- data.table(ID=rep(1:3, each=3), TIME=rep(1:3, 3), X=1:9, Y=10:18)
test(488, data.table(stats::reshape(DT, idvar="ID", timevar="TIME", direction="wide")),
data.table(ID=1:3,X.1=INT(1,4,7),Y.1=INT(10,13,16),X.2=INT(2,5,8),Y.2=INT(11,14,17),X.3=INT(3,6,9),Y.3=INT(12,15,18)))
# Test warnings for names<- and colnames<-, but only warnings when caller is data.table aware.
DT = data.table(a=1:3,b=4:6)
test(489, names(DT)[1]<-"A", "A") # needs R >= 3.1.0, which is stated dependency now
test(490, names(DT), c("A","b"))
test(491, colnames(DT)[2]<-"B", "B")
test(492, names(DT), c("A","B"))
# Check setnames out of bounds errors
test(493, setnames(DT,"foo","bar"), error="not found.*foo")
test(494, setnames(DT,3,"bar"), error="NA (or out of bounds) in 'old' at positions [1]")
# Test setcolorder() and allowance of length(neworder)<length(x) in v1.10.5 (#592)
DT = data.table(a=1:2,b=3:4,c=5:6)
test(495.1, setcolorder(DT,c(2,1,3)), data.table(b=3:4,a=1:2,c=5:6))
test(495.2, setcolorder(DT,c(2,1,3)), data.table(a=1:2,b=3:4,c=5:6))
test(496, setcolorder(DT,c("c","a","b")), data.table(c=5:6,a=1:2,b=3:4))
test(497, setcolorder(DT,c("d","a","b")), error="specify non existing column*.*d")
DT = data.table(a = 1:3, b = 2:4, c = 3:5)
test(498.1, names(setcolorder(DT, "b")), c("b", "a", "c"))
test(498.2, names(setcolorder(DT, c(2, 3))), c("a", "c", "b"))
test(498.3, setcolorder(DT, 1:4), error = "specify non existing column*.*4")
# Test where neworder=NULL, thus ordered by key and index columns
DT = data.table(a = 1:3, b = 2:4, c = 3:5, d = 4:6, key="b")
test(498.4, names(setcolorder(DT)), c("b", "a", "c", "d"))
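# Illustration (not a test): setcolorder() reorders columns by reference, and since v1.10.5
# neworder may name only some columns; the remaining columns keep their existing relative
# order behind them. `ex_co` is a throwaway table.
ex_co = data.table(a=1L, b=2L, c=3L)
setcolorder(ex_co, "c")
names(ex_co)   # "c" "a" "b"
rm(ex_co)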
# Test that the first group listens to nomatch when j uses join inherited scope.
x <- data.table(x=c(1,3,8),x1=10:12, key="x")
y <- data.table(x=c(3,8,10),y1=10:12, key="x")
test(499, y[x,x1,nomatch=0,by=.EACHI], data.table(x=c(3,8),x1=11:12, key="x"))
test(500, y[x,x1,nomatch=NA,by=.EACHI], data.table(x=c(1,3,8),x1=10:12, key="x"))
# Test merge bug of unkeyed tables introduced in 1.6.8 and 1.6.9 reported by Eric, and ...
dt1 <- data.table(l = factor(c("a","b","a","b")))
dt2 <- data.table(l = factor(c("a","b")), L = factor(c("A","B")))
test(501, setkey(merge(dt1,dt2,by="l"),NULL), as.data.table(merge(as.data.frame(dt1), as.data.frame(dt2), by="l")))
dt1 <- data.table(l = c("a","b","a","b"))
dt2 <- data.table(l = c("a","b"), L = c("A","B"))
test(501.5, setkey(merge(dt1,dt2,by="l"),NULL), as.data.table(merge(as.data.frame(dt1), as.data.frame(dt2), by="l")))
# ... similar example from DM
dtA = data.table(i = 1:8, j = rep(1:2, 4), k = rep(1:4, 2), A = 10:17)
dtB = data.table(j = rep(1:2, 2), k = 1:4, B = 18:21)
test(502, merge(dtA, dtB, by = c("j","k"), all.x = TRUE),
data.table(j=rep(1:2,each=4), k=rep(INT(1,3,2,4),each=2), i=INT(1,5,3,7,2,6,4,8),
A=INT(10,14,12,16,11,15,13,17), B=rep(INT(18,20,19,21),each=2), key="j,k"))
test(503, dtA$i, 1:8) # check that merge didn't change the order of dtA by reference
test(504, dtB$k, 1:4) # or dtB
# Test new i. JIS prefix in 1.7.10
DT = data.table(a=1:2,b=1:4,key="a")
test(505, DT[J(a=1,b=6),sum(i.b*b),by=.EACHI]$V1, 24) # 24 now 'double' because i.b is 'double'
# Test := after a key<-
DT = data.table(a=3:1,b=4:6)
test(506, key(DT)<-"a", "a", warning="deprecated")
test(508, DT, data.table(a=1:3,b=6:4,key="a"))
test(509, DT[,b:=10L], data.table(a=1:3,b=10L,key="a"))
test(510, DT[,c:=11L], data.table(a=1:3,b=10L,c=11L,key="a"), # no warning between 1.8.3 and 1.12.2 due to (now removed) setmutable and SET_NAMED in setalloccol, #3729
warning="Invalid .internal.selfref detected and fixed") # but the warning makes sense after the (deprecated) key(DT)<- above, so this warns again from 1.12.4
# Test new functions chmatch and %chin%
y=letters
x=c(sample(letters,12),"foo","bar")
test(512, chmatch(x,y), match(x,y))
test(513, chmatch(x,y,nomatch=0), match(x,y,nomatch=0))
test(514, x %chin% y, x %in% y)
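# Illustration (not a test): chmatch() and %chin% are character-only counterparts of match()
# and %in% that exploit R's internal string cache; the results are identical to base.
# `ex_ch` is a throwaway vector.
ex_ch = c("b", "z", "zz")
chmatch(ex_ch, letters)   # 2 26 NA, same as match(ex_ch, letters)
ex_ch %chin% letters      # TRUE TRUE FALSE
rm(ex_ch)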
# Test new function set() in v1.8.0
DT = data.table(a=1:3,b=4:6)
test(515, set(DT,2,1,3), data.table(a=c(1L,3L,3L),b=4:6), warning=c("Coerced i from numeric to integer","Coerced j from numeric to integer"))
test(516, set(DT,"2",1,3), error="i is type 'character'")
test(517, set(DT,2L,1,3), DT, warning="Coerced j")
# FR #2551 implemented - removed warning from 518
# test(518, set(DT,2L,1L,3), DT, warning="Coerced 'double' RHS to 'integer'")
test(518, set(DT,2L,1L,3), DT)
test(519, set(DT,2L,1L,3L), data.table(a=INT(1,3,3),b=4:6))
test(520, set(DT,2L,"a",2L), data.table(a=1:3,b=4:6))
test(521, set(DT,2:3,"b",7:8), data.table(a=1:3,b=INT(4,7,8)))
test(522, set(DT,2L,"foo",7L), data.table(a=1:3,b=INT(4,7,8), foo=INT(NA,7,NA))) # error="foo.*is not a column name[.] Cannot add columns with set.*use := instead")
test(523, set(DT,2L,c("a","a"),list(9L,10L)), error="Can't assign to the same column twice in the same query (duplicates detected).")
test(523.1, set(DT,2L,"a",10L), data.table(a=INT(1,10,3),b=INT(4,7,8), foo=INT(NA,7,NA)))
setkey(DT,b)
test(524, set(DT,2L,"a",2L), data.table(a=1:3, b=INT(4,7,8), foo=INT(NA,7,NA), key="b"))
test(525, set(DT,1L,"b",6L), data.table(a=1:3, b=6:8, foo=INT(NA,7,NA)))
test(525.1, set(DT,j="b",value=9:11), data.table(a=1:3, b=9:11, foo=INT(NA,7,NA))) # plonk syntax via missing i (fixed in 1.8.1)
test(525.2, set(DT,NULL,"b",12:14), data.table(a=1:3, b=12:14, foo=INT(NA,7,NA))) # plonk syntax via NULL i
# NEW ADDITIONAL TESTS FOR set() - bug #2077 - for using set to add columns by reference
# were tests 1096.1-4
DT1 <- data.table(x = 1, y = 1:10, fac = sample(LETTERS[1:3], 10, replace = TRUE)) # from SO
DT2 <- copy(DT1)
mul=c(5.3,2.8)
for (j in seq_along(mul)) set(DT1, i=NULL, j=paste("dot", j, sep=""), mul[j]*DT1[[j]])
DT2[, `:=`(dot1=5.3*x, dot2=2.8*y)]
test(529.1, DT1, DT2)
set(DT1, i=NULL, j="dot2", value=NULL) # remove "dot2"
test(529.2, DT1, DT2[, list(x,y,fac, dot1)])
DT2[, dot2 := NULL][5:9, `:=`(bla1 = 0L, x = 3L, bla2 = 2L)]
set(DT1, i=5:9, j=c("bla1", "x", "bla2"), value=list(0L, 3L, 2L))
test(529.3, DT1, DT2) # more testing with many columns including existing columns
test(529.4, set(DT1, i=NULL, j=7L, value=5L), error="Item 1 of column numbers in j is 7 which is outside range.*1.*6.*Use column names instead in j to add new columns.")
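# Illustration (not a test): set() is the low-overhead, loop-friendly form of := ; both
# update by reference. A hedged sketch on a throwaway table `ex_set`.
ex_set = data.table(a=1:3, b=4:6)
for (ex_j in names(ex_set)) set(ex_set, i=NULL, j=ex_j, value=ex_set[[ex_j]] * 10L)  # plonk each column
set(ex_set, i=2L, j="a", value=99L)   # single-cell update, equivalent to ex_set[2L, a := 99L]
rm(ex_set, ex_j)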
# Test that data.frame incompatibility is fixed, which came to light in Feb 2012
DT = data.table(name=c('a','b','c'), value=1:3)
test(530, base::droplevels(DT[ name != 'a' ]), data.table(name=c('b','c'),value=2:3)) # base:: because we'll implement a fast droplevels, too.
# Test that .set_row_names() is maintained on .SD for each group
DT = data.table(a=INT(1,1,2,2,2,3,3,3,3),b=1:9)
test(531, DT[,length(rownames(.SD)),by=a], data.table(a=1:3,V1=2:4))
# Test column names with spaces, bug#1880, and check.names default is now FALSE, too
# Thanks to Yang Zhang for the tests.
DT = data.table("a b"=INT(1,1,2,2,2),c=1:5)
test(532, DT[,sum(c),by="a b"], data.table("a b"=1:2,V1=c(3L,12L)))
test(533, names(data.table('a b'=1)[, list('c d'=`a b`)]), "c d")
test(534, names(transform(data.table('a b'=1), `c d`=`a b`)), c("a b","c d"))
# Test keyby, new in v1.8.0
DT = data.table(a=INT(1,3,1,2,3,2),b=1:2,c=1:3,v=1:6)
test(535, DT[,sum(v),by=a, keyby=a], error="not both")
test(536, DT[,sum(v),by=a], data.table(a=c(1L,3L,2L),V1=c(4L,7L,10L))) # retains appearance order
ans = data.table(a=1:3,V1=c(4L,10L,7L),key="a")
test(537, DT[,sum(v),keyby=a], ans)
test(538, DT[,sum(v),keyby="a"], ans)
var="a"
test(539, DT[,sum(v),keyby=eval(var)], ans)
a=quote(a%%2L)
test(540, DT[,sum(v),by=eval(a)], data.table(a=1:0,V1=c(11L,10L)))
test(541, DT[,sum(v),keyby=eval(a)], data.table(a=0:1,V1=c(10L,11L),key="a"))
test(542, DT[,sum(v),keyby=c("a","b","c")]$V1, INT(1,3,4,6,5,2))
test(543, DT[,sum(v),keyby="a,b,c"]$V1, INT(1,3,4,6,5,2))
test(544, DT[,sum(v),keyby=c("a","b,c")], error="but one or more items include a comma")
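# Illustration (not a test): by= keeps groups in order of first appearance, whereas keyby=
# additionally sorts the result by the grouping columns and sets them as its key. `ex_kb`
# is a throwaway table.
ex_kb = data.table(g=c(2L,1L,2L), v=1:3)
ex_kb[, sum(v), by=g]      # groups appear as 2 then 1
ex_kb[, sum(v), keyby=g]   # sorted: 1 then 2, and keyed on g
rm(ex_kb)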
# Test single expressions passed to by, FR#1743 in v1.8.0
DT = data.table(a=1:4,date=as.IDate("2012-02-28")+0:3,v=5:8)
test(545, DT[,sum(v),by=a%%2L], data.table(a=1:0,V1=c(12L,14L)))
test(546, DT[,sum(v),by=month(date)], data.table(month=2:3,V1=c(11L,15L)))
# Test that factor levels no longer need to be sorted, and that 'ordered' class is retained.
# Posted by Allan Engelhardt ...
x = factor(LETTERS[1:3], levels=rev(LETTERS), ordered=TRUE)
DT = data.table(A=x, B=x, v=1:3, key="A")
test(547,is.ordered(DT$A) && is.ordered(DT$B))
test(548, DT["A",v,verbose=TRUE], 1L, output="Matching character column i.V1 to factor levels in x.A")
# Posted by Damian Betebenner ...
set.seed(123)
my.course.sample = sample(1:5, 10, replace=TRUE)
Y = factor(my.course.sample, levels=1:5, labels=c("Basic Math", "Calculus", "Geometry", "Algebra I", "Algebra II"))
DT = data.table(ID=1:10, COURSE=Y)
test(549, DT[,sum(ID),by=COURSE]$V1, INT(1,2,29,17,6))
setkey(DT, COURSE)
test(550, DT[,sum(ID),by=key(DT)]$V1, INT(6,1,29,2,17))
# Another test of DT[i] syntax from datatable-unaware packages, #1794 from ilprincipe.
DF = structure(list(sample = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("panel.1yr", "panel.2yr", "panel.3yr", "panel.inc", "pre.inc", "pre.prev", "post.inc", "post.prev"), class = "factor"), base = c(2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2002, 2002, 2002, 2002, 2002, 2002, 2002, 2002), ref = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("2004", "2002-2004", "2001", "2000", "2009", "2008"), class = "factor"), var = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("distance", "time"), class = "factor"), treated = c(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1), distance = c(10000, 30000, 50000, 1e+05, 10000, 30000, 50000, 1e+05, 10000, 30000, 50000, 1e+05, 10000, 30000, 50000, 1e+05), all = c(602L, 6357L, 8528L, 9272L, 435L, 2438L, 3456L, 6360L, 245L, 2693L, 3699L, 4084L, 187L, 983L, 1400L, 2660L), di.recip = c(5L, 39L, 57L, 62L, 4L, 16L, 22L, 45L, 2L, 25L, 36L, 37L, 1L, 11L, 16L, 35L), irr = c(0.00830564784053156, 0.00613496932515337, 0.00668386491557223, 0.00668679896462468, 0.00919540229885057, 0.00656275635767022, 0.00636574074074074, 0.00707547169811321, 0.00816326530612245, 0.00928332714444857, 0.0097323600973236, 0.00905974534769833, 0.0053475935828877, 0.0111902339776195, 0.0114285714285714, 0.0131578947368421)), .Names = c("sample", "base", "ref", "var", "treated", "distance", "all", "di.recip", "irr"), row.names = c(NA, 16L), class = "data.frame")
DT = as.data.table(DF)
test(551, nrow(stats::reshape(DT, v.names = c("all", "di.recip", "irr"),
timevar = "treated", idvar = c("sample", "var", "distance"),
direction = "wide" )), 8L)
# Test bug report #1275 from S Bagley :
DT = data.table(a=c("1","1"), b=c(2,2))
test(552, is.character(DT$a))
test(553, unique(DT), data.table(a="1",b=2))
# Test bug #1726 from Ivan Zhang.
DT = data.table(V1=c('a', 'b', 'a'), V2 = c('hello', 'ello', 'llo'))
test(554, nrow(DT[V1=='a' & V2 %like% 'll']), 2L)
test(555, nrow(DT[V1=='a' & V2 %like% 'ello']), 1L)
# Test can't := to .SD, #1727
DT = data.table(x = 1:5, y = rnorm(5))
test(556, DT[,.SD[,z:=rnorm(1)],by=x], error="[.]SD is locked.*reserved for possible future use")
f = function(.SD) .SD[,z:=rnorm(1)]
test(557, DT[, f(.SD), by=x], error="[.]SD is locked.*reserved for possible future use")
# Test printing on nested data.table, bug #1803
DT = data.table(x=letters[1:3], y=list(1:10, letters[1:4], data.table(a=1:3,b=4:6)))
test(558, capture.output(print(DT)),
c(" x y", "1: a 1,2,3,4,5,6,...", "2: b a,b,c,d", "3: c <data.table[3x2]>"))
test(559, setkey(DT,x)["a",y][[1]], 1:10) # y is symbol representing list column, specially detected in dogroups
# Test renaming of .N to N
DT = data.table(a=INT(1,1,2,2,2),b=INT(1,2,2,2,1))
test(560.1, DT[,.N,a][,.N], 2L)
test(560.2, DT[,.N,a][,N], 2:3)
test(561, DT[,.N,a][,N], 2:3)
test(562, DT[,list(.N),a][,N], 2:3)
test(563, DT[,.N,a][,unique(.N),a]$V1, c(1L,1L))
test(564, DT[,.N,a][,unique(N),a]$V1, 2:3)
test(565, DT[,.N,a][N>2], data.table(a=2L, N=3L))
test(566, DT[,list(.N=.N),a][.N>2], data.table(a=2L,.N=3L))
test(567, DT[,.N,list(a,b)][,N,by=a]$N, c(1L,1L,2L,1L))
test(568, DT[,.N,list(a,b)][,unique(N),by=a]$V1, c(1L,2L,1L))
test(569, DT[,list(.N=.N),list(a,b)][,.N,a], error="The column '.N' can't be grouped because")
test(570, DT[,list(.N=.N),list(a,b)][,unique(.N),a], error="The column '.N' can't be grouped because")
test(570.1, DT[,list(.I=.I),list(a,b)][,.I,a], error="The column '.I' can't be grouped because")
# Test spaces in by="..." format, datatable-help on 31 March
DT = data.table("a "=1:2, "b"=3:4," b"=5:6, v=1:6)
test(571, DT[,sum(v),by="b, b"], data.table("b"=3:4, " b"=5:6, V1=c(9L,12L)))
test(572, DT[,sum(v),by="a , b"], data.table("a "=1:2, " b"=5:6, V1=c(9L,12L)))
test(573, DT[,sum(v),by="b, a"], error="object ' a' not found")
# Test base::unname, used by melt, and only supported by data.table for data.frame compatibility in non-data.table-aware packages
DT = data.table(a=1:3, b=4:6)
test(574, dim(unname(DT)), 3:2)
# Test that CJ retains explicit names (useful if used independently)
test(575, CJ(x=c(1L,2L), y=c("a","b")), data.table(x=c(1L,1L,2L,2L),y=c("a","b","a","b"),key="x,y"))
test(576, CJ(c(1L,2L), y=c("a","b")), data.table(V1=c(1L,1L,2L,2L),y=c("a","b","a","b"),key="V1,y"))
test(577, CJ(x=c(1L,2L), c("a","b")), data.table(x=c(1L,1L,2L,2L),V2=c("a","b","a","b"),key="x,V2"))
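# Illustration (not a test): CJ() builds the sorted, keyed cross product of its arguments,
# which is convenient as i in a join to materialise every combination. `ex_cj` is a
# throwaway table.
ex_cj = CJ(id=1:2, grp=c("a","b"))   # 4 rows: (1,a) (1,b) (2,a) (2,b)
key(ex_cj)                           # "id" "grp"
rm(ex_cj)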
# Test factor to character join when factor contains unused and reverse order levels :
X = data.table(a=LETTERS[1:4],v=1:4,key="a")
Y = data.table(a=factor(c("D","B"),levels=rev(LETTERS)),key="a")
test(578, X[Y,verbose=TRUE], data.table(a=c("D","B"), v=c(4L,2L)), # no key because "D">"B", consistent with v1.12.2 and before
output="Coercing factor column i.a to type character to match type of x.a")
# Test that logical i in set() returns helpful error
DT = data.table(a=1:3,b=4:6)
test(580, set(DT,a<3,"b",0L), error="simply wrap with which(), and take the which() outside the loop if possible for efficiency")
# Test by on empty tables (and when i returns no rows), #1945
DT = data.table(a=1:3,v=1:6)
test(581, DT[a<1,sum(v),by=a], data.table(a=integer(),V1=integer()))
test(582, DT[a<1,sum(v),by=list(a)], data.table(a=integer(),V1=integer()))
test(583, DT[a<1], DT[0])
test(584, DT[a<1], output="Empty data.table (0 rows and 2 cols): a,v")
test(585, DT[a<1,list(v)], output="Empty data.table (0 rows and 1 cols): v")
test(586.1, data.table(a=integer(),V1=integer()), output="Empty data.table (0 rows and 2 cols): a,V1")
env = environment()
data(iris, package='datasets', envir = env) # in case user has edited iris in their session
test(586.2, print.data.table(iris[,FALSE]), output="Empty data.frame (150 rows and 0 cols)") #3363
# Test that .N is available in by on empty table, also in #1945
test(587, DT[a<1,list(sum(v),.N),by=a], data.table(a=integer(),V1=integer(),N=integer()))
# Realised that DT[NULL] returned an error.
test(588, DT[NULL], data.table(NULL))
# Test that .N, .SD and .BY are available when by is missing and when by is 0 length
DT = data.table(x=rep(1:3,each=3), y=c(1,3,6), v=1:9)
test(589, DT[,sapply(.SD,sum)*.N], c(x=162, y=270, v=405))
test(590, DT[,sapply(.SD,sum)*.N,by=NULL], data.table(V1=c(162,270,405)))
test(591, DT[,sapply(.SD,sum)*.N,by=character()], data.table(V1=c(162,270,405)))
test(592, DT[,sapply(.SD,sum)*.N,by=""], data.table(V1=c(162,270,405)))
test(593, DT[,lapply(.SD,sum)], data.table(x=18L, y=30, v=45L)) # bug fix #2263 in v1.8.3: now data.table result for consistency
test(594, DT[,lapply(.SD,sum),by=NULL], data.table(x=18L, y=30, v=45L))
test(595, DT[,lapply(.SD,sum),by=character()], data.table(x=18L, y=30, v=45L))
test(596, DT[,lapply(.SD,sum),by=""], data.table(x=18L, y=30, v=45L))
# Test keys of two numeric columns, bug#2004
DT = data.table(x=0.0,y=c(0.0,0.1,0.0,0.2,0.0))
test(597, unique(DT), DT[c(1,2,4)])
test(598, DT[,list(count=.N),by=c("x","y")], data.table(x=0.0,y=c(0.0,0.1,0.2),count=c(3L,1L,1L)))
# And that numeric NAs sort stably to the beginning. Whether NAs should be allowed in keys at all is a separate issue; here we just test the ordering.
DT = data.table( c(1.34, 1.34, 1.34, NA, 2.22, 2.22, 1.34, NA, NA, 1.34, 0.999), c(75.1, NA, 75.1, 75.1, 2.3, 2.4, 2.5, NA, 1.1, NA, 7.9 ))
test(599, DT[c(8,9,4,11,2,10,7,1,3,5,6)], setkey(setkey(DT),NULL))
set.seed(1)
DT = data.table(x=rep(c(1,2), each=10), y=rnorm(20))
setkey(DT, x, y)
test(600, is.sorted(DT$x))
test(601, !is.sorted(DT$y))
test(602, base::order(DT$x,DT$y), 1:20)
## #2331 test that repeated setting of key works
test(602.1, setkey(copy(DT), x, y), DT)
test(602.2, setkey(copy(DT), x), {setkey(DT, NULL); setkey(DT, x)})
# Crash bug of chorder(character()), #2026
test(609, chorder(character()), base::order(character()))
test(610, chorder(""), base::order(""))
# Extra tests of chorder and chgroup
x = sample(LETTERS)
test(610.1, chorder(x), base::order(x))
test(610.2, chgroup(x), seq_along(x))
x = sample(LETTERS,1000,replace=TRUE)
test(610.3, chorder(x), base::order(x))
test(610.4, unique(x[chgroup(x)]), unique(x))
# := by group
DT = data.table(a=1:3,b=(1:9)/10)
test(611, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8)))
setkey(DT,a)
test(612, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a"))
# Assign to subset is ok (NA initialized in the other items):
test(613, DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3))
test(614, DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3))
test(615, DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3))
# Combining := by group with i
test(616, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6)))
test(617, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3))
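# Illustration (not a test): := with by= writes the per-group result back into every row of
# its group (the table keeps all rows), unlike j=list(...) with by= which returns one row
# per group. `ex_grp` is a throwaway table.
ex_grp = data.table(g=c(1L,1L,2L), x=1:3)
ex_grp[, tot := sum(x), by=g]   # tot becomes 3,3,5
rm(ex_grp)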
# Empty i clause, #2034. Thanks to Chris for testing, tests from him. Plus changes from #759
ans = copy(DT)[,r:=NA_real_]
test(618, copy(DT)[a>3,r:=sum(b)], ans)
test(619, copy(DT)[J(-1),r:=sum(b)], ans)
test(620.1, copy(DT)[NA,r:=sum(b)], ans)
test(620.2, copy(DT)[0,r:=sum(b)], ans)
test(620.3, copy(DT)[NULL,r:=sum(b)], null.data.table())
DT = data.table(x=letters, key="x")
test(621, copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained
test(622, copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch")
set.seed(2)
DT = data.table(a=rnorm(5)*10, b=1:5)
test(623, DT[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L))
# Tests on POSIXct attributes
DT = data.table(a=c(1,1,2,2,2))
test(624, attributes(DT[,as.POSIXct("2011-12-13 18:50",tz="EST"),by=a][[2]]), list(class=c("POSIXct","POSIXt"),tzone="EST"))
DT = data.table(x = rnorm(5))
DT$time1 <- Sys.time() # recycle via *tmp*
DT$time2 <- rep(Sys.time(), 5) # plonk via *tmp*
DT[,time3:=Sys.time()] # recycle
DT[,time4:=rep(Sys.time(),5)] # plonk
test(625, all(sapply(DT,is,"POSIXct")[-1]))
# unique on ITime doesn't lose attributes, #1719
t = as.ITime(strptime(c("09:10:00","09:11:00","09:11:00","09:12:00"),"%H:%M:%S"))
test(626, unique(t), t[c(1,2,4)])
test(627, class(unique(t)), "ITime")
# Test recycling list() rbind; #524. This was commented out until v1.12.2 when it was reinstated in PR#3455
test(628.1, rbind(data.table(a=1:3,b=5:7,c=list(1:2,1:3,1:4)), list(4L,8L,as.list(1:3))),
data.table(a=c(1:3,rep(4L,3L)),b=c(5:7,rep(8L,3L)),c=list(1:2,1:3,1:4,1L,2L,3L)))
# Test switch in .rbind.data.table for factor columns
test(628.2, rbind(data.table(a=1:3,b=factor(letters[1:3]),c=factor("foo")), list(4L,factor("d"),factor("bar"))),
data.table(a=1:4,b=factor(letters[1:4]),c=factor(c(rep("foo",3),"bar"), levels = c("foo", "bar"))))
# Test merge with common names and all.y=TRUE, #2011
DT1 = data.table(a=c(1,3,4,5), total=c(2,1,3,1), key="a")
DT2 = data.table(a=c(2,3,5), total=c(5,1,2), key="a")
# 629+630 worked before anyway. 631+632 test the bug fix.
adf=as.data.frame
adt=as.data.table
test(629, merge(DT1,DT2), data.table(a=c(3,5),total.x=c(1,1),total.y=c(1,2),key="a"))
test(629.1, merge(DT1,DT2), setkey(adt(merge(adf(DT1),adf(DT2),by="a")),a))
test(630, merge(DT1,DT2,all.x=TRUE), data.table(a=c(1,3,4,5),total.x=c(2,1,3,1),total.y=c(NA,1,NA,2),key="a"))
test(630.1, merge(DT1,DT2,all.x=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all.x=TRUE)),a))
test(631, merge(DT1,DT2,all.y=TRUE), data.table(a=c(2,3,5),total.x=c(NA,1,1),total.y=c(5,1,2),key="a"))
test(631.1, merge(DT1,DT2,all.y=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all.y=TRUE)),a))
test(632, merge(DT1,DT2,all=TRUE), data.table(a=c(1,2,3,4,5),total.x=c(2,NA,1,3,1),total.y=c(NA,5,1,NA,2),key="a"))
test(632.1, merge(DT1,DT2,all=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all=TRUE)),a))
# Test that with=FALSE by number isn't messed up by dup column names, #2025
DT = data.table(a=1:3,a=4:6)
test(634, DT[,2:=200L], data.table(a=1:3,a=200L))
# Test names when not all items are named, #2029
DT = data.table(x=1:3,y=1:3)
test(635, names(DT[,list(x,y,a=y)]), c("x","y","a"))
test(636, names(DT[,list(x,a=y)]), c("x","a"))
# Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too.
set.seed(1)
DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a")
test(637, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a"))
test(638, key(DT[J(43L),a:=99L]), NULL)
setkey(DT,a)
test(639, key(DT[,a:=99L,by=a]), NULL)
# Test printing is right aligned without quotes etc, and rownames are repeated ok for more than 20 rows
DT=data.table(a=8:10,b=c("xy","x","xyz"),c=c(1.1,22.1,0))
test(640, capture.output(print(DT,class=FALSE)), c(" a b c","1: 8 xy 1.1","2: 9 x 22.1","3: 10 xyz 0.0"))
DT=data.table(a=letters,b=1:26)
test(641, tail(capture.output(print(DT[1:20], class=FALSE)),2), c("19: s 19","20: t 20"))
test(642, tail(capture.output(print(DT[1:21], class=FALSE, nrows=100)),2), c("21: u 21"," a b"))
DT=data.table(a=as.character(as.hexmode(1:500)), b=1:500)
test(643, capture.output(print(DT, class=FALSE)), c(" a b"," 1: 001 1"," 2: 002 2"," 3: 003 3"," 4: 004 4"," 5: 005 5"," --- ","496: 1f0 496","497: 1f1 497","498: 1f2 498","499: 1f3 499","500: 1f4 500"))
# Test inconsistent length of columns error.
DT = list(a=3:1,b=4:3)
setattr(DT,"class",c("data.table","data.frame"))
test(644, setkey(DT,a), error="Column 2 is length 2 which differs from length of column 1 (3)")
test(645, setkey(DT,b), error="Column 2 is length 2 which differs from length of column 1 (3)")
# Test faster mean with a lot of very small groups. Example from (now not needed as much) data.table wiki point 3.
# benchmarks.Rraw contains the same, to be scaled up.
set.seed(9)
n=1e4 # very small n so as not to overload daily CRAN checks.
DT=data.table(grp1=sample(1:150, n, replace=TRUE),
grp2=sample(1:150, n, replace=TRUE),
x=rnorm(n),
y=rnorm(n))
DT[c(2,5),x:=NA] # seed chosen to get a group of size 2 and 3 in the first 5 to easily inspect.
DT[c(3,4),y:=NA]
ans1 = DT[,list(mean(x),mean(y)),by=list(grp1,grp2)]
ans2 = DT[,list(.Internal(mean(x)),.Internal(mean(y))),by=list(grp1,grp2)]
basemean = base::mean # to isolate time of `::` itself
ans3 = DT[,list(basemean(x),basemean(y)),by=list(grp1,grp2)]
test(646, ans1, ans2)
test(647, ans1, ans3)
if (test_longdouble) {
test(648, any(is.na(ans1$V1)) && !any(is.nan(ans1$V1)))
# used to error with `valgrind` because of the 'long double' usage in gsumm.c (although I wonder if we need long double precision).
# it doesn't seem to error under valgrind anymore so the test_longdouble may be removable
# http://valgrind.org/docs/manual/manual-core.html#manual-core.limits
# http://comments.gmane.org/gmane.comp.debugging.valgrind/10340
}
ans1 = DT[,list(mean(x,na.rm=TRUE),mean(y,na.rm=TRUE)),by=list(grp1,grp2)]
ans2 = DT[,list(mean.default(x,na.rm=TRUE),mean.default(y,na.rm=TRUE)),by=list(grp1,grp2)]
test(651, ans1, ans2)
test(652, any(is.nan(ans1$V1)))
# See FR#2067. Here we're just testing the optimization of mean and lapply, should be comparable to above
ans2 = DT[,lapply(.SD,mean,na.rm=TRUE),by=list(grp1,grp2)]
setnames(ans2,"x","V1")
setnames(ans2,"y","V2")
test(654, ans1, ans2)
options(datatable.optimize = 0L)
test(656.1, DT[ , mean(x), by=grp1, verbose=TRUE], output='(GForce FALSE)')
test(656.2, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output="(GForce FALSE)")
test(656.3, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output="(GForce FALSE)")
options(datatable.optimize = 1L)
test(657.1, DT[ , mean(x), by=grp1, verbose=TRUE], output='(GForce FALSE)')
test(657.2, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output="(GForce FALSE)")
test(657.3, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output="(GForce FALSE)")
options(datatable.optimize = 2L)
test(658.1, DT[ , mean(x), by=grp1, verbose=TRUE], output="GForce optimized j to.*gmean")
test(658.2, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output="GForce optimized j to.*gmean")
test(658.3, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output="GForce optimized j to.*gmean")
tt = capture.output(DT[,list(mean(x),mean(y)),by=list(grp1,grp2),verbose=TRUE])
test(659, !length(grep("Wrote less rows", tt))) # first group is one row with this seed. Ensure we treat this as aggregate case rather than allocate too many rows.
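# Illustration (not a test): datatable.optimize level 2 and above enables GForce, which
# computes simple grouped aggregates such as mean() and sum() in C without evaluating j for
# every group. The option is saved and restored; `ex_gf` is a throwaway table.
ex_gf = data.table(g=rep(1:2, each=3L), x=as.numeric(1:6))
ex_gf_old = options(datatable.optimize=2L)
ex_gf[, mean(x), by=g]   # takes the GForce gmean path
options(ex_gf_old)
rm(ex_gf, ex_gf_old)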
# Test .N for logical i subset
DT = data.table(a=1:10, b=rnorm(10))
test(660, DT[a==8L, .N], 1L)
# Test that growing is sensible in worst case
DT = data.table(a=rep(1:10,1:10),b=rnorm(55))
tt = capture.output(DT[,sum(b)*b,by=a,verbose=TRUE])
test(661, length(grep("growing from",tt))<3) # was 6 when we simply grew enough for latest result
# Test that adding a new logical column is supported, #2094
DT=data.table(a=1:3)
test(662, DT[,newcol:=NA], data.table(a=1:3,newcol=NA))
test(663, sapply(DT,class), c(a="integer",newcol="logical"))
# Test that setting names in the presence of dups is ok, #2103
DT = data.table(a=1:3, b=2:4, a=3:5)
test(664, setnames(DT, c('d','e','f')), data.table(d=1:3,e=2:4,f=3:5))
# Test by=c(...) in combination with i subset, #2078
DT = data.table(a=1:3,b=1:6,key="a")
test(665, DT[a<3,sum(b),by=c("a"),verbose=TRUE], DT[a<3,sum(b),by="a"], output="i clause present and columns used in by detected")
test(666, DT[a<3,sum(b),by=key(DT),verbose=TRUE], DT[a<3,sum(b),by=a], output="i clause present and columns used in by detected")
test(667, DT[a<3,sum(b),by=paste("a")], error='Otherwise, by=eval(paste("a")) should work')
test(668, DT[a<3,sum(b),by=eval(paste("a"))], DT[a<3,sum(b),by=a])
test(669, DT[a<3,sum(b),by=c(2)], error="must evaluate to 'character'")
# Test := keyby does setkey, #2065
DT = data.table(x=1:2, y=1:6)
ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x")
test(670, DT[,z:=sum(y),keyby=x], ans)
DT = data.table(x=1:2, y=1:6)
test(671, DT[,z:=sum(y),keyby="x"], ans)
DT = data.table(x=1:2, y=1:6)
test(672, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)),
warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=")
DT = data.table(x=1:2, y=1:6)
test(673, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)))
DT = data.table(x=1:2, y=1:6)
test(674, DT[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since")
# Test new .()
DT = data.table(x=1:2, y=1:6, key="x")
test(675, DT[.(1L)], DT[1:3])
# Test new rbindlist
l = list(data.table(a=1:2, b=7:8),
data.table(a=3:4, baz=9:10),
data.table(foo=5:6, bar=11:12),
data.table(b=13:14),
list(15:16,17L),
list(c(18,19),20:21))
test(676.1, rbindlist(l[1:3]), ans<-data.table(a=1:6,b=7:12), message="Column 2 [[]'baz'[]] of item 2 is missing in item 1.*Use fill=TRUE.*or use.names=FALSE")
test(676.2, rbindlist(l[1:3], use.names=FALSE), ans)
test(677.1, rbindlist(l[c(10,1,10,2,10)]), ans<-data.table(a=1:4,b=7:10), message="Column 2 [[]'baz'[]] of item 4 is missing in item 2") # NULL items ignored
test(677.2, rbindlist(l[c(10,1,10,2,10)], use.names=FALSE), ans)
test(678, rbindlist(l[c(1,4)]), error="Item 2 has 1 columns, inconsistent with item 1 which has 2")
test(679.1, rbindlist(l[c(1:2,5)]), ans<-data.table(a=c(1:4,15:16), b=c(7:10,17L,17L)), message="Column 2 [[]'baz'[]] of item 2 is missing in item 1")
test(679.2, rbindlist(l[c(1:2,5)], use.names=FALSE), ans)
test(680, rbindlist(l[c(2,6)]), data.table(a=c(3,4,18,19), baz=c(9:10,20:21))) # coerces 18 and 19 to numeric
test(681, rbindlist(list(data.table(a=letters[1:2],b=c(1.2,1.3),c=1:2), list("c",1.4,3L), NULL, list(letters[4:6],c(1.5,1.6,1.7),4:6))),
data.table(a=letters[1:6], b=seq(1.2,1.7,by=0.1), c=1:6))
test(682, rbindlist(NULL), data.table(NULL))
test(683, rbindlist(list()), data.table(NULL))
test(684, rbindlist(list(NULL)), data.table(NULL))
test(685, rbindlist(list(data.table(NULL))), data.table(NULL))
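# Illustration (not a test): rbindlist() binds a list of tables/lists; use.names=TRUE matches
# columns by name and fill=TRUE pads columns missing from some items with NA. Throwaway
# objects only.
ex_rb1 = rbindlist(list(data.table(a=1L, b=2L), data.table(b=4L, a=3L)), use.names=TRUE)  # a=1,3 b=2,4
ex_rb2 = rbindlist(list(data.table(a=1L), data.table(a=2L, b=3L)), fill=TRUE)             # b is NA,3
rm(ex_rb1, ex_rb2)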
# Test merge when no overlap of data in by columns when all=TRUE, #2114
DF1=data.frame(foo=letters[1:5], bar=1:5, stringsAsFactors=FALSE)
DF2=data.frame(foo=letters[6:10], baz=6:10, stringsAsFactors=FALSE)
DT1=as.data.table(DF1)
DT2=as.data.table(DF2)
test(686, merge(DF1, DF2, by="foo", all=TRUE), as.data.frame(merge(DT1,DT2,by="foo",all=TRUE)))
DF1=data.frame(foo=letters[1:5], bar=1:5, stringsAsFactors=TRUE)
DF2=data.frame(foo=letters[6:10], baz=6:10, stringsAsFactors=TRUE)
DT1=as.data.table(DF1)
DT2=as.data.table(DF2)
test(687, merge(DF1, DF2, by="foo", all=TRUE), as.data.frame(merge(DT1,DT2,by="foo",all=TRUE)))
# And a more basic test that #2114 revealed that factor to factor join was leaving NA in the i
# factor columns, caught in 1.8.1 beta before release to CRAN.
DT = data.table(a=factor(letters[1:4]), b=5:8, key="a")
test(688, DT[J(factor("b"))], data.table(a=factor("b"), b=6L, key="a"))
# Test removing a column followed by adding a new column using := by group, #2117
DT = data.table(a=1:3,b=4:6)
DT[,b:=NULL]
test(689, DT[,b:=.N,by=a], data.table(a=1:3, b=1L))
test(690, DT[,c:=2,by=a], data.table(a=1:3, b=1L, c=2))
# Test combining i with by, with particular out of order circumstances, #2118
set.seed(1)
DT=data.table(a=sample(1:5,20,replace=TRUE),b=1:4,c=1:10)
test(691, DT[a>2,sum(c),by=b], DT[a>2][,sum(c),by=b])
test(692, DT[a>2,sum(c),by=b%%2L], data.table(b=1:0,V1=c(34L,42L)))
test(693, DT[a>2,sum(c),by=(b+1)%%2], data.table(b=c(0,1),V1=c(34L,42L)))
setkey(DT,b)
test(694, DT[a>2,sum(c),by=b], DT[a>2][,sum(c),by=b])
test(695, DT[a>2,sum(c),by=b%%2L], data.table(b=1:0,V1=c(34L,42L)))
test(696, DT[a>2,sum(c),by=(b+1)%%2], data.table(b=c(0,1),V1=c(34L,42L)))
# Test subset and %chin% crash with non-character input, #2131
test(697, 4 %chin% letters, error="type")
test(698, 4L %chin% letters, error="type")
test(699, "a" %chin% 4, error="type")
DT = data.table(aa=1:6,bb=7:12)
test(700, subset(DT,select="aa"), DT[,list(aa)])
test(701, subset(DT,select=aa), DT[,list(aa)])
test(702, subset(DT,select=c(aa)), DT[,list(aa)])
setkey(DT,aa)
test(703, subset(DT,select="aa"), data.table(aa=1:6,key="aa"))
test(704, subset(DT,select=aa), data.table(aa=1:6,key="aa"))
test(705, subset(DT,select=c(aa)), data.table(aa=1:6,key="aa"))
# Test rbinding of logical columns, #2133
DT1 = data.table(A=1:3,B=letters[1:3],C=c(TRUE,TRUE,FALSE))
DT2 = data.table(A=4:5,B=letters[4:5],C=c(TRUE,FALSE))
test(706, rbind(DT1,DT2), data.table(A=1:5, B=letters[1:5], C=c(TRUE,TRUE,FALSE,TRUE,FALSE)))
test(707, rbindlist(list(DT1,DT2)), rbind(DT1,DT2))
# Test non ascii characters when passed as character by, #2134
# *****
# TO DO: reinstate. Temporarily removed to pass CRAN's Mac using C locale (R-Forge's Mac is ok)
# *****
# Test := adding column after a setnames of all column names (which [,list(x)] does), #2146
DT = data.table(x=1:5)[,list(x)]
test(713, DT[,y:=5], data.table(x=1:5,y=5))
if (ncol(DT)==2L) setnames(DT,c("A","B")) # else don't stop under torture with skip= such that test 713 was not run
test(714, DT[,z:=6:10], data.table(A=1:5,B=5,z=6:10))
# Test J alias is now removed outside DT[...] from v1.8.7 (to resolve rJava::J conflict)
test(715, J(a=1:3,b=4), error="could not find function.*J")
# Test get in j
DT = data.table(a=1:3,b=4:6)
test(716, DT[,get("b")], 4:6) # TO DO: add warning about inefficiency when datatable.pedantic=TRUE
test(717, DT[,get("b"),verbose=TRUE], output="ansvars being set to all columns")
# Test that j can be a logical index when `with=FALSE` (#1797)
DT = data.table(a=1:10, b=rnorm(10), c=letters[1:10])
test(718, DT[, c(FALSE, TRUE, FALSE), with=FALSE], DT[, 2, with=FALSE])
test(719, nrow(DT[, c(FALSE, FALSE, FALSE), with=FALSE]), 0L)
# Test combining join with missing groups with group by, #2162
DT = data.table(a = 1, b = 2, c = 4, key="a")
test(720, DT[list(c(5,6,7)), .N, by=b], data.table(b=NA_real_,N=3L))
test(721, DT[list(c(5,6,7))][, .N, by=b], DT[list(c(5,6,7)), .N, by=b])
test(722, DT[list(c(5,6,7)), .N, by=b, mult="first"], data.table(b=NA_real_,N=3L))
test(723, DT[list(c(5,6,7)), .N, by=b, nomatch=0], data.table(b=numeric(),N=integer())) # not keyed as by= not by key(DT)[1]
test(724, DT[list(c(5,6,7)), .N, by=b, nomatch=0], DT[list(c(5,6,7)),nomatch=0][,.N,by=b]) # Splitting should always be consistent
# another test linked from #2162
DT = data.table(x=rep(c("a","b","c"),each=3), y=c(1L,3L,6L), v=1:9, key="x")
test(725, DT[c("a","b","d"),list(v)], DT[J(c("a","b","d")),"v",with=FALSE]) # unfiled bug fix for NA matches; see NEWS 1.8.3
test(726, DT[c("a", "b", "d"), sum(v), by=y, nomatch=0], data.table(y=INT(1,3,6),V1=INT(5,7,9)))
test(727, DT[c("a", "b", "d"), sum(v), by=y], data.table(y=INT(1,3,6,NA),V1=INT(5,7,9,NA)))
test(728, DT[c("a", "b", "d"), sum(v), by=y], DT[J(c("a", "b", "d"))][, sum(v), by=y])
# explicit verbose=FALSE needed here because tests are run a second time with verbose=TRUE
test(729.1, capture.output(DT[c("a", "b", "d"), print(.SD), by=.EACHI, verbose=FALSE]),
capture.output(suppressWarnings(DT[c("a", "b", "d"), print(.SD), by=x, verbose=FALSE])))
test(729.2, capture.output(DT[c("a", "b"), print(.SD), by=y, verbose=FALSE]), # TO DO: why doesn't last group have x=d, maybe groups=i in dogroups
capture.output(DT[c("a", "b"),verbose=FALSE][, print(.SD), by=y, verbose=FALSE]))
test(729.3, DT[c("b","d"),.SD,by=.EACHI], data.table(x=c("b","b","b","d"),y=INT(1,3,6,NA),v=INT(4,5,6,NA))) # no debate here
test(729.4, DT[c("b","d"),.SD, by=y], DT[c("b","d")][,.SD, by=y][4L,x:=NA_character_]) # the i groups when no match don't get carried through (would be hard to implement this and very unlikely to be useful. Just break into compound query, if needed to be used in j, to get them to carry through. TO DO: add to FAQ.
# Test that unnamed i gets x's join column names when j is .SD (or any named list, which verbose warns is inefficient), #2281
test(729.5, DT[c("a","b"),.SD], data.table(x=rep(c("a","b"),each=3),y=INT(1,3,6),v=1:6,key="x"))
# check := when combining join with missing groups and then group by
test(730, DT[c("b","a"),w:=sum(v),by=y]$w, INT(5,7,9,5,7,9,NA,NA,NA)) # by over a different column than was joined to
test(731, DT["d",w:=99,by=y]$w, INT(5,7,9,5,7,9,NA,NA,NA)) # do nothing for missing group, before getting as far as type error
test(732, DT["d",w:=99L,by=y]$w, INT(5,7,9,5,7,9,NA,NA,NA)) # do nothing for missing group
test(733, DT[c("c","e","b"),w:=sum(v),by=y%%2L]$w, INT(5,7,9,24,24,15,24,24,15))
# Test column type change in the 0 row case (#2274)
DT = data.table(a=1:3,b=4:6)[0]
test(734, DT[,b:=as.character(b)], data.table(a=integer(),b=character()))
test(735, DT[,c:=double()], data.table(a=integer(),b=character(),c=double()))
# Deleting multiple columns out-of-order, #2223
DT = data.table(a=1:3,b=4:6,c=7:9,d=10:12,e=13:15,f=16:18,g=19:21)
test(736, DT[,c("b","d","g","f","c"):=NULL], data.table(a=1:3,e=13:15)) # test redundant with=FALSE is ok
DT = data.table(a=1:3,b=4:6,c=7:9,d=10:12,e=13:15,f=16:18,g=19:21)
test(737, DT[,c("b","d","g","f","c"):=NULL], data.table(a=1:3,e=13:15)) # with no longer needed
# Mixing column adds and deletes in one := gave incorrect results, #2251.
DT = data.table(c1=1:2)
test(738, DT[,c("c2", "c1"):=list(c1+1L, NULL)], data.table(c2=2:3))
# `:=`(c1=v1,v2=v2,...) is now valid , #2254
DT = data.table( c1=1:3 )
test(739, DT[,`:=`(c2=4:6, c3=7:9)], data.table(c1=1:3,c2=4:6,c3=7:9))
test(740, DT[,`:=`(4:6,c3=7:9)], error="all arguments must be named")
test(741, DT[,`:=`(4:6,7:9,10:12)], error="all arguments must be named") # test the same error message in the other branch
# that out of bounds LHS is caught, root cause of #2254
test(742, DT[,3:6:=1L], error="outside.*range")
test(743, DT[,2:3:=99L], data.table(c1=1:3,c2=99L,c3=99L))
test(744, DT[,(ncol(DT)+1):=1L], error="outside.*range")
test(745, DT[,ncol(DT):=1L], data.table(c1=1:3,c2=99L,c3=1L))
# multiple LHS with by without by, #2215
DT = data.table(a=letters[c(1:3,3L)],key="a")
test(746, DT["a",c("new1","new2"):=list(4L, 5L)],
data.table(a=letters[c(1:3,3L)],new1=INT(4,NA,NA,NA),new2=INT(5,NA,NA,NA),key="a"))
test(747.1, DT[,new1:=4:6], error="Supplied 3 items to be assigned to 4 items of column 'new1'")
test(747.2, DT[,new1:=INT(4,5,6,4)], data.table(a=letters[c(1:3,3L)],new1=INT(4L,5L,6L,4L),new2=INT(5,NA,NA,NA),key="a"))
test(748, DT[c("c","b"),`:=`(new3=.N,new2=sum(new1)+1L),by=.EACHI], data.table(a=letters[c(1:3,3L)],new1=INT(4,5,6,4),new2=INT(5,6,11,11),new3=INT(NA,1,2,2),key="a"))
# and multiple LHS by group, #1710
DT = data.table(a=rep(6:8,1:3),b=1:6)
test(749, DT[,c("c","d","e"):=list(.N,sum(b),a*10L),by=a], data.table(a=rep(6:8,1:3),b=1:6,c=rep(1:3,1:3),d=INT(rep(c(1,5,15),1:3)),e=rep(6:8,1:3)*10L))
test(750, DT[a<8,`:=`(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA)))
# varname holding colnames, by group, linked from #2120.
DT = data.table(a=rep(1:3,1:3),b=1:6)
colname = "newcol"
test(751, DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15)))
# Add tests for nested := in j by group, #1987
DT = data.table(a=rep(1:3,2:4),b=1:9)
test(752, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2))
# Test duplicate() of recycled plonking RHS, #2298
DT = data.table(a=letters[3:1],x=1:3)
test(753, setkey(DT[,c("x1","x2"):=x],a), data.table(a=letters[1:3],x=3:1,x1=3:1,x2=3:1,key="a"))
test(753.1, DT[,c("x1","x2"):=4:6, verbose = TRUE], data.table(a=letters[1:3],x=3:1,x1=4:6,x2=4:6,key="a"),
output = "RHS for item 2 has been duplicated")
test(753.2, DT[2,x2:=7L], data.table(a=letters[1:3],x=3:1,x1=4:6,x2=c(4L,7L,6L),key="a"))
DT = data.table(a=letters[3:1],x=1:3,y=4:6)
test(754.1, DT[,c("x1","y1","x2"):=list(x,y)], error="Supplied 3 columns to be assigned 2 items. Please see NEWS for v1.12.2")
test(754.2, DT[,c("x1","y1","x2"):=list(x,y,x)], data.table(a=letters[3:1],x=1:3,y=4:6,x1=1:3,y1=4:6,x2=1:3))
# And non-recycling; i.e. that a single-column copy does copy the column
DT = data.table(a=1:3)
test(754.3, DT[,b:=a][1,a:=4L][2,b:=5L], data.table(a=INT(4,2,3),b=INT(1,5,3)))
test(754.4, DT[,b:=a][3,b:=6L], data.table(a=INT(4,2,3),b=INT(4,2,6)))
test(754.5, DT[,a:=as.character(a),verbose=TRUE], output="Direct plonk.*no copy")
RHS = as.integer(DT$a)
test(754.6, DT[,a:=RHS,verbose=TRUE], output="RHS for item 1 has been duplicated")
# Used to test the warning on redundant by (#2282) but by=.EACHI has now superseded that behaviour
DT = data.table(a=letters[1:3],b=rep(c("d","e"),each=3),x=1:6,key="a,b")
test(755, DT[c("b","c"),sum(x),by=.EACHI], data.table(a=c("b","c"),V1=c(7L,9L),key="a"))
test(756, DT[c("b","c"),sum(x),by=a], data.table(a=c("b","c"),V1=c(7L,9L),key="a"))
test(757, DT[list(c("b","c"),"d"),sum(x),by=a], data.table(a=c("b","c"),V1=2:3,key="a")) # 'by' less than number of join columns
# join then by when mult=="last"|"first", #2303 (crash in dev 1.8.3 only)
DT = data.table(a=1:3,b=1:6,c=7:12,key="a")
test(758, DT[J(c(1L,1L)),sum(c),by=b,mult="last"], DT[J(c(1L,1L)),mult="last"][,sum(c),by=b])
test(759, DT[J(1L),c,by=b,mult="last"], DT[J(1L),mult="last"][,c,by=b])
test(760, DT[2:5,sum(c),by=b], DT[2:5][,sum(c),by=b])
test(761, DT[2:5,sum(c),by=b%%2], DT[2:5][,sum(c),by=b%%2])
# joining from empty i table, #2194
DT = data.table(a=1:3,b=4:6,key="a")
test(762, DT[J(integer()),b,by=.EACHI], data.table(a=integer(),b=integer(),key="a"))
test(763, DT[J(integer()),1L,by=b], data.table(b=integer(),V1=integer())) # by= not by key(DT)[1] so result is not keyed
test(764, DT[J(integer()),b,mult="last"], integer())
test(765, DT[J(2L),b,mult="last"], 5L)
test(766, DT[J(5L),b,nomatch=0,by=.EACHI], data.table(a=integer(),b=integer(),key="a"))
test(767, DT[J(5:6),b,nomatch=0,by=.EACHI], data.table(a=integer(),b=integer(),key="a"))
# Crash on by-without-by with mixed type non join i columns, #2314. Despite not being used by j they were still being assigned to .BY.
DT = data.table(iris,key="Species")
Y = data.table(date=as.POSIXct("2011-01-01"),num=as.numeric(1:26))
Y[,get("letters"):=LETTERS]
Y[,A:=1:26]
Y[,p:=factor(p)] # coerce type to match DT$Species to save warning. Crash was related to .BY internally, not the coercion.
setkey(Y,p)
for (i in 1:10){DT[Y,Petal.Width];DT[Y];NULL} # reliable crash in 1.8.2 (tested).
test(768, DT[Y,Petal.Width,by=.EACHI], data.table(Species=factor(LETTERS),Petal.Width=NA_real_,key="Species"))
DT = data.table(a=1:3,b=1:6,c=7:12, key="a")
test(769, DT[,.BY[[1]]==a,by=a], data.table(a=1:3,V1=TRUE,key="a"))
test(770, DT[J(2:3),.BY[[1]]==b,by=.EACHI], data.table(a=INT(2,2,3,3),V1=c(TRUE,FALSE),key="a"))
# A data.table RHS of := caused a crash, #2311.
a = data.table(first=1:6, third=c(1,1,1,3,3,4), key="first")
b = data.table(first=c(3,4,4,5,6,7,8), second=1:7, key="first")
test(771, b[,third:=a[b,third,by=.EACHI]], error="Supplied 2 items to be assigned to 7 items of column 'third'")
test(772, b[,third:=as.list(a[b,third,by=.EACHI])], error="Supplied 2 items to be assigned to 7 items of column 'third'")
test(773, b[,third:=a[b,third,mult="first"]], ans<-data.table(first=c(3,4,4,5,6,7,8), second=1:7, third=c(1,3,3,3,4,NA,NA), key="first"))
test(774, b[,third:=a[b,third]], ans) # mult="first" no longer needed as from v1.9.3. It now does what was naturally expected.
# That names are dropped. (Names on the column vectors don't display. They increase size and aren't much use.)
DT = data.table(a=1:3,b=LETTERS[1:3])
map = c("A"="Foo",B="Bar",C="Baz")
DT[,b:=map[b]]
test(775, names(DT$b), NULL)
# Test that names of named vectors don't carry through, #2307.
DT = data.table(a=1:3,b=c("a"="a","b"="a","c"="b"))
test(776, names(DT$b), NULL) # From v1.8.11, data.table() drops vector names
DT = data.table(a=1:3,b=c("a","a","b"))
setattr(DT$b, "names", c("a","b","c")) # Force names in there to test #2307
test(777, names(DT$b), c("a","b","c"))
test(778, DT[,sum(a),by=b], data.table(b=c("a","b"),V1=c(3L,3L))) #2307 retained names length 3 on the length 2 vector result causing it not to print.
test(779, print(DT[,sum(a),by=b]), output=" b V1\n1: a 3\n2: b 3$")
# Test new .GRP binding
test(780, data.table(a=1:3,b=1:6)[,i:=.GRP,by=a][,i2:=.GRP], data.table(a=1:3,b=1:6,i=rep(1:3,2),i2=1L))
# Test new .I binding
DT = data.table(a=1:4,b=1:8)
test(781, DT[,.I,by=a]$I, INT(1,5,2,6,3,7,4,8))
test(782, DT[,.I[which.max(b)],by=a], data.table(a=1:4,V1=5:8))
setkey(DT,a)
test(783, DT[,.I,by=a]$I, 1:8)
test(784, DT[,.I[which.max(b)],by=a], data.table(a=1:4,V1=INT(2,4,6,8),key="a"))
test(785, DT[J(2:4),.I,by=a%%2L], data.table(a=rep(0:1,c(4,2)),I=INT(3,4,7,8,5,6)))
test(786, DT[J(c(3,2,4)),list(.I,.GRP),by=.EACHI], data.table(a=rep(c(3L,2L,4L),each=2),I=INT(5,6,3,4,7,8),GRP=rep(1:3,each=2L)))
test(787, DT[J(3:2),`:=`(i=.I,grp=.GRP),by=.EACHI][,list(i,grp)], data.table(i=INT(NA,NA,3:6,NA,NA),grp=INT(NA,NA,2,2,1,1,NA,NA)))
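# Not a test: a small sketch (throwaway local table) of what .I and .GRP expose per group:
# .I gives the row locations within x (after any i subset) and .GRP the group counter.
invisible(local({
  d = data.table(g = c("a","a","b"), v = 1:3)
  d[, .(rows = list(.I), grp = .GRP), by = g]   # a: rows 1,2 grp 1; b: row 3 grp 2
}))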
# New not-join (a.k.a. not-select, since not just for data.table i but integer, logical and character too)
DT = data.table(A=rep(1:3,each=2),B=1:6,key="A")
test(788, DT[!J(2)], data.table(A=c(1L,1L,3L,3L),B=c(1L,2L,5L,6L),key="A"))
test(789, DT[!(2:6)], DT[1])
test(790, DT[!(2:6)], DT[!2:6]) # nicer than DT[-2:6] applying - to 2 first
test(791, DT[!6], DT[1:5])
test(792.1, DT[!rep(c(TRUE,FALSE),length.out=.N)], DT[rep(c(FALSE,TRUE),length.out=.N)])
test(792.2, DT[!A>=2], DT[A<2])
test(793, setkey(DT[,A:=letters[A]],A)[!c("b","c")], DT["a"])
test(794, DT[!"b"], DT[c("a","c")])
test(795, DT[!0], DT)
test(796, DT[!NULL], DT[NULL])
test(797, DT[!integer()], DT)
test(798, DT[!-1], DT[1])
test(799, DT[--1], DT[1])
myi = c("a","c")
test(800, DT[!myi], DT["b"])
test(801, DT[!"c",sum(B),by=A], data.table(A=c("a","b"),V1=c(3L,7L),key="A"))
test(802, DT[!"missing",sum(B),by=A], DT[,sum(B),by=A])
test(803, DT[!c("a","missing","b","missing2"),sum(B),by=A], DT["c",sum(B),by=.EACHI])
# Combining not-join with which
test(804, DT[!"b",which=TRUE], INT(1:2,5:6)) # row numbers in DT that don't match
# New which=NA value
test(805, DT[c("b","foo","c"),which=NA], 2L) # row numbers in i that don't match
test(806, DT[!c("b","foo","c"),which=NA], c(1L,3L)) # row numbers in i that do match
test(807, DT[!c("b","foo","c"),nomatch=0], error="not-join.*prefix is present on i.*Please remove nomatch")
test(808, DT[c("b","foo","c"),which=TRUE,nomatch=NA], INT(3:4,NA,5:6))
test(809, DT[c("b","foo","c"),which=TRUE,nomatch=0], INT(3:4,5:6))
test(810, DT[c("b","foo","c"),which=NA,nomatch=NA], 2L)
test(811, DT[c("b","foo","c"),which=NA,nomatch=0], error="which=NA with nomatch=0 would always return an empty vector[.] Please change or remove either which or nomatch")
# New notj for column names and positions when with=FALSE, #1384
DT = data.table(a=1:3,b=4:6,c=7:9)
# old tests using with=FALSE retained. Eventually will deprecate with=FALSE.
test(812.1, DT[,!"b",with=FALSE], DT[,-match("b",names(DT)),with=FALSE])
test(812.2, DT[,"foo",with=FALSE], error="column(s) not found: foo")
test(812.3, DT[,!"foo",with=FALSE], DT, warning="column(s) not removed because not found: [foo]")
test(812.4, DT[,!c("b","foo"),with=FALSE], DT[,list(a,c)], warning="column(s) not removed because not found: [foo]")
test(812.5, DT[,!2:3,with=FALSE], DT[,-(2:3),with=FALSE]) # for consistency, but ! is really for character column names
mycols = "b"
test(812.6, DT[,!mycols,with=FALSE], DT[,list(a,c)])
test(812.7, DT[,-mycols,with=FALSE], DT[,list(a,c)])
mycols = 2
test(812.8, DT[,!mycols,with=FALSE], DT[,list(a,c)])
test(812.9, DT[,-mycols,with=FALSE], DT[,list(a,c)])
# new tests for v1.12.0 to cover #3216 and #3217 (CsubsetDT with i=NULL was new and is used more by [.data.table; these check rownames)
test(813.1, rownames(DT[,!"b"]), rn<-c("1","2","3"))
test(813.2, rownames(DT[,-"b"]), rn)
test(813.3, rownames(DT[,"a"]), rn)
test(813.4, rownames(DT[2,"a"]), "1")
# also repeat 812.* but without with=FALSE since that will be deprecated in future, and cover - as well as !
test(814.01, DT[,!"b"], DT[,c("a","c")])
test(814.02, DT[,-"b"], DT[,c("a","c")])
test(814.03, DT[,"foo"], error="column(s) not found: foo")
test(814.04, DT[,!"foo"], DT, warning="column(s) not removed because not found: [foo]")
test(814.05, DT[,-"foo"], DT, warning="column(s) not removed because not found: [foo]")
test(814.06, DT[,!c("b","foo")], DT[,list(a,c)], warning="column(s) not removed because not found: [foo]")
test(814.07, DT[,-c("b","foo")], DT[,list(a,c)], warning="column(s) not removed because not found: [foo]")
test(814.08, DT[,!2:3], DT[,"a"]) # for consistency, and ! avoids needing to wrap with () as in next test
test(814.09, DT[,-(2:3)], DT[,"a"])
mycols = "b"
test(814.10, DT[,!..mycols], ans<-data.table(a=1:3, c=7:9))
test(814.11, DT[,-..mycols], ans)
mycols = 2
test(814.12, DT[,!..mycols], ans)
test(814.13, DT[,-..mycols], ans)
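# Not a test: a brief sketch of the ..var prefix used just above; the symbol after .. is looked up in the
# calling scope rather than being treated as a column name. Object names here are illustrative only.
invisible(local({
  d = data.table(a = 1:2, b = 3:4, c = 5:6)
  keep = c("a", "c")
  d[, ..keep]     # same as d[, c("a","c")]
  d[, !..keep]    # drop those columns instead, leaving b
}))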
# Test X[Y] slowdown, #2216
# Many minutes in 1.8.2! Now well under 1s, but we allow 10s as a very wide tolerance for CRAN. We'd like CRAN to tell us if any changes
# in R or elsewhere cause the 2-minute (!) bug to return. Hence not moving out to benchmark.Rraw.
X = CJ(a=seq_len(1e3),b=seq_len(1e3))
Y = copy(X)
X[4,b:=3L] # create a dup group, to force allLen1=FALSE
setkey(X)
test(819, system.time(X[Y,allow.cartesian=TRUE])["user.self"] < 10) # this system.time usage ok in this case
test(820, system.time(X[Y,mult="first"])["user.self"] < 10) # this system.time usage ok in this case
# Optimization of lapply(,"+"), #2212
DT = data.table(a=rep(1:3,each=2L),b=1:6,c=7:12)
ans = data.table(a=rep(1:3,each=2L),b=INT(2,3,5,6,8,9),c=INT(8,9,11,12,14,15))
test(821, DT[,lapply(.SD, "+", a), by=a], ans)
test(822, DT[,lapply(.SD, `+`, a), by=a], ans)
ans = data.table(a=1:3,b=INT(4,9,14),c=INT(16,21,26))
test(823, DT[,lapply(.SD, "sum", a), by=a], ans)
test(824, DT[,lapply(.SD, sum, a), by=a], ans)
test(825, DT[,lapply(.SD, `sum`, a), by=a], ans)
DT[2,b:=NA_integer_]
test(825.1, DT[,lapply(.SD, function(x)sum(x)), by=a], data.table(a=1:3,b=INT(NA,7,11),c=INT(15,19,23)))
test(825.2, DT[,lapply(.SD,function(x,...)sum(x,...),na.rm=TRUE),by=a], data.table(a=1:3,b=INT(1,7,11),c=INT(15,19,23)))
test(825.3, DT[,lapply(.SD,sum,na.rm=TRUE),by=a], data.table(a=1:3,b=INT(1,7,11),c=INT(15,19,23)))
# Test illegal names in merge are ok and setcolorder length error, #2193(i) and #2090
DT1 = data.table(a=letters[1:5], "Illegal(name%)"=1:5, key="a")
DT2 = data.table(a=letters[1:5], b=6L, key="a")
test(826, merge(DT1,DT2), cbind(DT1,b=6L))
test(827, merge(DT2,DT1), cbind(DT2,"Illegal(name%)"=1:5))
a=data.table('User ID'=c(1,2,3), 'Blah Blah'=c(1,2,3), key='User ID') #2090's test
b=data.table('User ID'=c(1,2,3), 'Yadda Yadda'=c(1,2,3), key='User ID')
test(827.1, names(a[b]), c("User ID","Blah Blah","Yadda Yadda"))
# setcolorder and merge check for dup column names, #2193(ii)
setnames(DT2,"b","a")
test(828, setcolorder(DT2,c("a","b")), error="x has some duplicated column name(s): a. Please remove or rename")
test(829, merge(DT1,DT2), error="y has some duplicated column name(s): a. Please remove or rename")
test(830, merge(DT2,DT1), error="x has some duplicated column name(s): a. Please remove or rename")
# attribs such as "comments" should be retained, #2270
DT1 <- data.table(id = seq.int(1, 10), A = LETTERS[1:10], key = "id")
comment(DT1$A) <- "first comment" # copies, setattr would be better as on next line
DT2 <- data.table(id = seq.int(2, 10, 2), b = letters[1:5], key = "id")
setattr(DT2$b,"comment","second comment")
test(831, comment(DT1[DT2]$A), "first comment")
test(832, comment(DT2[DT1]$b), "second comment")
test(833, sapply(merge(DT1,DT2),comment), list(id=NULL, A="first comment", b="second comment"))
test(834, comment(DT1[2:3]$A), "first comment")
# Test that matrix RHS of := is caught, #2333
DT = data.table(a=1:3)
DT[,a:=scale(a)] # 1 column matrix auto treated as vector
test(835, na.omit(DT), DT)
test(836, DT[,a:=as.integer(a)], data.table(a=INT(-1,0,1)))
test(837, DT[,a:=cbind(1,2)],
warning = "2 column matrix RHS of := will be treated as one vector",
error = "Supplied 2 items to be assigned to 3 items of column 'a'")
DT = data.table(a=1:3,b=1:6)
test(838, DT[,c:=scale(b), by=a][,c:=as.integer(1000*c)], data.table(a=1:3,b=1:6,c=rep(as.integer(1000*scale(1:2)), each=3)))
# Test data.table's last(). (last is used internally in data.table, too).
# Compatibility with xts::last is tested in other.Rraw
test(839, last(1:10), 10L)
DT = data.table(a=1:3,b=4:6)
test(840, last(DT), DT[3L])
# Test L[[1L]][,:=] updates by reference, #2204
l = list(data.table(a=1:3), data.table(b=4:6))
test(843, l[[2L]][,c:=7:9], data.table(b=4:6,c=7:9))
test(844, l, list(data.table(a=1:3), data.table(b=4:6,c=7:9)))
names(l) = c("foo","bar") # R >= 3.1 no longer copies all the contents, yay
test(845, l[["foo"]][2,d:=4L], data.table(a=1:3,d=c(NA,4L,NA)))
l = list(data.table(a=1:3), data.table(b=4:6))
setattr(l,"names",c("foo","bar"))
test(846, l[["foo"]][2,d:=4], data.table(a=1:3,d=c(NA,4,NA)))
test(847, l, list(foo=data.table(a=1:3,d=c(NA,4,NA)), bar=data.table(b=4:6)))
options(datatable.alloccol=0L)
l = list(foo=data.table(a=1:3,b=4:6),bar=data.table(c=7:9,d=10:12)) # list() doesn't copy the NAMED==0 objects here
test(848, truelength(l[[1L]]), 2L)
test(849, {l[[1L]][,e:=13:15]; l[[1L]]}, data.table(a=1:3,b=4:6)[,e:=13:15])
test(850, truelength(l[[1L]]), 3L)
test(851, truelength(l[[2L]]), 2L)
options(datatable.alloccol=1L)
l[["bar"]][,f:=16:18]
test(852, truelength(l[[2L]]), 4L)
options(datatable.alloccol=1024L)
# Now create the list from named objects
DT1 = data.table(a=1:3, b=4:6)
DT2 = data.table(c=7:9)
l = list(DT1, DT2)
# From R>=3.1, list() no longer copies NAMED inputs (a very welcome change in Rdevel, r63767)
test(853, address(DT1) == address(l[[1L]]))
test(854, l[[1]][,d:=10:12], data.table(a=1:3,b=4:6,d=10:12))
test(855, l[[1]], data.table(a=1:3,b=4:6,d=10:12))
# Test setnames on data.frame, #2273.
DF = data.frame(foo=1:2,bar=3:4)
setnames(DF,c("baz","qux"))
test(856, DF, data.frame(baz=1:2,qux=3:4))
test(857.1, set(DF,NULL,"quux",5:6), error="set() on a data.frame is for changing existing columns, not adding new ones")
test(857.2, set(DF,NULL,3L,5:6), error="set() on a data.frame is for changing existing columns, not adding new ones")
test(858.1, set(DF,NULL,"qux",5:6), data.frame(baz=1:2, qux=5:6))
test(858.2, set(DF,NULL,2L,7:8), data.frame(baz=1:2, qux=7:8))
# Test DT[J(data.frame())], #2265
DT = data.table(foo=c(1,2,3), bar=c(1.1,2.2,3.3), key="foo")
i = data.frame(foo=1)
test(859, DT[i], DT[J(i)])
test(860, DT[i], DT[data.table(i)])
# test no memory leak, #2191 and #2284
# These take a few seconds each, and it's important to run these on CRAN to check no leak
gc(); before = gc()["Vcells","(Mb)"]
for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB
gc(); after = gc()["Vcells","(Mb)"]
test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin
gc(); before = gc()["Vcells","(Mb)"]
DF = data.frame(x=1:20, y=runif(20))
for (i in 1:2000) { DT = as.data.table(DF); rm(DT) }
gc(); after = gc()["Vcells","(Mb)"]
test(862, after < before+0.5)
gc(); before = gc()["Vcells","(Mb)"]
DT = data.table(x=1:20, y=runif(20))
for (i in 1:2000) { x <- DT[1:5,]; rm(x) }
gc(); after = gc()["Vcells","(Mb)"]
test(863, after < before+0.5)
# rbindlist should look for the first non-empty data.table - new behaviour (from Arun). Explanation:
# Even if a data.table is empty, as long as it has column names they should be considered
# (e.g. if all the data.tables are empty, the column names still have to come from somewhere).
# If an item has no names, the first non-empty set of names among the items is used.
test(864.1, rbindlist(list(data.table(foo=logical(0),bar=logical(0)), DT<-data.table(baz=letters[1:3],qux=4:6))),
setnames(DT, c("foo", "bar")),
message="Column 1 [[]'baz'[]] of item 2 is missing in item 1.*Use fill=TRUE.*or use.names=FALSE.*v1.12.2") # test 676 tests no warning when use.names=FALSE
test(864.2, rbindlist(list(list(logical(0),logical(0)), DT<-data.table(baz=letters[1:3],qux=4:6))), DT)
test(864.3, rbindlist(list(data.table(logical(0),logical(0)), DT<-data.table(baz=letters[1:3],qux=4:6))),
setnames(DT, c("V1", "V2")),
message="Column 1 [[]'baz'[]] of item 2 is missing in item 1.*Use fill=TRUE.*or use.names=FALSE.*v1.12.2")
# Steve's find that setnames failed for numeric 'old' when pointing to duplicated names
DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6)
options(datatable.optimize = 0L)
test(865.1, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by="a,b",verbose=TRUE], output="(GForce FALSE)")
options(datatable.optimize = 1L)
test(865.2, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by="a,b",verbose=TRUE], output="(GForce FALSE)")
options(datatable.optimize = 2L)
test(865.3, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by="a,b",verbose=TRUE],
output="GForce optimized.*gsum[(]v[)], gsum[(]w[)]") # v1.9.7 treats wrapped {} better, so this is now optimized
options(datatable.optimize = Inf)
test(866, names(ans1), c("a","b","name1","name2"))
test(867, names(ans2<-DT[,list(name1=sum(v),name2=sum(w)),by="a,b"]), c("a","b","name1","name2")) # list names extracted here
test(868, ans1, ans2)
# and related to setnames, too
DT = data.table(a=1:3,b=1:6,key="a")
test(869, DT[J(2,42,84),print(.SD),by=.EACHI], output=" b\n.*1.*2\n2:.*5.*Empty data.table [(]0 rows and 3 cols[)]: a,V2,V3") # .* for when verbose mode
# Test setnames with duplicate colnames
DT = data.table(a=1:3,b=4:6,b=7:9)
test(870, setnames(copy(DT),"b","foo"), data.table(a=1:3, foo=4:6, b=7:9),
warning="Item 1 of 'old' is 'b' which appears several times in column names.*There are 0 other")
test(871, setnames(DT,c("bar","bar"),c("x","y")), error="Some duplicates exist in 'old': [bar]")
test(872, setnames(DT,3,"c"), data.table(a=1:3,b=4:6,c=7:9))
test(873, setnames(DT,"foo","bar"), error="Items of 'old' not found in column names: [foo]")
test(874, setnames(DT,c(1,1),c("foo","bar")), error="Some duplicates exist in 'old': [1]")
test(875, setnames(DT,"c","b"), data.table(a=1:3,b=4:6,b=7:9))
test(875.1, setnames(DT,"a","c"), data.table(c=1:3,b=4:6,b=7:9)) # 'a' isn't duplicated so not a problem as from v1.8.11
test(875.2, setnames(DT,c("c","b"), c("C","B")), data.table(C=1:3, B=4:6, b=7:9),
warning="Item 2 of 'old' is 'b' which.*There are 0 other")
DT = data.table(a=1:3,b=4:6,b=7:9,a=10:12)
test(875.3, setnames(DT,c("a","b"),c("A","B")), data.table(A=1:3, B=4:6, b=7:9, a=10:12),
warning="Item 1 of 'old' is 'a' which.*There are 1 other")
# Test local var problem introduced in v1.8.3
DT = data.table(a=1:3,b=1:6)
f = function() {
localvar = 2
print(DT[a>localvar])
print(DT[a>localvar,sum(b)])
print(DT[a>localvar,sum(b),by=a]) # bug fix 2368
}
test(876, f(), output=" a b\n1: 3 3\n2: 3 6.*[[]1[]] 9.* a V1\n1: 3 9")
# segfault when assigning NA names, #2393
DT = data.table(a=1:3, b=4:6)
test(877, setnames(DT, c(NA, NA)), error="Passed a vector of type 'logical'. Needs to be type 'character'")
# test no warning when use.names is explicitly set, #2385 - changed 'warning' to 'message' since we just check whether use.names is missing, due to C-level changes.
# the message is commented out for now until confirmation with Matt.
test(878, rbind(data.table(a=1:3,b=4:6), data.table(b=7:9,a=4:6)), data.table(a=1:6,b=4:9)) #, message="Columns will be bound by name for consistency with base")
test(879, rbind(data.table(a=1:3,b=4:6), data.table(b=7:9,a=4:6), use.names=TRUE), data.table(a=1:6,b=4:9))
# Test fread()
n=110 # 110 just to be over the 100 limit for printing head, as a convenience
DT = data.table( a=sample(1:1000,n,replace=TRUE),
b=sample(1:1000,n,replace=TRUE)-500L,
c=rnorm(n),
d=sample(c("foo","bar","baz","qux","quux"),n,replace=TRUE),
e=rnorm(n),
f=sample(1:1000,n,replace=TRUE) )
DT[2,b:=NA_integer_]
DT[4,c:=NA_real_]
DT[3,d:=NA_character_]
DT[5,d:=""]
DT[2,e:=+Inf]
DT[3,e:=-Inf]
DT[4,e:=NaN] # write.table writes NaN as NA, though, and all.equal considers NaN==NA. fread would read NaN as NaN if "NaN" was in file
write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=FALSE) # na="NA" seems like a bad default for string columns here
test(880, fread(f), as.data.table(read.csv(f,stringsAsFactors=FALSE)))
test(881, fread(f), DT)
# test that columns are not coerced if na.strings=NULL
DT[3,d:="NA"]
test(882, fread(f,na.strings=NULL)[['d']], DT[['d']])
DT[3,d:=NA_character_]
unlink(f)
write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=TRUE)
test(883, fread(f), as.data.table(read.csv(f,stringsAsFactors=FALSE)))
test(884, fread(f), DT)
unlink(f)
# Test short files.
# All the unlinks and using a new file each time are to work around apparent Windows issues when writing, appending to,
# and rereading (possibly via MapViewOfFile) the same file that has just been appended to. These apparent issues have only
# shown up on winbuilder so far, so might be in combination with the D: tempdir() there; perhaps D: is on a network drive or something.
test(885.1, fread(""), error="empty")
test(885.2, fread(), error="empty")
cat("", file=f<-tempfile()); test(885.3, fread(f), data.table(NULL), warning="size 0. Returning a NULL data.table"); unlink(f)
cat(" ", file=f<-tempfile()); test(885.4, fread(f), error="either empty.*whitespace.*or skip has"); unlink(f)
test(885.5, fread(" "), error="space")
test(886, fread("\n"), error="empty")
test(887, fread(" \n\t \t \n \n "), error="empty")
cat("A", file=f<-tempfile()); test(888, fread(f), data.table(A=logical())); unlink(f)
test(889, fread("A\n"), data.table(A=logical()))
cat("AB,CDE",file=f<-tempfile()); test(890, fread(f), data.table(AB=logical(),CDE=logical())); unlink(f)
test(891, fread("AB,CDE\n"), data.table(AB=logical(),CDE=logical()))
cat("3.14",file=f<-tempfile()); test(892, fread(f), data.table(V1=3.14)); unlink(f)
cat("1.23\n3.14",file=f<-tempfile()); test(892.1, fread(f), data.table(V1=c(1.23,3.14))); unlink(f)
cat("A,3",file=f<-tempfile()); test(893, fread(f), data.table(V1="A",V2=3L)); unlink(f)
if (.Platform$OS.type=="unix") test(893.5, fread("A,B\r\n\r\n"), data.table(A=logical(),B=logical()))
eols = c("\n", "\r\n", "\r", "\n\r", "\r\r\n")
for (nr in c(0,1,2,3,4,55,98,99,100,101,102)) { # include around first 100 line threshold
for (nc in c(0,1,2)) { # 0 means all cols here
for (ne in seq_along(eols)) {
eol = eols[ne]
headDT = head(DT,nr)[,seq_len(if (nc==0) ncol(DT) else nc),with=FALSE]
if (nr==0) for (j in seq_len(ncol(headDT))) set(headDT,j=j,value=logical()) # when read back in empty cols are the lowest type (logical)
f = tempfile()
lines = capture.output(fwrite(headDT, verbose=FALSE))
cat(paste(lines,collapse=eol), file=f, sep="") # so last line abruptly ends (missing last eol) to test that, otherwise could just pass eol to fwrite
# on unix we simulate Windows too. On Windows \n will write \r\n (and \r\n will write \r\r\n)
num = 894 + nr/100 + nc/1000 + ne/10000
# if (isTRUE(all.equal(testIDtail, 0.4103))) browser()
test(num+0.00001, fread(f,na.strings=""), headDT)
cat(eol,file=f,append=TRUE) # now a normal file properly ending with final \n
test(num+0.00002, fread(f,na.strings=""), headDT)
cat(eol,file=f,append=TRUE) # extra \n should be ignored other than for single columns where it is significant
test(num+0.00003, fread(f,na.strings=""), if (nc==1) rbind(headDT, list(NA)) else headDT)
unlink(f)
}}}
if (test_bit64) {
n = 2100
# To test out-of-sample type diffs, nrow needs to be more than 100*10*2. Under that, all rows are sampled.
DT = data.table( a=sample(1:1000,n,replace=TRUE),
b=sample(as.integer64(2)^35 * 1:10, n, replace=TRUE),
c=sample(c("foo","bar","baz"),n,replace=TRUE) )
fwrite(DT,f<-tempfile())
test(897, class(DT$b), "integer64")
test(898, fread(f), DT)
unlink(f)
DT[,a2:=as.integer64(a)][,a3:=as.double(a)][,a4:=gsub(" ","",format(a))]
DT[,b2:=as.double(b)][,b3:=gsub(" ","",format(b))]
DT[,r:=a/100][,r2:=gsub(" ","",format(r))]
DT[112, a2:=as.integer64(12345678901234)] # start on row 112 to avoid the first 100
DT[113, a3:=3.14]
DT[114, a4:="123A"]
DT[115, b2:=1234567890123.45]
DT[116, b3:="12345678901234567890A"] # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR)
DT[117, r2:="3.14A"]
fwrite(DT,f<-tempfile())
test(899.1, fread(f, verbose=TRUE), DT, output="Rereading 6 columns.*out-of-sample.*Column 4.*a2.*int32.*int64.*<<12345678901234>>.*Column 10.*r2.*float64.*string.*<<3.14A>>")
test(899.2, fread(f, colClasses=list(character=c("a4","b3","r2"), integer64="a2", double=c("a3","b2")), verbose=TRUE),
DT, output="Rereading 0 columns due to out-of-sample type exceptions")
test(899.3, fread(f, integer64="character", select=c("a","b","c")), DT[, .(a, b=as.character(b), c)])
# leaving integer64='character' version of 899.1,899.2 until #2749 is fixed
unlink(f)
}
# getwd() has been set by test.data.table() to the location of this tests.Rraw file. Test files should be in the same directory.
if (test_R.utils) {
f = testDir("ch11b.dat.bz2") # http://www.stats.ox.ac.uk/pub/datasets/csb/ch11b.dat
test(900.1, fread(f, logical01=FALSE), as.data.table(read.table(f)))
test(900.2, fread(f, logical01=TRUE), as.data.table(read.table(f))[,V5:=as.logical(V5)])
f = testDir("1206FUT.txt.bz2") # a CRLF line ending file (DOS)
test(901.1, DT<-fread(f,strip.white=FALSE), setDT(read.table(f,sep="\t",header=TRUE,colClasses=as.vector(sapply(DT,class)))))
test(901.2, DT<-fread(f), setDT(read.table(f,sep="\t",header=TRUE,colClasses=as.vector(sapply(DT,class)),strip.white=TRUE)))
}
# Test the coercion of column 23 to character on line 179 due to the 'A' appearing there for the first time.
# As from v1.9.8 the columns are guessed better and there is no longer a warning. Test 899 tests the warning.
# Columns 'Cancelled' and 'Diverted' seem boolean (so logical01=TRUE would be a good default for those) but Month just happens to be all-Jan
f = testDir("2008head.csv.bz2")
if (test_R.utils) test(902, fread(f,logical01=FALSE), as.data.table(read.csv(f,stringsAsFactors=FALSE)))
test(903, fread("A,B\n1,3,foo,5\n2,4,barbaz,6"), data.table(A=1:2, B=3:4, V3=c("foo","barbaz"), V4=5:6),
warning="Detected 2 column names but.*4.*Added 2 extra default column names at the end")
test(904, fread("A,B,C,D\n1,3,foo,5\n2,4,barbaz,6"), DT<-data.table(A=1:2,B=3:4,C=c("foo","barbaz"),D=5:6)) # ok
test(905, fread('A,B,C,D\n1,3,foo,5\n2,4,"barbaz",6'), DT)
test(906, fread('A,B,C,D\n1,3,foo,5\n2,4,"ba,r,baz",6'), DT[2,C:="ba,r,baz"])
test(907, fread('A,B,C,D\n1,3,foo,5\n2,4,"ba,\\"r,baz",6'), DT[2,C:='ba,\\"r,baz']) # \" protected ok, but \ needs taking off too (TO DO)
test(908, fread("A,B,C\n1,3,\n2,4,\n"), data.table(A=1:2,B=3:4,C=NA)) # where NA is type logical
test(909, fread("
Date and Time,Open,High,Low,Close,Volume
2007/01/01 22:51:00,5683,5683,5673,5673,64
2007/01/01 22:52:00,5675,5676,5674,5674,17
2007/01/01 22:53:00,5674,5674,5673,5674,42
")$Open, c(5683L,5675L,5674L)) # , splits all rows consistently and is also higher precedence than ' '
# blanks when testing if header row is all character
test(910, fread("
02-FEB-2009,09:55:04:962,26022009,2500,PE,36,500,44,200,11850,1100,,2865.60
02-FEB-2009,09:55:04:987,26022009,2800,PE,108.75,200,111,50,11700,1450,,2865.60
02-FEB-2009,09:55:04:939,26022009,3100,CE,31.1,3000,36.55,200,3500,5250,,2865.60
")$V13, rep(2865.60,3))
test(911, fread("02-FEB-2009,09:55:04:962,26022009,2500,PE,36,500,44,200,11850,1100,,2865.60
02-FEB-2009,09:55:04:987,26022009,2800,PE,108.75,200,111,50,11700,1450,,2865.60
02-FEB-2009,09:55:04:939,26022009,3100,CE,31.1,3000,36.55,200,3500,5250,,2865.60")$V13, rep(2865.60,3))
# Check manually setting separator
txt = "A;B;C|D,E\n1;3;4|5,6\n2;4;6|8,10\n"
test(912, names(fread(txt)), c("A","B","C|D,E")) # ';' splits into the most columns so is detected as the separator
test(913.1, fread(txt,sep=";"), data.table(A=1:2,B=3:4,"C|D,E"=c("4|5,6","6|8,10")))
test(913.2, fread(txt,sep=","), data.table("A;B;C|D"=c("1;3;4|5","2;4;6|8"), "E"=c(6L,10L)))
test(914, fread(txt,sep="*"), data.table("A;B;C|D,E"=c("1;3;4|5,6","2;4;6|8,10")))
test(915, fread(txt,sep="\n"), data.table("A;B;C|D,E"=c("1;3;4|5,6","2;4;6|8,10"))) # like a fast readLines
# Crash bug when RHS is 0 length and := by group, fixed in 1.8.7
# This test was changed in PR#3310 for v1.12.2 and then reverted in dev before release (#3386) back to previous behavior
DT = data.table(a=1:3,b=1:6)
test(916, DT[,newcol:=logical(0),by=a], data.table(a=1:3,b=1:6,newcol=NA))
# roll join error when non last join column is factor, #2450
X = data.table(id=2001:2004, uid=c(1001,1002,1001,1001), state=factor(c('CA','CA','CA','MA')), ts=c(51,52,53,54), key='state,uid,ts')
Y = data.table(id=3001:3004, uid=c(1001,1003,1002,1001), state=factor(c('CA','CA','CA','CA')), ts=c(51,57,59,59), key='state,uid,ts')
test(917.1, X[Y,roll=TRUE], data.table(id=INT(2001,2003,2002,NA), uid=c(1001,1001,1002,1003), state=factor('CA'), ts=c(51,59,59,57), i.id=INT(3001,3004,3003,3002), key='state,uid,ts'))
test(917.2, X[Y, on=c("id","state"), roll=TRUE], error="Attempting roll join on factor column when joining x.state to i.state")
# NA in join column of type double, #2453.
X = data.table(name=c("Joh","Raf","Jon","Ste","Rob","Smi"),depID=c(NA,31,33,33,34,34),key="depID")
Y = data.table(depID=c(31,33,34,35),depName=c("Sal","Eng","Cle","Mar"),key="depID")
test(918, Y[X], data.table(depID=c(NA,31,33,33,34,34),depName=c(NA,"Sal","Eng","Eng","Cle","Cle"),name=c("Joh","Raf","Jon","Ste","Rob","Smi"),key='depID')) # Y[X] same as merge.data.frame(X,Y,all.x=TRUE)
test(919, X[Y], data.table(name=c("Raf","Jon","Ste","Rob","Smi",NA), depID=c(31,33,33,34,34,35), depName=c("Sal","Eng","Eng","Cle","Cle","Mar"),key='depID'))
test(920, X[Y,nomatch=0], data.table(name=c("Raf","Jon","Ste","Rob","Smi"),depID=c(31,33,33,34,34),depName=c("Sal","Eng","Eng","Cle","Cle"),key='depID'))
test(921, Y[X,nomatch=0], data.table(depID=c(31,33,33,34,34),depName=c("Sal","Eng","Eng","Cle","Cle"),name=c("Raf","Jon","Ste","Rob","Smi"),key='depID'))
# setnames bug on a keyed table, when the full vector is given and the key columns aren't in positions 1:length(key)
DT = data.table(a=1:2,b=3:4,c=5:6,key="b")
test(922, setnames(DT,c("A","B","C")), data.table(A=1:2,B=3:4,C=5:6,key="B"))
# vecseq overflow, crash bug #2464
DT = data.table(x=rep(1L,50000),key="x")
test(923, DT[DT], error="Join results in more than 2^31 rows (internal vecseq reached physical limit). Very likely misspecified join.")
X = data.table(x=1:2,y=1:6,key="x")
test(924.1, X[J(c(1,1,1))], setkey(X[rep(1:3,3)],NULL))
test(924.2, X[J(c(1,1,1,1))], error="Join results in 12 rows; more than 10 = nrow(x)+nrow(i). Check for duplicate key values in i each of")
# sorting of 'double' columns not correct for ties (tolerance nuance in C code), #2484
DT = data.table(X=as.POSIXct( c(rep("15DEC2008:00:00:00",10),"15DEC2008:00:00:00",rep("17DEC2008:00:00:00",2)),format="%d%b%Y:%H:%M:%S"),Y=c(1534,61,74,518,519,1519,1520,1524,3127,29250,30609,43,7853))
setkey(DT,X,Y)
test(925, DT[,base::order(X,Y)], 1:nrow(DT))
# Test new dogroup warning for zero length columns in result when other columns are >1, #2478
DT = data.table(a=1:3,b=1:6)
test(926, DT[, if(a==2L) list(42:43,NULL) else list(42L,3.14), by=a], data.table(a=INT(1,2,2,3),V1=INT(42,42,43,42),V2=c(3.14,NA,NA,3.14)), warning="Item 2 of j's result for group 2 is zero length. This will be filled with 2 NAs to match the")
test(927, DT[, if(a==2L) list(42:43,numeric()) else list(42L,3.14), by=a], data.table(a=INT(1,2,2,3),V1=INT(42,42,43,42),V2=c(3.14,NA,NA,3.14)), warning="Item 2 of j's result for group 2 is zero length. This will be filled with 2 NAs to match the")
# And the root cause of #2478: that cbind(DT,1:3) created invalid data.table with empty column
test(928, cbind(data.table(a=1L),b=1:3), data.table(a=1L,b=1:3))
# FR #362 implementation resulted in changing 929 error to warning
# test(929, cbind(data.table(a=1L,b=2:3),c=1:3), error="argument 1 (nrow 2) cannot be recycled without remainder to match longest nrow (3)")
test(929, cbind(data.table(a=1L,b=2:3),c=1:3), data.table(a=1L, b=c(2L,3L,2L), c=1:3), warning="Item 1 has 2 rows but longest item has 3; recycled.")
test(930, cbind(data.table(a=1L,b=2:3),c=1:4), data.table(a=1L,b=INT(2,3,2,3),c=1:4)) # TODO: warning in future
DT = data.table(x=c(1,1,1,1,2,2,3),y=c(1,1,2,3,1,1,2))
DT[,rep:=1L][c(2,7),rep:=c(2L,3L)] # duplicate row 2 and triple row 7
DT[,num:=1:.N] # to group each row by itself
test(931, DT[,cbind(.SD,dup=1:rep),by="num"], data.table(num=INT(1,2,2,3:7,7,7),x=c(1,1,1,1,1,2,2,3,3,3),y=c(1,1,1,2,3,1,1,2,2,2),rep=INT(1,2,2,1,1,1,1,3,3,3), dup=INT(1,1,2,1,1,1,1,1,2,3)))
# New roll=+/- and rollends
DT = data.table(a=INT(1,3,4,4,4,4,7), b=INT(5,5,6,6,9,9,2), v=1:7, key="a,b")
test(932, DT[J(c(0,2,6,8)), roll=+Inf, rollends=TRUE, v], INT(1,1,6,7))
test(933, DT[J(c(0,2,6,8)), roll=-Inf, rollends=TRUE, v], INT(1,2,7,7))
test(934, DT[J(c(0,2,6,8)), roll=+Inf, v], INT(NA,1,6,7))
test(935, DT[J(c(0,2,6,8)), roll=-Inf, v], INT(1,2,7,NA))
test(936, DT[J(c(-10,-1,2,12,13)), roll=5, rollends=TRUE, v], INT(NA,1,1,7,NA))
test(937, DT[J(c(-10,-1,2,12,13)), roll=-5, rollends=TRUE, v], INT(NA,1,2,7,NA))
test(938, DT[J(c(-10,2,6,7,8)), roll="nearest", v], INT(1,1,7,7,7))
test(939, DT[J(c(-10,2,6,7,8)), roll="nearest", rollends=c(TRUE,FALSE), v], INT(1,1,7,7,NA))
test(940, DT[J(c(-10,2,6,7,8)), roll="nearest", rollends=c(FALSE,TRUE), v], INT(NA,1,7,7,7))
test(941, DT[J(c(-10,2,6,7,8)), roll="nearest", rollends=FALSE, v], INT(NA,1,7,7,NA))
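# Not a test: a compact sketch of roll= and rollends= on a throwaway keyed table, restating the behaviour
# exercised in tests 932-941: roll=+Inf carries the previous observation forward, and rollends controls
# whether values before the first / after the last observation are filled too.
invisible(local({
  d = data.table(t = c(2L,4L,6L), v = 1:3, key = "t")
  d[J(c(1L,3L,7L)), v, roll=+Inf]                  # NA (before first), 1L (rolled), 3L (rolled at end)
  d[J(c(1L,3L,7L)), v, roll=+Inf, rollends=TRUE]   # 1L, 1L, 3L : the start is rolled backwards too
}))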
# merge all=TRUE with space in a y column name, #2555
X = data.table(a=1:3,b=4:6)
Y = data.table(a=2:4,"d 1"=5:7) # space in Y's column name
test(942, merge(X,Y,all=TRUE,by="a"), data.table(a=1:4,b=INT(4:6,NA),"d 1"=INT(NA,5:7),key="a"))
test(943, merge(X,Y,all.y=TRUE,by="a"), data.table(a=2:4,b=INT(5:6,NA),"d 1"=5:7,key="a"))
# Test := behaviour and messages on a NULL (0-column) data.table
DT = data.table(NULL)
test(944.1, DT[, foo:=NULL], DT, warning="Column 'foo' does not exist to remove")
test(944.2, DT[,a:=1L], data.table(a=1L)) # can now add columns to an empty data.table from v1.12.2
test(944.3, DT[,aa:=NULL], data.table(a=1L), warning="Column 'aa' does not exist to remove")
test(944.4, DT[,a:=NULL], data.table(NULL))
if (base::getRversion() >= "3.4.0") {
test(944.5, typeof(structure(NULL, class=c("data.table","data.frame"))), 'list', warning="deprecated, as NULL cannot have attributes") # R warns which is good and we like
}
DT = data.table(a=numeric())
test(945, DT[,b:=a+1], data.table(a=numeric(),b=numeric()))
# fread blank column names get default names
test(946, fread('A,B,,D\n1,3,foo,5\n2,4,bar,6\n'), data.table(A=1:2,B=3:4,c("foo","bar"),D=5:6))
test(947, fread('0,2,,4\n1,3,foo,5\n2,4,bar,6\n'), data.table(0:2,2:4,c("","foo","bar"),4:6))
test(948, fread('A,B,C\nD,E,F\n',header=TRUE), data.table(A="D",B="E",C="F"))
test(949, fread('A,B,\nD,E,F\n',header=TRUE), data.table(A="D",B="E",V3="F"))
# +/- with no numbers afterwards should read as character
test(950, fread('A,B,C\n1,+,4\n2,-,5\n3,-,6\n'), data.table(A=1:3,B=c("+","-","-"),C=4:6))
# catching misuse of `:=`
x = data.table(a=1:5)
test(951, x[,{b=a+3; `:=`(c=b)}], error="defined for use in j, once only and in particular ways")
# fread colClasses
input = 'A,B,C\n01,foo,3.140\n002,bar,6.28000\n'
test(952, fread(input, colClasses=c(C="character")), data.table(A=1:2,B=c("foo","bar"),C=c("3.140","6.28000")))
test(953, fread(input, colClasses=c(C="character",A="numeric")), data.table(A=c(1.0,2.0),B=c("foo","bar"),C=c("3.140","6.28000")))
test(954, fread(input, colClasses=c(C="character",A="double")), data.table(A=c(1.0,2.0),B=c("foo","bar"),C=c("3.140","6.28000")))
test(955, fread(input, colClasses=list(character="C",double="A")), data.table(A=c(1.0,2.0),B=c("foo","bar"),C=c("3.140","6.28000")))
test(956, fread(input, colClasses=list(character=2:3,double="A")), data.table(A=c(1.0,2.0),B=c("foo","bar"),C=c("3.140","6.28000")))
test(957, fread(input, colClasses=list(character=1:3)), data.table(A=c("01","002"),B=c("foo","bar"),C=c("3.140","6.28000")))
test(958, fread(input, colClasses="character"), data.table(A=c("01","002"),B=c("foo","bar"),C=c("3.140","6.28000")))
test(959.1, fread(input, colClasses=c("character","double","numeric")),
data.table(A=c("01","002"), B=c("foo","bar"), C=c(3.14,6.28)),
warning = "Attempt.*column 2 <<B>>.*inherent.*'string'.*down to.*'float64'")
test(959.2, fread(input, colClasses=c("character",NA,"numeric")),
data.table(A=c("01","002"),B=c("foo","bar"),C=c(3.14,6.28)))
test(960, fread(input, colClasses=c("character","double")),
error="colClasses= is an unnamed vector of types, length 2, but there are 3 columns.*you can")
test(961, fread(input, colClasses=1:3), error="colClasses is not type list or character vector")
test(962, fread(input, colClasses=list(1:3)), error="colClasses is type list but has no names")
test(963, fread(input, colClasses=list(character="D")), ans<-data.table(A=1:2, B=c("foo","bar"), C=c(3.14,6.28)), warning="Column name 'D' (colClasses[[1]][1]) not found")
test(964, fread(input, colClasses=c(D="character")), ans, warning="Column name 'D' (colClasses[[1]][1]) not found")
test(965, fread(input, colClasses=list(character=0)), ans, warning="Column number 0 (colClasses[[1]][1]) is out of range [1,ncol=3]")
test(966, fread(input, colClasses=list(character=2:4)), data.table(A=1:2, B=c("foo","bar"), C=c("3.140","6.28000")),
warning="Column number 4 (colClasses[[1]][3]) is out of range [1,ncol=3]")
# Character input more than 4096 bytes (used to be passed through path.expand which imposed the limit), #2649
test(967, nrow(fread( paste( rep('a\tb\n', 10000), collapse=''), header=FALSE)), 10000L)
# Test fread warns about removal of any footer (and autostart skips up over it)
test(968, fread("A,B\n1,3\n2,4\n\nRowcount: 2\n"), data.table(A=1:2,B=3:4), warning="Discarded single-line footer.*Rowcount: 2")
test(969, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2"), data.table(A=1:2,B=3:4), warning="Discarded single-line footer.*Rowcount: 2")
test(970, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2\n\n"), data.table(A=1:2,B=3:4), warning="Discarded single-line footer.*Rowcount: 2")
# fread skip override
input = "some,bad,data\nA,B,C\n1,3,5\n2,4,6\n"
test(971, fread(input), data.table(some=c("A",1:2),bad=c("B",3:4),data=c("C",5:6)))
test(972, fread(input, skip=1), data.table(A=1:2,B=3:4,C=5:6))
test(973, fread(input, skip=2), data.table(V1=1:2,V2=3:4,V3=5:6))
test(974, fread(input, skip=2, header=TRUE), data.table("1"=2L,"3"=4L,"5"=6L))
test(975, fread(input, skip="B"), data.table(A=1:2,B=3:4,C=5:6))
input = "\n\nA,B\n1,3\n2,4\n\nC,D\n5,7\n6,8\n\nE,F\n9,11\n10,12\n" # 3 tables in one file
test(976, fread(input), data.table(A=1:2, B=3:4), warning="Stopped early on line 6.*First discarded non-empty line: <<C,D>>")
test(977, fread(input, skip="C"), ans<-data.table(C=5:6, D=7:8), warning="Stopped early on line 10.*First discarded non-empty line: <<E,F>>")
test(978.1, fread(input, skip="D"), ans, warning="Stopped.*line 10.*<<E,F>>")
test(978.2, fread(input, skip=",F"), data.table(E=9:10, F=11:12))
test(978.3, fread(input, skip=9), data.table(E=9:10, F=11:12))
# mixed add and update in same `:=` bug/crash, #2528 and #2778
DT = data.table(x=rep(1:2, c(3,2)), y=6:10)
DT[, z:=.GRP, by=x] # first assignment
test(979, DT[, `:=`(z=.GRP, w=2), by=x], data.table(x=INT(1,1,1,2,2),y=6:10,z=INT(1,1,1,2,2),w=2)) # mixed update and add
# and example from http://stackoverflow.com/a/14732348/403310 :
dt1 = fread("Date,Time,A,B
01/01/2013,08:00,10,30
01/01/2013,08:30,15,25
01/01/2013,09:00,20,20
02/01/2013,08:00,25,15
02/01/2013,08:30,30,10
02/01/2013,09:00,35,5")
dt2 = fread("Date,A,B,C
01/01/2013,100,300,1
02/01/2013,200,400,2")
setkey(dt1, "Date")
setkey(dt2, "Date")
test(980, dt1[dt2, `:=`(A=A+i.A, B=B+i.B, C=i.C)][,list(A,B,C)],
data.table(A=INT(110,115,120,225,230,235),B=INT(330,325,320,415,410,405),C=rep(1:2,each=3)))
DT = data.table(A=1:2,B=3:4,C=5:6)
test(981, DT[,`:=`(D=B+4L,B=0:1,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], error="Supplied 2 items to be assigned to group 1 of size 1 in column 'B'")
DT = data.table(A=1:2,B=3:4,C=5:6)
test(982, DT[,`:=`(D=B+4L,B=0L,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A],
data.table(A=1:2,B=0L,C=6:7,D=7:8,E=c(2L,4L),F=c(3L,6L),G=c(10L,12L))) # Also note that G is not yet iterative. In future: c(12,14)
# rbindlist binding factors, #2650
test(983, rbindlist(list(data.table(factor(c("A","A","B","C","A"))), data.table(factor(c("B","F","A","G"))))), data.table(V1=factor(c("A","A","B","C","A","B","F","A","G"))))
test(984, rbindlist(list(data.table(factor(c("A","B"))), data.table(c("C","A")))), data.table(factor(c("A","B","C","A"))))
test(985, rbindlist(list(data.table(c("A","B")), data.table(factor(c("C","A"))))), data.table(factor(c("A","B","C","A"))))
# with NA
test(985.1, rbindlist(list(data.table(factor(c("A","B"))), data.table(factor(c("C",NA))))), data.table(factor(c("A","B","C",NA))))
test(985.2, rbindlist(list(data.table(c("A","B")), data.table(factor(c("C",NA))))), data.table(factor(c("A","B","C",NA))))
## Allow unique/duplicated to accept a custom column combination to query for
## uniqueness
dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = "A,B")
df <- as.data.frame(dt)
test(986, unique(dt, by=key(dt)), dt[!duplicated(df[, key(dt)]),])
test(987, unique(dt, by='A'), dt[!duplicated(df[, 'A'])])
test(988, unique(dt, by='B'), dt[!duplicated(df[, 'B'])])
test(989, unique(dt, by='C'), dt[!duplicated(df[, 'C'])])
test(990, unique(dt, by=c('B', 'C')), dt[!duplicated(df[, c('B', 'C')])])
test(991, unique(dt, by=NULL), dt[!duplicated(df)])
test(991.1, unique(dt, by=4), error="specify non existing column*.*4")
test(991.2, unique(dt, by=c(1,3.1)), error="is type 'double' and one or more items in it are not whole integers")
test(991.3, unique(dt, by=2:3), dt[!duplicated(df[,c('B','C')])])
test(991.4, unique(dt, by=c('C','D','E')), error="specify non existing column*.*D")
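# Not a test: a brief sketch of unique(DT, by=) on a throwaway table -- uniqueness is judged only on the
# columns named in by=, and the first row of each set of duplicates is kept.
invisible(local({
  d = data.table(A = c(1L,1L,2L), B = c("x","y","x"))
  unique(d, by="A")           # rows 1 and 3
  unique(d, by=c("A","B"))    # all 3 rows; no duplicates over both columns
}))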
# :=NULL on factor column in empty data.table, #114
DT = data.table(A = integer(), B = factor())
test(992, DT[, B:=NULL], data.table(A=integer()))
# That including FUN= works in j=lapply, #110
DT = as.data.table(iris)
test(993, DT[, lapply(.SD, function(x) sum(!is.na(x), na.rm=TRUE)), by = Species],
DT[, lapply(.SD, FUN=function(x) sum(!is.na(x), na.rm=TRUE)), by = Species])
# fread more than 50,000 columns, the R_PPSSIZE limit in Defn.h
# Takes too long for routine use. TO DO: move to a long running stress test script
#M = matrix(1,nrow=3,ncol=200000)
#f = tempfile()
#write.csv(M,f,row.names=FALSE)
#test(994, fread(f)[[200000]], rep(1L,3))
#unlink(f)
# CJ with `sorted = FALSE` option
DT <- data.table(x=rep(3:5, each=4), y=rep(1:6, each=2), z=1:12)
setkey(DT, x, y)
OUT <- DT[J(c(5,5,3,3), c(5,1,5,1))]
test(995, DT[CJ(c(5,3), c(5,1), sorted=FALSE)], OUT)
# CJ with ordered factor
xx <- factor(letters[1:2], ordered=TRUE)
yy <- sample(2L)
yy_sort = base::sort.int(yy)
old = options(datatable.CJ.names=FALSE)
test(996.01, CJ(xx, yy), setkey(data.table(rep(xx, each=2L), rep(yy_sort, 2L))))
test(996.02, CJ(a = xx, yy), setkey(data.table(a = rep(xx, each=2L), V2 = rep(yy_sort, 2L))))
options(datatable.CJ.names=TRUE)
test(996.03, CJ(xx, yy), setkey(data.table(xx = rep(xx, each=2L), yy = rep(yy_sort, 2L))))
options(old)
# #3597 -- CJ properly informs sorted can't apply to list input
test(996.04, CJ(list(1:2, 3L)), error = "non-atomic, which can't be sorted")
test(996.05, CJ(list(1:2, 3), 4:6, sorted = FALSE),
data.table(V1 = list(1:2, 1:2, 1:2, 3, 3, 3), V2 = rep(4:6, 2L)))
test(996.06, CJ(4:6, list(1:2, 3), sorted = FALSE),
data.table(V1 = rep(4:6, each = 2L), V2 = rep(list(1:2, 3), 3L)))
test(996.07, CJ(1:2, list(1:2, 3), 4:5, sorted = FALSE),
data.table(V1 = rep(1:2, each = 4L), V2 = rep(rep(list(1:2, 3), each = 2L), 2L), V3 = rep(4:5, 4L)))
test(996.08, CJ(expression(1)), error = "element 1 is non-atomic")
test(996.09, CJ(expression(2), 3, sorted = FALSE), error = "Type 'expression' not supported")
## complex input support (can't handle sorted yet)
test(996.10, CJ(z = 0:1 + (0:1)*1i, b = 1:3, sorted = FALSE),
data.table(z = rep(0:1, each=3L) + rep(0:1, each=3L)*1i, b = rep(1:3, 2)))
test(996.11, CJ(b = 1:3, z = 0:1 + (0:1)*1i, sorted = FALSE),
data.table(b = rep(1:3, each = 2L), z = rep(0:1, 3) + rep(0:1, 3)*1i))
# That CJ orders NA consistently with setkey and with historical behaviour, now that it doesn't use setkey.
# NA must always come first in data.table throughout, since binary search relies on that internally.
test(997, DT <- CJ(c(1,3,NA,2), 5:6), setkey(setkey(copy(DT),NULL))) # double setkey to really rebuild key
test(998, DT <- CJ(as.integer(c(1,3,NA,2)), 5:6), setkey(setkey(copy(DT),NULL)))
test(999, DT <- CJ(c("A","B",NA,"C"), 5:6), setkey(setkey(copy(DT),NULL)))
test(1000, DT <- CJ(c(1,NA,3), c("B",NA,"A"), c(5L,NA_integer_)), setkey(setkey(copy(DT),NULL)))
test(1001, DT <- CJ(c(1,NA,3)), setkey(setkey(copy(DT),NULL))) # The 1 column case is switched inside CJ() so test that too.
# merge all=TRUE when y is empty, #2633
a = data.table(P=1:2,Q=3:4,key='P')
b = data.table(P=2:3,R=5:6,key='P')
test(1002, merge(a,b[0],all=TRUE), data.table(merge.data.frame(a,b[0],all=TRUE),key='P'))
a = data.table(c=c(1,2),key='c')
b = data.table(c=3,key='c')
test(1003, merge(a,b[0],all=TRUE), data.table(merge.data.frame(a,b[0],all=TRUE),key='c'))
# setkey with backticks, #2452
DT = data.table("Date and Time"=1:3,x=4:6)
test(1004, setkey(copy(DT),`Date and Time`), setkey(DT,"Date and Time"))
# rbinding with duplicate names, NA or "", #2384 and #2726
DT = data.table(a=1:3,b=4:6,b=7:9,c=10:12)
test(1005, rbind(DT,DT), data.table(a=rep(1:3,2),b=rep(4:6,2),b=rep(7:9,2),c=rep(10:12,2)))
M <- mtcars
colnames(M)[11] <- NA
test(1006, print(as.data.table(M), nrows=10), output="gear NA.*1: 21.0")
# rbinding factor with non-factor/character
DT1 <- data.table(x=1:5, y=factor("a"))
DT2 <- data.table(x=1:5, y=2)
test(1007, rbindlist(list(DT1, DT2)), data.table(x = c(1:5, 1:5), y = factor(c(rep('a', 5), rep('2', 5)), levels = c('a', '2'))))
test(1008, rbindlist(list(DT2, DT1)), data.table(x = c(1:5, 1:5), y = factor(c(rep('2', 5), rep('a', 5)))))
# rbindlist different types
DT1 <- data.table(a = 1L, b = 2L)
DT2 <- data.table(a = 2L, b = 'a')
DT3 <- data.table(a = 2L, b = 2.5)
test(1008.1, rbindlist(list(DT1, DT2)), data.table(a = c(1L,2L), b = c('2', 'a')))
test(1008.2, rbindlist(list(DT1, DT3)), data.table(a = c(1L,2L), b = c(2, 2.5)))
# optimized mean() respects the na.rm=FALSE default, as intended
DT = data.table(a=c(NA,NA,FALSE,FALSE), b=c(1,1,2,2))
test(1009, DT[,list(mean(a), sum(a)),by=b], data.table(b=c(1,2),V1=c(NA,0),V2=c(NA_integer_,0L))) # sum(logical()) should be integer, not real
# an fread error shouldn't hold a lock on the file on Windows
cat('A,B\n1,2\n3\n5,6\n', file=(f<-tempfile()))
test(1010.1, fread(f,logical01=TRUE), ans<-data.table(A=TRUE, B=2L), warning=(txt<-"Stopped early on line 3.*Expected 2 fields but found 1.*fill.*TRUE.*<<3>>"))
test(1010.2, fread(f,logical01=TRUE), ans, warning=txt)
cat('7\n8,9',file=f,append=TRUE) # that append works after error
test(1010.3, fread(f,fill=TRUE), data.table(A=INT(1,3,5,7,8), B=INT(2,NA,6,NA,9)))
test(1010.4, fread(f,logical01=TRUE), ans, warning=txt)
cat('A,B\n1,2\n3\n5,6\n', file=f) # that overwrite works after error
test(1010.5, fread(f,fill=TRUE), data.table(A=INT(1,3,5), B=INT(2,NA,6)))
test(1010.6, fread(f,logical01=TRUE), ans, warning=txt)
unlink(f) # that file can be removed after error
test(1010.7, !file.exists(f))
# detection of unescaped quotes, quote rule 3
test(1011, fread('A,B\n"aa",1\n"bb,2\n"cc",3\n'), data.table(A=c('aa', '"bb', 'cc'), B=1:3), warning=w<-"resolved improper quoting")
test(1012, fread('A,B\n"aa",1\n"bb"",2\n"cc",3\n'), data.table(A=c("aa", "bb\"", "cc"), B=1:3), warning=w)
# integer64 control to fread
test(1013, fread("A,B\n123,123\n", integer64="integer"), error="integer64='%s' which isn't 'integer64'|'double'|'numeric'|'character'")
test(1014, fread("A,B\n123456789123456,21\n", integer64="character"), data.table(A="123456789123456",B=21L))
test(1015, fread("A,B\n123456789123456,21\n", integer64="double"), data.table(A=as.double("123456789123456"),B=21L))
# and that mid-read type bumps respect the integer64 control too
x = sample(1:1000,2100,replace=TRUE) # 2100 > 100 JUMPLINES * 10 NJUMP * 2 spacing
DT = data.table( A=as.character(x), B=1:100)
DT[115, A:="123456789123456"] # row 115 is outside the 100 rows at 10 points.
fwrite(DT,f<-tempfile())
test(1016.1, sapply(suppressWarnings(fread(f,verbose=TRUE)),"class"), c(A="integer64", B="integer"),
output="Rereading 1 columns.*Column 1.*A.*bumped.*int32.*int64.*<<123456789123456>>")
# suppressWarnings for 'bit64 is not installed' warning on AppVeyor where we (correctly) don't install Suggests
test(1016.2, fread(f, colClasses = c(A="numeric"), verbose=TRUE), copy(DT)[,A:=as.numeric(A)], output="Rereading 0 columns")
DT[90, A:="321456789123456"] # inside the sample
write.table(DT,f,sep=",",row.names=FALSE,quote=FALSE)
if (test_bit64) test(1017.1, fread(f), copy(DT)[,A:=as.integer64(A)])
test(1017.2, fread(f, integer64="character"), DT)
unlink(f)
# ERANGE errno handled, #106 #4165
test(1018.1, identical(fread("1.46761e-313\n"), data.table(V1=1.46761e-313)))
test(1018.2, identical(fread("1.46761e+313\n"), data.table(V1=1.46761e+313)))
test(1019, fread("A\n1.23456789123456789123456999\n"), data.table(A=1.234567891234568))
# crash assigning to row 0, #2754
DT = data.table(A=1:5,B=6:10)
test(1020, DT[0,A:=6L], DT)
test(1021, DT[NA,A:="foo"], DT)
test(1022, DT[5:0,A:=21L], data.table(A=21L, B=6:10))
test(1023, DT[c(1,2,NA,3), B:=42L], data.table(A=21L, B=c(42L,42L,42L,9:10)))
test(1024, DT[6,A:=0L], error="i[[]1[]] is 6 which is out of range [[]1,nrow=5[]]")
# crash assigning to duplicated column names/numbers, #2751
test(1024.1, DT[,c("B","B"):=NULL], error="Can't assign to the same column twice in the same query (duplicates detected).")
test(1024.2, DT[,c(1,2,1):=NULL], error="Can't assign to the same column twice in the same query (duplicates detected).")
# as.data.table.table, #361
DF <- data.frame(x = c(1,1,2,NA,1,2), y = c("b", "b", "b", "a", "c", "a"), z = c(1,1,1,1,1,2), stringsAsFactors=FALSE )
tab1 <- as.data.table(as.data.frame(table(DF$x), stringsAsFactors=FALSE)); setattr(tab1, 'names', c("V1", "N"))
tab2 <- as.data.table(as.data.frame(table(DF$x, DF$y), stringsAsFactors=FALSE)); setattr(tab2, 'names', c("V1", "V2", "N"))
tab3 <- as.data.table(as.data.frame(table(DF$x, DF$y, DF$z), stringsAsFactors=FALSE)); setattr(tab3, 'names', c("V1", "V2", "V3", "N"))
test(1025, as.data.table(table(DF$x)), tab1)
test(1026, as.data.table(table(DF$x, DF$y)), tab2)
test(1027.1, as.data.table(table(DF$x, DF$y, DF$z)), tab3)
# catch printing of data.table(table()), #109 (as.data.table should be used instead)
# new, updated 14th Feb, 2015. data.table(table) now redirects to as.data.table
test(1027.2, data.table(table(1:99)), as.data.table(table(1:99)))
# data.table() and rbindlist() in v1.8.11 caught and removed the dim attribute.
# from 1.12.4 the dim attribute is preserved and the print method outputs "multi-column" for such columns (better than error)
test(1027.3, {DT<-data.table(table(1:99));setattr(DT[[1]],"dim",99L);print(DT,nrows=100)},
output="<multi-column>")
# as.data.table.x where x is integer, numeric, etc...
set.seed(45)
test(1028, as.data.table(x<-sample(5)), data.table(V1=x))
test(1029, as.data.table(x<-as.numeric(x)), data.table(V1=x))
test(1030, as.data.table(x<-as.Date(x, origin="2013-01-01")), data.table(V1=x))
test(1031, as.data.table(x<-factor(sample(5))), data.table(V1=x))
test(1032, as.data.table(x<-factor(x, ordered=TRUE)), data.table(V1=x))
test(1033, as.data.table(x<-as.logical(sample(0:1, 5, TRUE))), data.table(V1=x))
test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x))
#########################################
# All melt.data.table tests go in here #
#########################################
{
# We run these routinely, in dev by cc(), on Travis (coverage) and on CRAN
set.seed(45)
N=18L # increased in v1.12.2 from 6 to 18 to get NA in f_1 for coverage
DT <- data.table(
i_1 = c(1:(N-1L), NA),
i_2 = c(NA,(N:(2L*N-2L))),
f_1 = factor(sample(c(letters[1:3], NA), N, TRUE)),
c_1 = sample(c(letters[1:3], NA), N, TRUE),
d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"),
d_2 = as.Date(6:1, origin="2012-01-01"))
DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5,1)))), by = i_1]$c] # generate list cols
DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5,1)))), by = i_1]$c]
test(1035.010, melt(DT, id.vars=1:2, measure.vars=3:4), melt(DT, id.vars=c("i_1", "i_2"), measure.vars=c("f_1", "c_1")))
test(1035.011, melt(DT, id.vars=as.raw(0), measure.vars=3:4),
error="Unknown 'id.vars' type raw")
test(1035.012, melt(DT, id.vars=1:3, measure.vars=as.raw(0)),
error="Unknown 'measure.vars' type raw")
ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1"))
ans1[, value := DT$l_1]
test(1035.02, melt(DT, id.vars=c("i_1", "i_2", "l_2"), measure.vars=c("l_1")), ans1)
# melt retains attributes if all measure columns are of the same type (new)
ans2 = data.table(c_1=DT$c_1, variable=rep(c("d_1", "d_2"), each=N), value=as.Date(c(DT$d_1, DT$d_2)))[!is.na(value)]
test(1035.03, melt(DT, id.vars=4, measure.vars=5:6, na.rm=TRUE, variable.factor=FALSE), ans2)
DT2 <- data.table(x=1:5, y=1+5i) # unimplemented class
test(1035.040, melt(DT2, id.vars=1), error="Unknown column type 'complex'")
test(1035.041, melt(DT2, id.vars=as.raw(0)), error="Unknown 'id.vars' type raw")
# more tests
DT[, f_2 := factor(sample(letters, N), ordered=TRUE)]
DT[, id := 1:N]
ans1 = cbind(melt(DT, id.vars="id", measure.vars=5:6, value.name="value1"), melt(DT, id.vars=integer(0), measure.vars=7:8, value.name="value2")[, variable:=NULL])
levels(ans1$variable) = as.character(1:2)
test(1035.050, ans1, melt(DT, id.vars="id", measure.vars=list(5:6, 7:8)))
## use numeric index
test(1035.051, ans1, melt(DT, id.vars="id", measure.vars=list(c(5, 6), c(7, 8))))
test(1035.052, melt(DT, id.vars="id", measure.vars=list(as.raw(0))),
error="Unknown 'measure.vars' type raw")
test(1035.06, ans1, melt(DT, id.vars="id", measure.vars=list(5:6, 7:8), na.rm=TRUE)) # should've no effect
test(1035.07, ans1, melt(DT, id.vars="id", measure.vars=patterns("d_", "l_")))
# melt retains ordered factors!
test(1035.08, melt(DT, id.vars="id", measure.vars=c("f_1", "f_2"), value.factor=TRUE)$value, factor(c(as.character(DT$f_1), as.character(DT$f_2)), ordered=TRUE))
# if measure.vars is integer(0), melt just returns a shallow copy of all the id columns
test(1035.09, melt(DT, id.vars=1:6, measure.vars=integer(0)), shallow(DT, 1:6))
# measure.vars list with a single entry recycles to the maximum length
ans = cbind(melt(DT, id.vars="id", measure.vars=c("c_1", "c_1"))[, variable := NULL], melt(DT, id.vars=integer(0), measure.vars=c("f_1", "f_2")))
setnames(ans, c("id", "value1", "variable", "value2"))
setcolorder(ans, c("id", "variable", "value1", "value2"))
levels(ans$variable) = as.character(1:2)
test(1035.10, melt(DT, id.vars="id", measure.vars=list(c("c_1", "c_1"), c("f_1", "f_2"))), ans)
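# Not a test: a compact sketch (throwaway table, illustrative names) of melting into multiple value columns
# by passing measure.vars as a list, as exercised above; each list element becomes one value column.
invisible(local({
  d = data.table(id = 1:2, a1 = 1:2, a2 = 3:4, b1 = 5:6, b2 = 7:8)
  melt(d, id.vars="id", measure.vars=list(c("a1","a2"), c("b1","b2")), value.name=c("a","b"))
}))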
# non-ordered factors
DT[, f_2 := factor(sample(letters, N), ordered=FALSE)]
test(1035.11, melt(DT, id.vars="id", measure.vars=c("f_1", "f_2"), value.factor=TRUE)$value, factor(c(as.character(DT$f_1), as.character(DT$f_2)), ordered=FALSE))
# test to ensure attributes on non-factor id-columns are preserved after melt; was test 1222
DT <- data.table(x=1:3, y=letters[1:3], z1=8:10, z2=11:13)
setattr(DT$x, 'foo', 'bla1')
setattr(DT$y, 'bar', 1:4)
test(1035.12, attr(melt(DT, id.vars=1:2)$x, "foo"), "bla1")
test(1035.13, attr(melt(DT, id.vars=1:2)$y, "bar"), 1:4)
# bug #699 - melt segfaults when vars are not in dt; was test 1316
x = data.table(a=c(1,2),b=c(2,3),c=c(3,4))
test(1035.14, melt(x, id.vars="d"), error="One or more values")
test(1035.150, melt(x, measure.vars="d"), error="One or more values")
test(1035.151, melt(x, measure.vars=3L),
ans <- data.table(a=c(1, 2), b=c(2, 3), variable=factor('c'), value=c(3, 4)))
test(1035.152, melt(x, measure.vars=as.raw(0)), error="Unknown 'measure.vars' type raw")
test(1035.153, melt(x, measure.vars=3L, verbose=TRUE), ans,
output="'id.vars' is missing. Assigning all.*Assigned 'id.vars' are [[]a, b[]]")
test(1035.16, melt(x, id.vars="a", measure.vars="d"), error="One or more values")
test(1035.17, melt(x, id.vars="d", measure.vars="a"), error="One or more values")
# fix for #780; was test 1371
DT = data.table(x=rep(c("a","b","c"),each=3), y=c(1,3,6), v=1:9)
foo = function(input, by, var) {
melt(input, id.vars = by, measure.vars=var)
}
test(1035.18, foo(DT, by="x"), data.table(x=rep(DT$x, 2L), variable=factor(rep(c("y", "v"), each=9L), levels=c("y", "v")), value=c(DT$y, DT$v)),
warning="'measure.vars' [[]y, v[]] are not all of the same type.*molten data value column will be of type 'double'.*'double'")
test(1035.19, foo(DT), data.table(x=rep(DT$x, 2L), variable=factor(rep(c("y", "v"), each=9L), levels=c("y", "v")), value=c(DT$y, DT$v)),
warning=c("id.vars and measure.vars are internally guessed.*this case are columns [[]x[]]",
"'measure.vars' [[]y, v[]] are not all of the same type.*'double'.*'double'"))
# Fix for #1055; was test 1495
DT <- data.table(A = 1:2, B = 3:4, D = 5:6, D = 7:8)
test(1035.20, melt(DT, id.vars=1:2), data.table(A=1:2, B=3:4,
variable=factor(rep(1L, 4L), labels="D"), value=5:8))
# segfault of unprotected var caught with the help of address sanitizer; was test 1509
set.seed(1)
val = sample(c(1:5, NA), 1e4L, TRUE)
dt <- setDT(replicate(100L, val, simplify=FALSE))
## to ensure there's no segfault...
ans <- melt(dt, measure.vars=names(dt), na.rm=TRUE)
test(1035.21, ans, ans)
# improper levels fix, #1359; was test 1563
dt = data.table(id=1:3, x=NA_character_, y=c('a', NA_character_, 'c'))
test(1035.220, melt(dt, id.vars="id", na.rm=TRUE), data.table(id=c(1L,3L), variable=factor(c("y", "y")), value=c("a", "c")))
test(1035.221, melt(dt, id.vars=c("id", "id"), na.rm=TRUE),
output = 'Duplicate column names found',
data.table(id=c(1L,3L), id.1=c(1L, 3L), variable=factor(c("y", "y")), value=c("a", "c")))
# fixing segfault due to negative id and measure vars that I detected by accident; was test 1569
dt = data.table(x=1:5, y=6:10, z=11:15)
test(1035.23, melt(dt, id.vars=-1, measure.vars=NULL), error="One or more values in 'id.vars'")
test(1035.24, melt(dt, id.vars=-1, measure.vars=-1), error="One or more values in 'id.vars'")
test(1035.25, melt(dt, id.vars=NULL, measure.vars=-1), error="One or more values in 'measure.vars'")
test(1035.26, melt(dt, id.vars=5, measure.vars=-1), error="One or more values in 'id.vars'")
test(1035.27, melt(dt, id.vars=1, measure.vars=-1), error="One or more values in 'measure.vars'")
if (test_R.utils) {
# dup names in variable used to generate malformed factor error and/or segfault, #1754; was test 1570
R.utils::decompressFile(testDir("melt_1754.R.gz"), tt<-tempfile(), remove=FALSE, FUN=gzfile, ext=NULL)
source(tt, local=TRUE) # creates DT
test(1036.01, dim(DT), INT(1,327))
test(1036.02, dim(ans<-melt(DT, 1:2)), INT(325,4),
warning="'measure.vars' [[]Geography, Estimate; SEX AND AGE - Total population, Margin of Error; SEX AND AGE - Total population, Percent; SEX AND AGE - Total population, [.][.][.][]] are not all of the same type.*the molten data value column will be of type 'character'.*not of type 'character' will be coerced too")
test(1036.03, length(levels(ans$variable)), 317L)
test(1036.04, levels(ans$variable)[c(1,2,316,317)],
tt <- c("Geography",
"Estimate; SEX AND AGE - Total population",
"Percent; HISPANIC OR LATINO AND RACE - Total housing units",
"Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units"))
test(1036.05, range(as.integer(ans$variable)), INT(1,317))
test(1036.06, as.vector(table(table(as.integer(ans$variable)))), INT(309,8))
test(1036.07, sapply(ans, class), c(Id="character",Id2="integer",variable="factor",value="character"))
test(1036.08, dim(ans<-melt(DT, 1:2, variable.factor=FALSE)), INT(325,4),
warning="'measure.vars' [[]Geography, Estimate;.*[.][.][.][]].*'character'.*'character'")
test(1036.09, sapply(ans, class), c(Id="character",Id2="integer",variable="character",value="character"))
test(1036.10, ans$variable[c(1,2,324,325)], tt)
# more from #1754; was test 1571
DT = fread(testDir("melt_1754_synth.csv.bz2"))
test(1037.101, names(DT)[duplicated(names(DT))], c("smoking75","smoking80","smoking88"))
test(1037.102, dim(ans<-melt(DT, id.vars=c("state","income","retailprice","percent_15_19","beercons"), measure.vars=patterns("^smoking"))), INT(1326,7))
test(1037.103, print(ans[c(1,1326)]), output="state.*income.*retailprice.*percent_15_19.*beercons.*variable.*value.*1.*9.6.*89.34.*smoking88.*smoking00.*41.6")
}
# more from #1754; was test 1572
DT = setDT(data.frame("Time.point" = seq(0, 6), "Time.(h)" = c(0.0, 0.5, 1.0, 3.0, 5.0, 7.0, 24.0),
"NEW.ME" = runif(7), "NEW.ME" = runif(7), check.names = FALSE))
test(1037.201, dim(melt(DT, c("Time.point", "Time.(h)"), na.rm=TRUE)), INT(14, 4))
DT = setDT(data.frame("Time.point" = seq(0, 6), "Time.(h)" = c(0.0, 0.5, 1.0, 3.0, 5.0, 7.0, 24.0),
"NEW.ME" = runif(7), "NEW.ME" = runif(7), "NEW.ME" = runif(7), "NEW.ME" = runif(7), "NEW.ME" = runif(7),
"NEW.ME" = runif(7), "NEW.ME" = runif(7), "NEW.ME" = runif(7), "NEW.MER" = runif(7), "F050" = runif(7),
"NEW.MER" = runif(7), "F16-42-123p123C" = runif(7), "F16-42-123p123C" = runif(7), "NEW.MER" = runif(7),
"F16-42-123p123C" = runif(7), check.names = FALSE))
test(1037.202, unique(names(DT)[duplicated(names(DT))]), c("NEW.ME","NEW.MER","F16-42-123p123C"))
test(1037.203, dim(melt(DT, c("Time.point", "Time.(h)"), na.rm = TRUE)), INT(105,4))
# more from #1754; was test 1573
DT = fread(
"month,Record high,Average high,Daily mean,Average low,Record low,Average precipitation,Average rainfall,Average snowfall,Average precipitation,Average rainy,Average snowy,Mean monthly sunshine hours
Jan,12.8,-5.4,-8.9,-12.4,-33.5,73.6,28.4,45.9,15.8,4.3,13.6,99.2
Feb,15,-3.7,-7.2,-10.6,-33.3,70.9,22.7,46.6,12.8,4,11.1,119.5
Mar,25.9,2.4,-1.2,-4.8,-28.9,80.2,42.2,36.8,13.6,7.4,8.3,158.8
Apr,30.1,11,7,2.9,-17.8,76.9,65.2,11.8,12.5,10.9,3,181.7
May,34.2,19,14.5,10,-5,86.5,86.5,0.4,12.9,12.8,0.14,229.8
Jun,34.5,23.7,19.3,14.9,1.1,87.5,87.5,0,13.8,13.8,0,250.1
Jul,36.1,26.6,22.3,17.9,7.8,106.2,106.2,0,12.3,12.3,0,271.6
Aug,35.6,24.8,20.8,16.7,6.1,100.6,100.6,0,13.4,13.4,0,230.7
Sep,33.5,19.4,15.7,11.9,0,100.8,100.8,0,12.7,12.7,0,174.1")
test(1037.301, print(melt(DT, id.vars="month", verbose=TRUE)), output="'measure.vars' is missing.*Assigned 'measure.vars' are [[]Record high, Average high, Daily mean, Average low, ...[]].*1:.*Jan.*Record high.*12.8.*108:.*Sep.*sunshine hours.*174.1")
# coverage of reworked fmelt.c:getvarcols, #1754; was test 1574
# missing id satisfies data->lvalues!=1 at C level to test those branches
x = data.table(x1=1:2, x2=3:4, y1=5:6, y2=7:8, z1=9:10, z2=11:12)
test(1037.401, dim(ans<-melt(x, measure.vars=patterns("^y", "^z"))), INT(4,5))
test(1037.402, ans$variable, factor(c("1","1","2","2")))
test(1037.403, dim(ans<-melt(x, measure.vars=patterns("^y", "^z"), variable.factor=FALSE)), INT(4,5))
test(1037.404, ans$variable, c("1","1","2","2"))
x[, c("y1","z1"):=NA]
test(1037.405, dim(melt(x, measure.vars=patterns("^y", "^z"))), INT(4,5))
test(1037.406, dim(ans<-melt(x, measure.vars=patterns("^y", "^z"), na.rm=TRUE)), INT(2,5))
test(1037.407, ans$variable, factor(c("1","1")))
test(1037.408, dim(ans<-melt(x, measure.vars=patterns("^y", "^z"), na.rm=TRUE, variable.factor=FALSE)), INT(2,5))
test(1037.409, ans$variable, c("1","1"))
test(1037.410, melt(data.table(NULL), verbose=TRUE), data.table(NULL),
output="ncol(data) is 0. Nothing to melt")
test(1037.411, melt(x, id.vars=1L, measure.vars=2L, variable.name=c('a', 'b')),
error="'variable.name' must be a character/integer")
test(1037.412, melt(x, id.vars=1L, measure.vars=list(2L), value.name=c('a', 'b')),
error="When 'measure.vars' is a list, 'value.name'")
test(1037.413, melt(x, id.vars=1L, measure.vars=2L, value.name=c('a', 'b')),
error="When 'measure.vars' is either not specified or a character/integer")
x[ , r := as.raw(c(0, 1))]
test(1037.414, melt(x, id.vars='x1', measure.vars='r'),
error="Unknown column type 'raw' for column 'r'")
}
# sorting and grouping of Inf, -Inf, NA and NaN, #117, #112 & #105
DT <- data.table(x = rep(c(1, NA, NaN, Inf, -Inf), each=2))
OUT <- data.table(x=c(1, NA, NaN, Inf, -Inf), N=2L)
test(1040.1, DT[, .N, by=x], OUT)
DT <- data.table(y =c(NA, Inf, NA, -Inf, -Inf, NaN, Inf, 1, NaN, 1))
OUT <- data.table(y = c(NA, Inf, -Inf, NaN, 1), N=2L)
test(1040.2, DT[, .N, by=y], OUT)
# rbindlist on *data.frame* input, #119. Somehow there was no test for this. (Although, #119 was the same as #2650 fixed in v1.8.9).
l <- list(u1=data.frame(i1=c('a', 'b', 'c'), val=1:3, stringsAsFactors=TRUE),
u2=data.frame(i1=c('d', 'e'), val=4:5, stringsAsFactors=TRUE))
test(1041, rbindlist(l), data.table(i1=factor(letters[1:5]),val=1:5))
# negative indexing in *i* leads to crash/wrong aggregates when dogroups is called. bug #2697
DT = data.table(x = c(1,2,3,4,5), group = c(1,1,2,2,3))
test(1042.1, DT[-5, mean(x), by = group], data.table(group=c(1,2), V1=c(1.5, 3.5)))
# Test when abs(negative index) > nrow(dt) - should warn
test(1042.2, DT[-10], DT, warning="Item 1 of i is -10 but there are only 5 rows. Ignoring this and 0 more like it out of 1.")
test(1042.3, DT[c(-5, -10), mean(x), by = group], data.table(group=c(1,2),V1=c(1.5,3.5)), warning="Item 2 of i is -10 but there are only 5 rows. Ignoring this and 0 more like it out of 2.")
test(1042.4, DT[c(-5, -4, -5)], DT[1:3], warning="Item 3 of i is -5 which removes that item but that has occurred before. Ignoring this dup and 0 other dup")
test(1042.5, DT[c(-5, -4, -5, -5, -4)], DT[1:3], warning="Item 3 of i is -5 which removes that item but that has occurred before. Ignoring this dup and 2 other dup")
test(1043.1, DT[c(1, -5)], error="Item 2 of i is -5 and item 1 is 1. Cannot mix positives and negatives.")
test(1043.2, DT[c(-1, NA)], error="Item 1 of i is -1 and item 2 is NA. Cannot mix negatives and NA.")
# crash (floating point exception), when assigning null data.table() to multiple cols, #116
DT = data.table(x=1:5,y=6:10)
test(1044, DT[3,c("x","y"):=data.table()],error="Supplied 2 columns to be assigned an empty list.*use NULL instead.*list\\(list")
test(1045, DT[3,c("x","y"):=list()],error="Supplied 2 columns to be assigned an empty list.*use NULL instead.*list\\(list")
# negative indexing with head() and tail(). bug #2375
d1 = data.table(date = c(1,2,3,4,5), value = c(1,2,3,4,5))
d2 = data.frame(d1)
test(1046, head(d1, -2), as.data.table(head(d2, -2)))
test(1047, head(d1, 2), as.data.table(head(d2, 2)))
test(1048, head(d1, -10), as.data.table(head(d2, -10)))
test(1049, head(d1, 10), as.data.table(head(d2, 10)))
test(1050, tail(d1, -2), as.data.table(tail(d2, -2)))
test(1051, tail(d1, 2), as.data.table(tail(d2, 2)))
test(1052, tail(d1, -10), as.data.table(tail(d2, -10)))
test(1053, tail(d1, 10), as.data.table(tail(d2, 10)))
# negative indexing with `:=` - new feature through fixing of #2697, performs as intended for negative subscripts.
x <- data.table(letters=letters[1:5], number=1:5)
test(1054, x[-(1:3), number := 1L], x[4:5, number := 1L])
test(1055, x[0, number := 1L], x)
# print.data.table heeds digits=2 etc, #2535
DT = data.table(x=rep(c("a","b","c"),each=3), y=(30/7)^(2:10))[, logy := log(y)]
test(1056, print(DT, digits=2), output=" x y logy\n1: a 18 2.9\n2: a 79 4.4\n3: a 337 5.8")
test(1057, print(DT, digits=2, big.mark=","), output=" x y logy\n1: a 18 2.9.*6: b 26,556 10.2\n7: c 113,811 11.6")
# bug #2758 fix - segfault with zeros in i and factors in by
x <- data.table(letters=letters[1:5], factor=factor(letters[1:5]), number=1:5)
test(1058, x[c(0, 3), list(letters, number), by=factor], ans<-x[3,c(2,1,3)])
test(1059, x[c(3, 0), list(letters, number), by=factor], ans)
test(1060, x[c(0, 3), number:=5L, by=factor], ans<-data.table(letters=letters[1:5], factor=factor(letters[1:5]), number=c(1:2,5L,4:5)))
test(1061, x[c(0, 3), number:=5L], ans)
# bug #2440 fix - segfault when j refers to grouping variable when results are empty
DT = data.table(x=rep(c("a","b"),each=3),v=c(42,42,42,4,5,6))
test(1062, DT[x %in% c('z'),list(x2=x),by=x], output="Empty data.table (0 rows and 2 cols): x,x2")
test(1063, DT[x %in% c('z'),list(vpaste=paste(v,collapse=','),x2=paste(x,x)),by=x], output="Empty data.table (0 rows and 3 cols): x,vpaste,x2")
test(1064, DT[integer(0), list(x2=x), by=x], output="Empty data.table (0 rows and 2 cols): x,x2")
# bug #2445 fix - := fails when subsetting yields NAs and with=FALSE
X = data.table(A=1:3, B=1:6, key="A")
var <- "B"
test(1065, X[J(2:5), (var):=22L], data.table(A=rep(1:3, each=2), B=c(1L,4L,rep(22L,4)), key="A"))
# fread single unnamed colClasses
f = "A,B,C,D\n1,3,5,7\n2,4,6,8\n"
test(1066, fread(f,colClasses=c("integer","integer","character")),
error="colClasses= is an unnamed vector of types, length 3, but there are 4 columns.*you can")
test(1067, fread(f,colClasses=c("integer","numeric","character","character")), data.table(A=1:2,B=c(3,4),C=c("5","6"),D=c("7","8")))
test(1068, fread(f,colClasses="character"), data.table(A=c("1","2"),B=c("3","4"),C=c("5","6"),D=c("7","8")))
# fread select and drop
test(1069, fread(f,drop=c("D","B")), data.table(A=1:2,C=5:6))
test(1070, fread(f,drop="E"), fread(f), warning="Column name 'E' (drop[1]) not found")
test(1071, fread(f,select="B",colClasses=list(numeric="C")), data.table(B=3:4))
test(1072, fread(f,select="B",drop="C"), error="not both")
test(1073, fread(f,drop=2:3), fread(f,select=c(1,4))) # tests coercing numeric select as well
# check that the problem printing duplicate columns doesn't return, #115
DT = data.table(V1 = c(1:1000), V2 = c(10001:11000))
test(1074, DT[, sum(V2), by = V1], output="1000: 1000 11000") # the result has two columns both called V1 here
# add test from #2446. Already fixed but add anyway. "names in neworder not found in x: 'colnames with spaces' from merge() when all.y=TRUE"
X = data.table(a=1:3,b=4:6,"c d"=7:9)
Y = data.table(e=10:12,a=2:4)
test(1075, merge(X,Y,by="a",all=TRUE), data.table(a=c(1:4),b=c(4:6,NA),"c d"=c(7:9,NA),e=c(NA,10:12),key="a"))
# Fixes #2670. `by` sometimes incorrect for expressions of keyed columns. When by is used like `by=month(date)`, with key column set to "date", grouping+aggregation would be wrong.
DT = data.table(date=as.Date("2013-01-01")+seq(1,1000,by=10),1:100)
setkey(DT,date)
test(1076, DT[,sum(V2),by=month(date)], DT[, sum(V2), by=list(month(date))])
# just to be sure, second test with another function using sample.
setkey(DT, V2)
ff <- function(x) { set.seed(45); (sample(x)-1) %/% 10}
test(1077, DT[, sum(V2),by=ff(V2)], DT[, sum(V2),by=list(ff(V2))])
# rbindlist should discard names on columns, #103
d = data.frame(x=1:5)
f = function(x) {suppressWarnings(DF<-data.frame(x=x, y=1:10)); setattr(DF$x,"names","a");DF}
l = apply(d, 1, f)
test(1078.1, length(names(l[[1]]$x)), 10L) # test this test is creating names on the column
test(1078.2, length(names(l[[2]]$x)), 10L)
a = rbindlist(l)
test(1078.3, a$x, rep(1:5,each=10)) # a$x would segfault before the fix to rbindlist
# data.table() shouldn't retain column names, root cause of #103
x = 1:5
names(x) = letters[1:5]
test(1079.1, DF<-data.frame(x=x, y=1:10), data.frame(x=rep(1:5,2),y=1:10), warning="row names.*discarded")
test(1079.2, lapply(DF, names), list(x=NULL, y=NULL))
test(1079.3, DT<-data.table(x=x, y=1:10), data.table(x=rep(1:5,2),y=1:10))
test(1079.4, lapply(DT, names), list(x=NULL, y=NULL))
# test from similar #102 for completeness
z = c(a=1,b=2,c=3)
a = data.table(z,x=1:3)
b = rbind(a, data.table(z=2,x=1))
test(1080, b$z, c(1,2,3,2))
# mid row logical detection
test(1081.1, fread("A,B,C\n1,T,2\n",logical01=TRUE), data.table(A=TRUE,B="T",C=2L))
test(1081.2, fread("A,B,C\n1,T,2\n",logical01=FALSE), data.table(A=1L,B="T",C=2L))
# cartesian join answer's key should contain only the columns considered in binary search. Fixes #2677
set.seed(45)
n <- 10
DT1 <- data.table(a=sample(1:3, n, replace=TRUE), b=sample(1:3, n, replace=TRUE), c=sample(1:10, n,replace=TRUE), key=c("a", "b", "c"))
DT2 <- data.table(p=sample(1:3, n, replace=TRUE), q=sample(1:3, n, replace=TRUE), r=sample(1:n), w=sample(1:n))
setkey(DT2, p,q)
ans <- DT1[DT2, nomatch=0, allow.cartesian=TRUE] # NB: DT2 contains duplicate key values so column c ends up not being sorted
test(1082.1, key(ans), c("a","b"))
test(1082.2, setkeyv(ans, key(ans)), ans) # i.e. key is valid, otherwise re-built warning will be caught
check <- setkey(as.data.table(aggregate(r ~a+b+c, ans, length)), a, b)
test(1083, setkeyv(ans[, list(r = .N), by=key(DT1)], key(ans)), check) # if the key is set properly, then and only then will the aggregation results match with "check"
# Tests for #2531. `:=` loses POSIXct or ITime attribute:
# first test from this SO post: http://stackoverflow.com/questions/15996692/cannot-assign-columns-as-date-by-reference-in-data-table
dt <- data.table(date = as.IDate(sample(10000:11000, 10), origin = "1970-01-01"))
dt[, group := rep(1:2, 5)]
dt[, min.group.date := as.IDate(min(date)), by = group]
test(1084, class(dt$min.group.date), c("IDate", "Date"))
dt <- data.table(date = as.IDate(sample(10000:11000, 10), origin = "1970-01-01"))
dt[, group := rep(1:2, 5)]
dt[, min.group.date := min(date), by = group] # don't need to wrap it with as.IDate(.)
test(1085, class(dt$min.group.date), c("IDate", "Date"))
# second test from this SO post: http://stackoverflow.com/questions/14604820/why-does-this-posixct-or-itime-loses-its-format-attribute
DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L))
DT[,x1:=as.ITime(x)]
DT[,`:=`(last.x=tail(x,1L),last.x1=tail(x1,1L)),by=y]
test(1086, class(DT$last.x), c("POSIXct", "POSIXt"))
test(1087, class(DT$last.x1), "ITime")
# Tests 1088-1093 were non-ASCII. Now in DtNonAsciiTests
# print of unnamed DT with >20 <= 100 rows, #97 (RF#4934)
DT <- data.table(x=1:25, y=letters[1:25])
DT.unnamed <- unname(copy(DT))
test(1094.1, capture.output(print(DT.unnamed,nrows=100,class=FALSE)),
c(" ", " 1: 1 a", " 2: 2 b", " 3: 3 c", " 4: 4 d",
" 5: 5 e", " 6: 6 f", " 7: 7 g", " 8: 8 h", " 9: 9 i",
"10: 10 j", "11: 11 k", "12: 12 l", "13: 13 m", "14: 14 n",
"15: 15 o", "16: 16 p", "17: 17 q", "18: 18 r", "19: 19 s",
"20: 20 t", "21: 21 u", "22: 22 v", "23: 23 w", "24: 24 x",
"25: 25 y", " "))
# print of blank-named DT (eliminating matrix notation)
# #545 (RF#5253) and part of #1523
DT <- data.table(x = 1:3)
setnames(DT, "")
test(1094.2, capture.output(print(DT)), c(" ", "1: 1", "2: 2", "3: 3"))
# DT[!TRUE] or DT[!TRUE, which=TRUE], #98. !TRUE is still a recycling operation (i.e. !(all TRUE)), not a not-join
DT <- data.table(x=1:3, y=4:6)
test(1095.1, DT[!TRUE], DT[FALSE])
test(1095.2, DT[!TRUE, which=TRUE], DT[FALSE, which=TRUE])
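# Illustrative sketch, not a numbered test: a length-1 logical in i recycles to nrow
# rather than triggering a not-join, so !TRUE selects no rows ('ex' is a throwaway table).
ex = data.table(a=1:3)
nrow(ex[TRUE])    # expected 3; TRUE recycles to every row
nrow(ex[!TRUE])   # expected 0; FALSE recycles and selects nothing
rm(ex)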
######### incremented tests by 1 as I've used 1096 for FR #2077 (above along with already existing tests 522): ###########
# roll backwards when i is keyed and rollends=FALSE
# http://stackoverflow.com/questions/18984179/roll-data-table-with-rollends
dt1 = data.table(Date=as.Date(c("2013-01-03","2013-01-07")),key="Date")[,ind:=.I]
dt2 = data.table(Date=seq(from=as.Date("2013-01-01"),to=as.Date("2013-01-10"), by="1 day"),key="Date")
test(1097, dt1[dt2,roll=-Inf,rollends=FALSE]$ind, INT(NA,NA,1,2,2,2,2,NA,NA,NA)) # now ok
test(1098, dt1[dt2,roll=-Inf,rollends=TRUE]$ind, INT(1,1,1,2,2,2,2,2,2,2)) # ok before
test(1099, dt1[dt2,roll=-Inf,rollends=c(TRUE,FALSE)]$ind, INT(1,1,1,2,2,2,2,NA,NA,NA)) # ok before
test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2,2,2)) # now ok
#########################################
# All dcast.data.table tests go in here #
#########################################
{
# On 14 March 2018, as part of pruning long tests PR#2671, Matt replaced the y value of these tests with the known result. Before that, the tests
# coerced to data.frame and compared to the result of calling the reshape2::dcast data.frame method.
# Now that we have split completely from reshape2 (which is deprecated), we no longer
# want to compare output for consistency to a formally unrelated package
# We run these routinely, in dev by cc(), on Travis (coverage) and on CRAN
# Just this first test is an extra one, originally for #825, to test reshape::cast. Matt retained the test and tweaked it to test dcast
# instead (which retains the Date class, unlike reshape::cast, it seems).
DT = data.table(
ID = c(611557L, 611557L, 611557L, 894125L, 894125L, 894125L, 894125L, 894125L, 898856L, 898856L, 898856L, 898856L, 898856L, 898856L, 898899L, 898899L, 898899L),
DATUM = structure(c(16101, 16071, 16261, 16104, 16133, 16167, 16201, 16236, 16089, 16118, 16147, 16176, 16236, 16208, 16163, 16125, 16209), class = "Date"),
N = c(25L, 9L, 23L, 29L, 26L, 26L, 27L, 28L, 39L, 39L, 38L, 36L, 40L, 39L, 19L, 20L, 19L), rank = c(2, 1, 3, 1, 2, 3, 4, 5, 1, 2, 3, 4, 6, 5, 2, 1, 3))
test(1101, dcast(DT, ID ~ rank, value.var = "DATUM"), data.table(
ID = c(611557L, 894125L, 898856L, 898899L),
"1" = as.Date(c("2014-01-01", "2014-02-03", "2014-01-19", "2014-02-24")),
"2" = as.Date(c("2014-01-31", "2014-03-04", "2014-02-17", "2014-04-03")),
"3" = as.Date(c("2014-07-10", "2014-04-07", "2014-03-18", "2014-05-19")),
"4" = as.Date(c(NA, "2014-05-11", "2014-04-16", NA)),
"5" = as.Date(c(NA, "2014-06-15", "2014-05-18", NA)),
"6" = as.Date(c(NA, NA, "2014-06-15", NA)), key="ID"))
names(ChickWeight) <- tolower(names(ChickWeight))
DT = melt(as.data.table(ChickWeight), id.vars=2:4) # calls melt.data.table
# changed 'mean' to 'sum' to avoid valgrind floating point precision based error.
test(1102.01, dcast(DT, time ~ variable, fun.aggregate=sum)[c(1,2,11,.N)], data.table(time=c(0,2,20,21),weight=c(2053,2461,9647,9841), key="time"))
test(1102.02, dcast(DT, diet ~ variable, fun.aggregate=sum), data.table(diet=factor(1:4), weight=c(22582, 14714, 17154, 15961), key="diet"))
test(1102.03, dcast(DT, diet+chick ~ time, drop=FALSE)[c(1,.N),c(1:4,13:14)],
ans<-data.table(diet=factor(c(1,4)), chick=ordered(c(18,48),levels=levels(DT$chick)), "0"=39, "2"=c(35,50), "20"=c(NA,303), "21"=c(NA,322), key="diet,chick"))
test(1102.04, dcast(DT, diet+chick ~ time, drop=FALSE, fill=0)[c(1,.N),c(1:4,13:14)], ans[1, c("20","21"):=0])
# add test for 'subset=' in dcast
test(1102.05, dcast(DT, time + chick ~ variable+diet, fun.aggregate=sum, subset=.(time> 20))[c(1,2,44,.N)],
data.table(time=21, chick=ordered(c(13,9,42,48), levels=levels(DT$chick)), weight_1=c(96,98,0,0), weight_2=0, weight_3=0, weight_4=c(0,0,281,322), key="time,chick"))
# testing without aggregation
set.seed(3)
DT = data.table(a=5:1, b=runif(5))
ans = dcast(DT, a ~ b, value.var="b")[c(4,.N), c(2,6)]
setnames(ans, substring(names(ans),1,6))
test(1102.06, ans, data.table("0.1680"=c(NA,DT[1,b]), "0.8075"=c(DT[2,b],NA)))
# Fix for case 2 in bug report #71 - dcast didn't aggregate properly when formula RHS has "."
set.seed(45)
DT = data.table(x=rep(1:5, each=3), y=runif(15, 0, 1))
test(1102.07, dcast(DT, x ~ ., mean, value.var="y")[,`.`:=as.integer(`.`*10000)], data.table(x=1:5, "."=INT(3972,3427,3224,4182,3994), key="x"))
# also quashed another bug with `.` in formula (when there's no aggregate function):
DT = data.table(a=sample(5), b=runif(5), c=5:1)
test(1102.08, dcast(DT, a ~ ., value.var="c"), data.table(a=1:5,"."=INT(3,1,5,4,2), key="a"))
test(1102.09, dcast(DT, b+a~., value.var="c")[,b:=as.integer(b*1000)][], data.table(b=INT(129,319,585,662,891), a=INT(3,4,1,5,2), "."=5:1))
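# Illustrative sketch, not a numbered test: '.' in a dcast formula stands for "no
# variable on this side", so x ~ . aggregates all rows per x into a single column
# named '.' ('ex' is a throwaway table).
ex = data.table(x=c(1,1,2), v=c(10,20,30))
dcast(ex, x ~ ., value.var="v", fun.aggregate=sum)   # columns expected: x and '.'
rm(ex)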
# more tests for `dcast` with formula being character and errors when formula is a hybrid
set.seed(1)
DT = data.table(a=rep(1:5, each=5), b=runif(25))
test(1102.11, dcast(DT, " a~ . ", value.var="b", fun.aggregate=length), data.table(a=1:5, `.`=5L, key="a"))
test(1102.12, dcast(DT, "a ~ c ", value.var="b"), error="not found or of unknown type")
test(1102.13, dcast(DT, a ~ a, value.var="c"), error="are not found in 'data'")
# fix for #47 - issue when factor columns on formula LHS along with `drop=FALSE`
set.seed(1L)
DT = data.table(a=factor(sample(letters[1:3], 10, replace=TRUE), letters[1:5]),
b=factor(sample(tail(letters, 5), 10, replace=TRUE)))
test(1102.14, dcast(DT, a~b, drop=FALSE, fun.aggregate=length, value.var="b"),
data.table(a=factor(letters[1:5]), v=INT(0,1,0,0,0), w=INT(1,1,1,0,0), x=INT(0,0,1,0,0), y=INT(2,1,1,0,0), z=INT(0,1,0,0,0), key="a"))
# reverse the levels
set.seed(1L)
DT = data.table(a=factor(sample(letters[1:3], 10, replace=TRUE), letters[5:1]),
b=factor(sample(tail(letters, 5), 10, replace=TRUE)))
test(1102.15, dcast(DT, a~b, drop=FALSE, value.var="b", fun.aggregate=length),
data.table(a=factor(c("e","d","c","b","a"),levels=levels(DT$a)), v=INT(0,0,0,1,0), w=INT(0,0,1,1,1), x=INT(0,0,1,0,0), y=INT(0,0,1,1,2), z=INT(0,0,0,1,0), key="a"))
# more factor cols
set.seed(1L)
DT = data.table(a1=factor(sample(letters[1:3], 10, replace=TRUE), letters[1:5]), # factor col 1
a2=factor(sample(letters[6:10], 10, replace=TRUE), letters[6:10]), # factor col 2
a3=sample(letters[1:3], 10, TRUE), # no factor
b=factor(sample(tail(letters, 5), 10, replace=TRUE)))
test(1102.16, dcast(DT, a1+a2+a3~b, drop=FALSE, value.var="b")[c(1,21,.N)],
data.table(a1=factor(c("a","b","e"),levels=letters[1:5]),
a2=factor(c("f","g","j"), levels=letters[6:10]),
a3=c("a","c","c"),
v=factor(NA, levels=tail(letters,5)),
x=factor(NA, levels=tail(letters,5)),
y=factor(c(NA,"y",NA), levels=tail(letters,5)),
z=factor(NA, levels=tail(letters,5)), key="a1,a2,a3"))
# dcast bug fix for 'subset' argument (the key wasn't being set before running C-fcast):
DT = data.table(x=c(1,1,1,2,2,2,1,1), y=c(1,2,3,1,2,1,1,2), z=c(1,2,3,NA,4,5,NA,NA))
test(1102.17, dcast(DT, x~y, value.var="z", subset=.(!is.na(z))), data.table(x=c(1,2), `1`=c(1,5), `2`=c(2,4), `3`=c(3,NA), key="x"))
# FR #335 and DOC #332
set.seed(1L)
DT = data.table(a=sample(10), b=2013:2014, variable=rep(c("c", "d"), each=10), value=runif(20))
test(1102.18, names(dcast(DT, a ~ ... + b, value.var="value")), c("a", "c_2013", "c_2014", "d_2013", "d_2014"))
# bug git #693 - dcast error message improvement:
DT = data.table(x=c(1,1), y=c(2,2), z = 3:4)
test(1102.19, dcast(DT, x ~ y, value.var="z", fun.aggregate=identity), error="should take vector inputs and return a single value")
# bug #688 - preserving attributes
DT = data.table(id = c(1,1,2,2), ty = c("a","b","a","b"), da = as.Date("2014-06-20"))
test(1102.20, dcast(DT, formula = id ~ ty, value.var="da"), data.table(id=c(1,2), a=as.Date("2014-06-20"), b=as.Date("2014-06-20"), key="id"))
# issues/713 - dcast and fun.aggregate
DT = data.table(id=rep(1:2, c(3,4)), k=c(rep(letters[1:3], 2), 'c'), v=1:7)
foo = function (tbl, fun.aggregate) {
dcast(tbl, id ~ k, value.var='v', fun.aggregate=fun.aggregate, fill=NA_integer_)
}
test(1102.22, foo(DT, last), dcast(DT, id ~ k, value.var='v', fun.aggregate=last, fill=NA_integer_))
# more minor changes to dcast (the 'subset' argument now handles symbols, removing any surprises with data.table's typical scoping rules) - test for that.
DT = data.table(id=rep(1:2, c(3,4)), k=c(rep(letters[1:3], 2), 'c'), v=1:7)
bla = c(TRUE, rep(FALSE, 6L))
# calling `subset=.(bla)` gives eval error when testing... not sure what's happening! Using values directly instead for now.
test(1102.23, dcast(DT, id ~ k, value.var="v", subset=.(c(TRUE, rep(FALSE, 6L)))), dcast(DT[1L], id ~ k, value.var="v"))
DT[, bla := !bla]
test(1102.24, dcast(DT, id ~ k, value.var="v", subset=.(bla), fun.aggregate=length), dcast(DT[(bla)], id ~ k, value.var="v", fun.aggregate=length))
# issues/715
DT = data.table(id=rep(1:2, c(3,2)), k=c(letters[1:3], letters[1:2]), v=1:5)
test(1102.25, dcast(DT, id ~ k, fun.aggregate=last, value.var="v"), error="should take vector inputs and return a single value")
test(1102.26, dcast(DT, id ~ k, fun.aggregate=last, value.var="v", fill=NA_integer_), data.table(id=1:2, a=c(1L, 4L), b=c(2L,5L), c=c(3L,NA_integer_), key="id"))
# Fix for #893
DT = data.table(
x = factor("a", levels = c("a", "b")),
y = factor("b", levels = c("a", "b")),
z = 1
)
test(1102.27, dcast(DT, y ~ x, drop = FALSE, value.var="z"),
data.table(y=factor(c("a","b")), a=c(NA,1), b=c(NA_real_,NA), key="y"))
# dcast.data.table new tests
# Fix for #1070 (special case of ... on LHS)
DT = data.table(label= month.abb[1:5], val=0)
test(1102.28, dcast(DT,... ~ label, value.var="val", sum),
data.table(`.`=".", Apr=0, Feb=0, Jan=0, Mar=0, May=0, key="."))
# Fix for #862 (optional prefixes)
DT = data.table(name=c("Betty","Joe","Frank","Wendy","Sally"),
address=c(rep("bla1",2), rep("bla2",2), "bla3"))
test(1102.29, dcast(DT, address ~ paste("cust", DT[, seq_len(.N), by=address]$V1, sep=""), value.var="name"),
data.table(address=paste("bla",1:3,sep=""), cust1=c("Betty", "Frank", "Sally"), cust2=c("Joe", "Wendy", NA), key="address"))
# Fix for #1037 (optional prefixes + undefined variables)
DT = data.table(V1 = c(0L, 1L, 2L, 3L, 4L, 0L, 1L, 2L, 3L, 4L),
V2 = c(1.052, 0.542, 0.496, 0.402, 0.278, 5.115, 4.329, 4.121, 4.075, 4.0088))
test(1102.30, dcast(DT, cumsum(V1 == 0) ~ V1, value.var = 'V2')[,lapply(.SD,function(x)as.integer(x*1000))],
data.table(V1=INT(1000,2000), "0"=INT(1052,5115), "1"=INT(542,4329), "2"=INT(496,4121), "3"=INT(402,4075), "4"=INT(278,4008), key="V1"))
# Implement #716 and #739 (multiple value.var and fun.aggregate)
# multiple value.var
set.seed(1)
DT = data.table(x=sample(5,20,TRUE), y=sample(2,20,TRUE),
z=sample(letters[1:2], 20,TRUE), d1 = runif(20), d2=1L)
test(1102.31, dcast(DT, x + y ~ z, fun.aggregate=sum, value.var=c("d1","d2"))[c(1,.N)][, 3:4:=lapply(.SD,round,4), .SDcols=c("d1_a","d1_b")][],
data.table(x=INT(1,5), y=INT(1,1), d1_a=c(0.0,0.4785), d1_b=c(0.8753,0.9804), d2_a=INT(0,1), d2_b=INT(1,3), key="x,y"))
# multiple fun.agg
test(1102.32, dcast(DT, x + y ~ z, fun.aggregate=list(sum, mean), value.var="d1")[c(1,.N)][, 3:6:=lapply(.SD,round,3), .SDcols=3:6][],
data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d1_mean_a=c(NaN,0.479),d1_mean_b=c(0.875,0.327), key="x,y"))
# multiple fun.agg and value.var (all combinations)
test(1102.33, dcast(DT, x + y ~ z, fun.aggregate=list(sum, mean), value.var=c("d1", "d2"))[c(1,.N)][, c(3,4,7:10):=lapply(.SD,round,3), .SDcols=c(3,4,7:10)][],
data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d2_sum_a=INT(0,1),d2_sum_b=INT(1,3),
d1_mean_a=c(NaN,0.479),d1_mean_b=c(0.875,0.327),d2_mean_a=c(NaN,1),d2_mean_b=c(1,1), key="x,y"))
# multiple fun.agg and value.var (one-to-one)
test(1102.34, dcast(DT, x + y ~ z, fun.aggregate=list(sum, mean), value.var=list("d1", "d2"))[c(1,.N)][, 3:4:=lapply(.SD,round,3), .SDcols=3:4][],
data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d2_mean_a=c(NaN,1),d2_mean_b=c(1,1), key="x,y"))
# Additional test after fixing fun.agg creation - using the example here: https://github.com/Rdatatable/data.table/issues/716
DT = data.table(x=1:5, y=paste("v", 1:5, sep=""), v1=6:10, v2=11:15, k1=letters[1:5], k2=letters[6:10])
DT.m = melt(DT, id.vars=1:2, measure.vars=list(3:4, 5:6))
test(1102.35, dcast(DT.m, x ~ y, fun.aggregate = list(sum, function(x) paste(x, collapse="")), value.var=list("value1", "value2")),
data.table(x=1:5, value1_sum_v1=INT(17,0,0,0,0), value1_sum_v2=INT(0,19,0,0,0), value1_sum_v3=INT(0,0,21,0,0),
value1_sum_v4=INT(0,0,0,23,0), value1_sum_v5=INT(0,0,0,0,25), value2_function_v1=c("af","","","",""),
value2_function_v2=c("","bg","","",""), value2_function_v3=c("","","ch","",""), value2_function_v4=c("","","","di",""),
value2_function_v5=c("","","","","ej"), key="x"))
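# Illustrative sketch, not a numbered test, of the naming scheme checked above: with a
# character value.var every fun.aggregate is applied to every value column and result
# columns are named <value.var>_<fun>_<RHS level>; a list value.var instead pairs
# functions and value columns one-to-one ('ex' is a throwaway table).
ex = data.table(id=c(1,1,2,2), g=c("a","b","a","b"), v1=1:4, v2=5:8)
names(dcast(ex, id ~ g, fun.aggregate=list(sum, mean), value.var=c("v1","v2")))
rm(ex)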
# more testing on fun.aggregate
DT = as.data.table(airquality)
ans = suppressWarnings(melt(DT, id=c("Month", "Day"), na.rm=TRUE)) # warning regards coercion to double
ans = ans[ , .(min=min(value), max=max(value)), by=.(Month, variable)]
ans = melt(ans, id.vars=1:2, variable.name="variable2")
ans = dcast(ans, Month ~ variable + variable2)
setnames(ans, c("Month", paste(names(ans)[-1L], sep="_")))
valvars = c("Ozone", "Solar.R", "Wind", "Temp")
ans2 = suppressWarnings(dcast(DT, Month ~ ., fun.aggregate=list(min, max), na.rm=TRUE, value.var=valvars))
setcolorder(ans, names(ans2))
test(1102.36, key(ans), "Month")
test(1102.37, ans, ans2[, names(ans2)[-1L] := lapply(.SD, as.numeric), .SDcols=-1L])
# test for #1210, sep argument for dcast
DT = data.table(x=sample(5,20,TRUE), y=sample(2,20,TRUE), z=sample(letters[1:2],20,TRUE), d1=runif(20), d2=1L)
test(1102.38, names(dcast(DT, x ~ y + z, fun.aggregate=length, value.var = "d2", sep=".")),
c("x", "1.a", "1.b", "2.a", "2.b"))
}
# tests for fread on shell commands
x1 <- data.table(a = c(1:5), b = c(1:5))
f <- tempfile()
write.csv(x1, f, row.names = FALSE)
if (.Platform$OS.type == "unix") {
gl = identical(Sys.getenv("CI_SERVER_NAME"), "GitLab CI")
if(gl){
# skip test which fails in CI, data.table#1506
x2 = try(fread(paste('grep -v 3 ', f, sep="")), silent = TRUE)
if(is.data.table(x2)) test(1105.1, x1[a != 3], x2)
} else {
test(1105.2, x1[a != 3], fread(cmd=paste('grep -v 3 ', f, sep="")))
}
} else {
# x2 <- fread(paste('more ', f, sep=""))
# Doesn't work on winbuilder. Relies on 'more' available in DOS via Cygwin?
# Error:
# Syntax error: end of file unexpected (expecting ")")
# Error: (converted from warning) running command 'sh.exe -c (more D:\temp\RtmpgB8D2P\file1ed828a511cd) > D:\temp\RtmpgB8D2P\file1ed84f9f44f8' had status 2
# test(1105, x1, x2)
}
unlink(f)
# test for "key" argument of [.data.table
#x1 <- data.table(a = c(1:5), b = c(5:1))
#x1[J(2), key = 'a']
#test(1106, key(x1) == 'a')
#x1[, a, key = NULL]
#test(1107, is.null(key(x1)))
# test that eval works inside expressions
DT <- data.table(a = c(1:5))
s <- quote(a)
test(1108, DT[, sum(eval(s))], DT[, sum(a)])
# test that boolean expression does not trigger a not-join
DT <- data.table(a = 1:3, b = c(TRUE,FALSE,NA))
test(1109, DT[b != TRUE], DT[!(b == TRUE)])
# test that a column named list is ok (this also affects other functions in by, might be worth adding a test for that)
DT <- data.table(list = 1:6, a = 1:2)
test(1111, DT[, lapply(.SD, sum), by = a], DT[, list(list = sum(list)), by = a])
# fix for #89. "rbind" retains key when the first argument isn't a data.table (the issue is that .rbind.data.table is never run)
DT <- data.table(name=c('Guff','Aw'),id=101:102,id2=1:2,key='id')
y <- rbind(list('No','NON',0L),DT,list('Extra','XTR',3L))
test(1112, key(y), NULL)
# fix for http://stackoverflow.com/questions/14753411/why-does-data-table-lose-class-definition-in-sd-after-group-by
# where .SD loses class information.
format.myclass <- function(x, ...){
paste("!!", x, "!!", sep = "")
}
DT <- data.table(L = rep(letters[1:3],3), N = 1:9)
setattr(DT$N, "class", "myclass")
test(1113, class(DT[, .SD, by = L]$N), class(DT$N))
setkey(DT, L)
test(1114, class(DT[, .SD, by = L]$N), class(DT$N))
test(1115, class(DT[J(unique(L)), .SD, by=.EACHI]$N), class(DT$N))
# Fix for #90 - not-join quoted expression didn't work...
dt = data.table(a = 1:2, key = 'a')
dt1 = data.table(a = 1)
expr = quote(!dt1)
test(1116, dt[eval(expr)], dt[2])
expr = quote(!1)
test(1117, dt[eval(expr)], dt[2])
# Fix for #2381 - optimisation of `DT[, lapply(.SD, function(x) FUN(x, bla)), by=key(DT)]` where "bla" is a column in DT didn't work.
set.seed(45)
dt <- data.table(x=rep(1:4, each=4), b1=sample(16), b2=runif(16))
setkey(dt, x)
test(1118, dt[, lapply(.SD, function(y) weighted.mean(y, b2, na.rm=TRUE)), by=x], dt[, lapply(.SD, weighted.mean, b2, na.rm=TRUE), by=x])
# a(nother) test of #295
DT <- data.table(x=5:1, y=1:5, key="y")
test(1119, is.null(key(DT[, list(z = y, y = 1/y)])))
## various ordered factor rbind tests
DT1 = data.table(ordered('a', levels = c('a','b','c')))
DT2 = data.table(factor('a', levels = c('b','a','f')))
DT3 = data.table(ordered('b', levels = c('b','d','c')))
DT4 = data.table(c('foo', 'bar'))
DT5 = data.table(ordered('b', levels = c('b','a')))
test(1120.1, rbind(DT1, DT2, DT3, DT4), ans<-data.table(factor(c('a','a','b','foo','bar'), levels = c('a','b','c','f','d', 'foo', 'bar'))),
warning=w<-"Column 1 of item 3.*level 2 [[]'d'[]] is missing from the ordered levels from column 1 of item 1.*regular factor")
test(1120.2, rbindlist(list(DT1, DT2, DT3, DT4)), ans, warning=w)
test(1121.1, rbind(DT1, DT5), ans<-data.table(factor(c('a','b'), levels = c('a','b','c'))), warning=w<-"'b'<'a'.*But 'a'<'b'.*regular factor")
test(1121.2, rbindlist(list(DT1, DT5)), ans, warning=w)
test(1122.1, rbind(DT2, DT2), data.table(factor(c('a','a'), levels = c('b','a','f'))))
test(1122.2, rbindlist(list(DT2, DT2)), data.table(factor(c('a','a'), levels = c('b','a','f'))))
test(1123.1, rbind(DT2,DT5), data.table(ordered(c('a','b'), levels=c('b','a','f'))))
test(1123.2, rbind(DT5,DT2), data.table(ordered(c('b','a'), levels=c('b','a','f'))))
# Old test to cover pre-PR#3455 rbindlist.c:289, #2346 (hashing CHARSXP no longer done)
set.seed(1)
manyChars = paste0("id",sample(99999,10000))
DT1 = data.table(ordered(sample(manyChars, 1000), levels=sample(manyChars)))
DT2 = data.table(factor(sample(manyChars, 1000)))
test(1125, rbindlist(list(DT1,DT2))[c(1,2,.N-1,.N),as.character(V1)], c("id85645","id80957","id73436","id33445"))
## test rbind(..., fill = TRUE)
DT = data.table(a = 1:2, b = 1:2)
DT1 = data.table(a = 3:4, c = 1:2)
test(1126, rbind(DT, DT1, fill = TRUE), data.table(a = 1:4, b = c(1:2, NA, NA), c = c(NA, NA, 1:2)))
## check for #95 - rbind-ing empty data.tables
DT = data.table(a=character())
#test(1127, rbind(DT, DT), DT)
## check for #88
DT = data.table(a=0:2,b=3:5,key="a")
test(1128, DT[, (function(){b})()], DT[, b])
## Fix for FR #358
DT <- data.table(x=1:5, y=6:10)
test(1129.1, DT[, as.factor(c("x", "y")), with=FALSE], DT)
test(1129.2, DT[, as.factor(c("x", "x")), with=FALSE], DT[, list(x, x)])
# Fix for a specific case that results in error in `construct` function in data.table.R (found and fixed during #87 bug fix)
MyValueIsTen <- 10
set.seed(1)
DT <- data.table(ID=sample(LETTERS[1:3], 6, TRUE), Value1=rnorm(6), Value2=runif(6))
cols <- c("Value1", "Value2")
DT2 <- copy(DT)
test(1130, DT[, (cols) := lapply(.SD, function(x) MyValueIsTen), by=ID], DT2[, (cols) := 10])
# Fix for #87 - The value MyValueIsTen = 10 was never recognised (value within the function environment)
MyValueIsTen <- 5
set.seed(1)
DT <- data.table(ID=sample(LETTERS[1:3], 6, TRUE), Value1=rnorm(6), Value2=runif(6))
My_Fun <- function(x=copy(DT)) {
MyValueIsTen <- 10
cols <- c("Value1", "Value2")
x[, (cols) := lapply(.SD, function(x) MyValueIsTen), by=ID]
}
DT[, (cols) := 10]
test(1131, My_Fun(), DT)
# Test for #96 - where `j` doesn't know `.N` when used with `lapply(.SD, function(x) ...)`
test(1132, DT[, lapply(.SD, function(x) .N), by=ID], data.table(ID=c("A", "B", "C"), Value1=2L, Value2=2L))
# Test for #91 - `:=` recycle error during 'by'
DT <- data.table(x=INT(1,1,1,1,1,2,2))
# on a new column
test(1133.1, DT[, new := c(1,2), by=x], error="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'")
test(1133.2, DT, data.table(x=INT(1,1,1,1,1,2,2)))
# on an already existing column:
DT[, new:=99L]
test(1133.3, DT[, new := c(1,2), by=x], error="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'")
test(1133.4, DT[, new := c(1L,2L), by=x], error="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'")
test(1133.5, DT, data.table(x=INT(1,1,1,1,1,2,2), new=99L))
test(1133.6, DT[, new := rep(-.GRP, .N), by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(-1,-1,-1,-1,-1,-2,-2)))
test(1133.7, DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2)))
# on a new column with warning on 2nd assign
DT[,new:=NULL]
test(1133.8, DT[, new := if (.GRP==1L) 7L else 3.4, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(7,7,7,7,7,3,3)),
warning="Group 2 column 'new': 3.4.*double.*at RHS position 1 truncated.*precision lost.*integer")
# Fix for FR #2496 - catch `{` in `:=` expression in `j`:
DT <- data.table(x=c("A", "A", "B", "B"), val =1:4)
DT2 <- copy(DT)[, a := 1L]
test(1134.1, DT[, {a := 1L}], DT2)
test(1134.2, DT[, {a := 1L; NULL}], error="You have wrapped.*which is ok.*Consider")
test(1134.3, DT[, {b := 2L}, by=x], DT2[, b:=2L, by=x])
test(1134.4, DT[, {b := 2L; sum(val)}, by=x], error="You have wrapped.*which is ok.*Consider")
# FR #2693 and Gabor's suggestions on datatable-help "Problem with FAQ 2.8"
d1 <- data.table(id1 = c(1L, 2L, 2L, 3L), val = 1:4, key = "id1")
d2 <- data.table(id2 = c(1L, 2L, 4L), val2 = c(11, 12, 14),key = "id2")
d3 <- copy(d2)
setnames(d3, names(d1))
test(1136.01, d1[d2, id1], INT(1,2,2,4))
test(1136.02, d1[d2, id1], d1[d2][,id1])
test(1136.03, d1[d2, id2], INT(1,2,2,4))
test(1136.04, d1[d2, id2], d1[d2, list(id1,id2,val,val2)][,id2])
test(1136.05, d1[d3, i.id1], INT(1,2,2,4))
test(1136.06, d1[d3, i.id1], d1[d3, list(id1,i.id1)][,i.id1])
test(1136.07, d1[d2, val], c(1:3, NA))
test(1136.08, d1[d2, val2], c(11,12,12,14))
test(1136.09, d1[d3, list(id1, val, i.val)], data.table(id1=INT(1,2,2,4), val=c(1:3, NA), i.val=c(11,12,12,14), key="id1"))
test(1136.10, d1[d3, list(id1, i.id1, val, i.val)], data.table(id1=INT(1,2,2,4),
i.id1=INT(1,2,2,4), val=c(1:3, NA), i.val=c(11,12,12,14), key="id1"))
test(1136.11, d1[d2], data.table(id1=INT(1,2,2,4), val=c(1:3, NA), val2=c(11,12,12,14), key="id1"))
test(1136.12, d1[J(2), id1], INT(2,2))
test(1136.13, d1[J(2), i.id1], error="not found")
DT <- data.table(x=c("A", "A", "C", "C"), y=1:4, key="x")
test(1136.14, DT["C", i.x], error="not found")
# test for FR #355
DT <- data.table(x=1:5, y=6:10, z=11:15)
test(1137.01, DT[, .SD, .SDcols=-1L], DT[, 2:3, with=FALSE])
test(1137.02, DT[, .SD, .SDcols=-(1:2)], DT[, 3, with=FALSE])
test(1137.03, DT[, .SD, .SDcols=-"y"], DT[, c(1,3), with=FALSE])
test(1137.04, DT[, .SD, .SDcols=-c("y", "x")], DT[, 3, with=FALSE])
test(1137.05, DT[, .SD, .SDcols=-which(names(DT) %in% c("x", "y", "z"))], null.data.table())
test(1137.06, DT[, .SD, .SDcols=c(1, -2)], error=".SDcols is numeric but has both")
test(1137.07, DT[, .SD, .SDcols=c("x", -"y")], error="invalid argument to unary")
test(1137.08, DT[, .SD, .SDcols=c(-1, "x")], error="Some items of .SDcols are")
DT <- data.table(x=1:5, y=6:10, z=11:15, zz=letters[1:5])
test(1137.09, DT[, .SD, .SDcols=-grep("^z", names(DT))], DT[, 1:2, with=FALSE])
test(1137.10, DT[, .SD, .SDcols=-grep("^z", names(DT), value=TRUE)], DT[, 1:2, with=FALSE])
test(1137.11, DT[, .SD, .SDcols=-grep("^z", names(DT), value=TRUE, invert=TRUE)], DT[, 3:4, with=FALSE])
set.seed(45)
DT = data.table(x=c("A", "A", "C", "C"), y=1:4, z=runif(4))
test(1137.12, DT[, lapply(.SD, sum), by=x, .SDcols=-"y"], DT[, lapply(.SD, sum), by=x, .SDcols="z"])
# test for FR #353 / R-Forge #353 - print.data.table gets new argument "row.names", default=TRUE. If FALSE, the row names don't get printed
# Thanks to Eddi for `capture.output` function!
DT <- data.table(x=1:5, y=6:10)
test(1138.1, capture.output(print(DT, row.names=FALSE)), c(" x y", " 1 6", " 2 7", " 3 8", " 4 9", " 5 10"))
DT <- data.table(x=1:101, y=6:106) # bug described in #1307
test(1138.2, capture.output(print(DT, row.names=FALSE)), c(" x y", " 1 6", " 2 7", " 3 8", " 4 9", " 5 10", "--- ", " 97 102", " 98 103", " 99 104", " 100 105", " 101 106"))
# test for FR #2591 (format.data.table issue with column of class "formula")
DT <- data.table(x=c(a~b, c~d+e), y=1:2)
test(1139, capture.output(print(DT)), c(" x y", "1: a ~ b 1", "2: c ~ d + e 2"))
# FR #362 - provide warnings if recycling leaves a remainder, for both as.data.table.list(.) and data.table(.)
X = list(a = 1:2, b = 1:3)
test(1140, as.data.table(X), data.table(a=c(1:2,1L), b=c(1:3)), warning="Item 1 has 2 rows but longest item has 3; recycled")
test(1141.1, data.table(a=1:2, b=1:3), data.table(a=c(1L,2L,1L), b=1:3), warning="Item 1 has 2 rows but longest item has 3; recycled")
test(1141.2, data.table(a=1:2, data.table(x=1:5, y=6:10)), data.table(a=c(1L,2L,1L,2L,1L), x=1:5, y=6:10), warning="Item 1 has 2 rows but longest item has 5; recycled")
test(1141.3, data.table(a=1:5, data.table(x=c(1,2), y=c(3,4))), data.table(a=c(1:5), x=c(1,2,1,2,1), y=c(3,4,3,4,3)), warning="Item 2 has 2 rows but longest item has 5; recycled")
# Fix for bug #79 - DT[, foo()] returns function definition.
DT <- data.table(a=1:2)
foo <- function() sum(1:5)
test(1142, DT[, foo()], 15L)
# Fix for bug #77 - side-effect of fixing #2531 - `:=` with grouping (by) and assigning factor columns
DT <- data.table(x=c(1,1,1,2,2), y=factor(letters[1:5]))
test(1143.1, DT[, z := y, by=x], data.table(x=c(1,1,1,2,2), y=factor(letters[1:5]), z=factor(letters[1:5])))
# Added 3 more tests to close bug #36 - partial regression due to recent changes (in 1.9.2)
# This should catch any attributes being lost hereafter.
DT<-data.table(X=factor(2006:2012),Y=rep(1:7,2))
test(1143.2, DT[, Z:=paste(X,.N,sep=" - "), by=list(X)], data.table(X=factor(2006:2012),Y=rep(1:7,2), Z=paste(as.character(2006:2012), 2L, sep=" - ")))
DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L))
test(1143.3, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x))
ans = copy(DT)
test(1143.4, DT[,`:=`(lx=tail(x,1L)), by=y], ans[, lx := x])
# FR #2356 - retain names of named vector as column with keep.rownames=TRUE
x <- 1:5
setattr(x, 'names', letters[1:5])
test(1144.1, as.data.table(x, keep.rownames=TRUE), data.table(rn=names(x), x=unname(x)))
x <- as.numeric(x)
setattr(x, 'names', letters[1:5])
test(1144.2, as.data.table(x, keep.rownames=TRUE), data.table(rn=names(x), x=unname(x)))
x <- as.character(x)
setattr(x, 'names', letters[1:5])
test(1144.3, as.data.table(x, keep.rownames=TRUE), data.table(rn=names(x), x=unname(x)))
x <- as.factor(x)
setattr(x, 'names', letters[1:5])
test(1144.4, as.data.table(x, keep.rownames=TRUE), data.table(rn=names(x), x=unname(x)))
x <- as.Date(1:5, origin="2013-01-01")
setattr(x, 'names', letters[1:5])
test(1144.5, as.data.table(x, keep.rownames=TRUE), data.table(rn=names(x), x=unname(x)))
# Fix for bug #75 - .data.table.locked issue
DT <- data.table(x=1:5, y=6:10)
xx <- DT[, .SD, .SDcols="y"]
test(1145, xx[, y := as.numeric(y)], data.table(y = as.numeric(6:10)))
# Fix for bug #74 - set() not adding columns to a class that builds on data.table
DT <- as.data.table(BOD)
ans = copy(DT)[, Time := as.numeric(Time)]
setattr(DT, "class", c("myclass", class(DT)))
setattr(ans, 'class', class(DT))
test(1146.1, DT[, Time:= as.numeric(Time)], ans)
DF <- as.data.frame(DT)
test(1146.2, {set(DF, i=NULL, j=1L, value=seq_len(nrow(DF)));setattr(DF,"reference",NULL);DF}, data.frame(Time=1:nrow(BOD), demand=BOD$demand))
test(1146.3, set(DF, i=NULL, j="bla", value=seq_len(nrow(DF))), error="set() on a data.frame is for changing existing columns, not adding new ones. Please use a data.table for that.")
if (test_longdouble) {
# e.g. not on CRAN's solaris-sparc 32bit, and not under valgrind which uses 53 instead of 64 longdouble.digits
old = getNumericRounding()
set.seed(6)
x = rnorm(1e6)*1e4
ans = base::sort.list(x, method="shell")
setNumericRounding(0)
test(1147.1, ans, forderv(x))
setNumericRounding(1)
test(1147.2, ans, forderv(x))
setNumericRounding(2)
test(1147.3, sum(ans != forderv(x)), 2L)
tol = 3.000214e-13
x = c(8, NaN, Inf, -7.18918, 5.18909+0.07*tol, NA, -7.18918111, -Inf, NA, 5.18909, NaN, 5.18909-1.2*tol, 5.18909-0.04*tol)
test(1147.4, binary(x[c(5,10,12,13)]),
c("0 10000000001 010011000001101000001100111100011000 00000000 11000000",
"0 10000000001 010011000001101000001100111100011000 00000000 10101000",
"0 10000000001 010011000001101000001100111100010111 11111111 00010011",
"0 10000000001 010011000001101000001100111100011000 00000000 10011010"))
setNumericRounding(0)
test(1147.5, forderv(x), INT(6, 9, 2, 11, 8, 7, 4, 12, 13, 10, 5, 1, 3))
setNumericRounding(1)
test(1147.6, forderv(x), INT(6, 9, 2, 11, 8, 7, 4, 12, 5, 10, 13, 1, 3))
setNumericRounding(2)
test(1147.7, forderv(x), INT(6, 9, 2, 11, 8, 7, 4, 5, 10, 12, 13, 1, 3))
# rounding at bit 48 rounds item 12, it doesn't just truncate
setNumericRounding(old)
}
test(1149.1, forderv(integer(0)), integer(0))
test(1149.2, forderv(numeric(0)), integer(0))
# test uniqlengths
set.seed(45)
x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE)
ox <- forderv(x)
o1 <- uniqlist(list(x), ox)
test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
o1 <- uniqlist(list(x))
test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
rm(list=c("x","ox","o1"))
gc()
# #67 fix - grouping with .SDcols gave "symbol not subsettable" error - consequence of FR #355 implementation
dt = data.table(grp = sample(letters[1:3],20, replace = TRUE), v1 = rnorm(20), v2 = rnorm(20))
sd.cols <- "v1"
test(1152, dt[, lapply(.SD, mean), by=grp, .SDcols=sd.cols], dt[, list(v1=mean(v1)), by=grp])
# #68 fix - setattr with a non-character attribute name led to segfault
x <- FALSE
test(1153, setattr(x, FALSE, FALSE), error="Attribute name must be")
# Fixed binary search capabilities for NA (for int and double) and NaN (for double):
set.seed(1)
DT <- data.table(x=sample(c(NA, NaN, Inf, 1:10), 100, TRUE), y=sample(c(NA, 1:10), 100, TRUE), z=sample(c(NA_character_, letters[1:10]), 100, TRUE))
setkey(DT, x)
test(1154.1, DT[J(NaN)], DT[is.nan(x)])
test(1154.2, DT[J(NA_real_)], DT[is.na(x) & !is.nan(x)])
setkey(DT, y)
test(1154.3, setcolorder(DT[J(NA_integer_)], c("x", "y", "z")), DT[is.na(y)])
setkey(DT, z)
test(1154.4, setcolorder(DT[J(NA_character_)], c("x", "y", "z")), DT[is.na(z)])
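# Illustrative sketch, not a numbered test: in a keyed double column NA sorts before
# NaN and the two remain distinct lookup values ('ex' is a throwaway table).
ex = data.table(x=c(2, NaN, NA, 1), key="x")
ex$x                     # expected order: NA, NaN, 1, 2
nrow(ex[J(NaN)])         # expected 1; matches only the NaN row
nrow(ex[J(NA_real_)])    # expected 1; matches only the NA row
rm(ex)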
# Fixing the binary search above for NA/NaN also fixes BUG #100
dt1 <- data.table(x = c('red','orange','green'), y=c(1,2,NA), key='y')
dt2 <- data.table(y = c(1,2,3,NA), z = c('a','b','c','missing data'), key='y')
test(1155.1, merge(dt1, dt2, by=c('y')), data.table(y=dt1$y, x=dt1$x, z=dt2$z[1:3], key="y"))
test(1155.2, dt2[dt1], data.table(y=dt1$y, z=dt2$z[1:3], x=dt1$x, key="y"))
test(1155.3, dt1[dt2, nomatch=0L], data.table(x=dt1$x, y=dt1$y, z=dt2$z[1:3], key="y"))
# NaN wasn't properly searched for in some cases. Fixed that. Here's the test for the fix.
dt <- structure(list(x = c(NaN, NaN, NaN, NaN, NaN, NA, NA, -3, -3,
-3, -2, -2, -1, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 3, 3), y = c(16L,
25L, 23L, 17L, 21L, 11L, 13L, 15L, 1L, 6L, 4L, 18L, 7L, 3L, 12L,
24L, 2L, 10L, 20L, 14L, 9L, 19L, 8L, 22L, 5L)), .Names = c("x",
"y"), row.names = c(NA, -25L), class = c("data.table", "data.frame"
))
setkey(dt, x)
test(1155.4, dt[J(NaN)], dt[is.nan(x)])
test(1155.5, dt[J(NA_real_)], dt[is.na(x) & !is.nan(x)])
# Fix for (usually small) memory leak when grouping, #2648.
# Deliberate worst case: largest group (100000 rows) followed last by a small group (1 row).
DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001))
before = gc()["Vcells",2]
for (i in 1:50) DT[, sum(B), by=A]
after = gc()["Vcells",2]
test(1157, after < before+3) # +3 = 3MB
# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case.
# Similar for when dogroups writes less rows than allocated, #2648.
DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4))
before = gc()["Vcells",2]
for (i in 1:50) DT[ , unlist(.SD), by = 'k']
after = gc()["Vcells",2]
test(1158, after < before+3) # 177.6MB => 179.2MB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024
# tests for 'setDT' - convert list, DF to DT without copy
x <- data.frame(a=1:4, b=5:8)
test(1159.1, setDT(x), data.table(a=1:4, b=5:8))
x <- list(1:4, 5:8)
test(1159.2, setDT(x), data.table(1:4, 5:8))
x <- list(a=1:4, b=5:8)
test(1159.3, setDT(x), data.table(a=1:4, b=5:8))
x <- list(a=1:4, 5:8)
test(1159.4, setDT(x), setnames(data.table(1:4, 5:8), c("a", "V1")))
x <- data.table(a=1:4, b=5:8)
test(1159.5, setDT(x), data.table(a=1:4, b=5:8))
x <- 1:5
test(1159.6, setDT(x), error="Argument 'x' to 'setDT' should be a")
x <- list(1, 2:3)
test(1159.7, setDT(x), error="All elements in argument 'x' to 'setDT'")
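# Illustrative sketch, not a numbered test: setDT() converts by reference, so the
# column vectors are not copied; 'ex_df' and 'ex_col' are throwaway names and
# address() is data.table's helper returning an object's memory address.
ex_df = data.frame(a=1:3, b=4:6)
ex_col = address(ex_df$a)
setDT(ex_df)
is.data.table(ex_df)                   # expected TRUE
identical(ex_col, address(ex_df$a))    # expected TRUE; the column was not copied
rm(ex_df, ex_col)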
# test 1160 was for setrev, now removed
# tests for setreordervec
# integer
x <- sample(c(-10:10, NA), 100, TRUE)
o <- base::order(x, na.last=FALSE)
y <- copy(x)
setreordervec(y, o)
test(1161.1, x[o], y)
# numeric
x <- sample(c(NA, rnorm(10)), 100, TRUE)
o <- base::order(x, na.last=FALSE)
y <- copy(x)
setreordervec(y, o)
test(1161.2, x[o], y)
# character
x <- sample(c(NA, letters), 100, TRUE)
o <- base::order(x, na.last=FALSE)
y <- copy(x)
setreordervec(y, o)
test(1161.3, x[o], y)
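# Illustrative sketch, not a numbered test: setreordervec() permutes a vector in place
# (by reference), giving the same result as y[o] without allocating a new vector
# ('ex', 'ex_o' and 'ex_sorted' are throwaway names).
ex = sample(10L)
ex_o = base::order(ex)
ex_sorted = copy(ex)
setreordervec(ex_sorted, ex_o)
identical(ex_sorted, ex[ex_o])   # expected TRUE
rm(ex, ex_o, ex_sorted)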
# tests for is.sorted and forderv
DT <- data.table(x=sample(c(NA, -10:10), 2e2, TRUE),
y=sample(c(NA, NaN, -Inf, Inf, -10:10), 2e2, TRUE),
z=sample(c(NA, letters), 2e2, TRUE))
# when not sorted, should return FALSE
test(1162.01, is.sorted(DT[[1L]]), FALSE)
setkey(DT, x)
test(1162.02, is.sorted(DT[[1L]]), TRUE)
test(1162.03, is.sorted(DT[[2L]]), FALSE)
setkey(DT, y)
test(1162.04, is.sorted(DT[[2L]]), TRUE)
test(1162.05, is.sorted(DT[[3L]]), FALSE)
setkey(DT, z)
test(1162.06, is.sorted(DT[[3L]]), TRUE)
setkey(DT, x, y)
test(1162.07, length(forderv(DT, by=1:2)), 0L)
setkey(DT, x, z)
test(1162.08, length(forderv(DT, by=c(1L, 3L))), 0L)
setkey(DT, y, z)
test(1162.09, length(forderv(DT, by=2:3)), 0L)
setkey(DT)
# NB: if test 1162.10 fails it confusingly prints out as 1162.1 not 1162.10
test(1162.10, length(forderv(DT, by=1:3)), 0L)
test(1162.11, is.sorted(DT, by=1:3), TRUE)
test(1162.12, is.sorted(DT, by=2:1), FALSE)
test(1162.13, is.sorted(DT), TRUE)
DT = data.table(A=INT(1,1,2), B=c(NA,"a",NA))
test(1162.14, is.sorted(DT), TRUE)
test(1162.15, is.sorted(DT, by=c("B","A")), FALSE)
DT = data.table(A=INT(1,1,2), B=c("a",NA,NA))
test(1162.16, is.sorted(DT), FALSE)
test(1162.17, is.sorted(DT, by=2), FALSE)
if (test_bit64) {
DT[, A:=as.integer64(A)]
test(1162.18, is.sorted(DT, by="A"), TRUE) # tests the single-column special case
test(1162.19, is.sorted(DT), FALSE) # tests the 2-column case branch for integer64
DT[2, B:="b"]
test(1162.20, is.sorted(DT), TRUE)
}
utf8_strings = c("\u00a1tas", "\u00de")
latin1_strings = iconv(utf8_strings, from="UTF-8", to="latin1")
DT = data.table(A=c(utf8_strings, latin1_strings), B=1:4)
test(1162.21, is.sorted(DT), FALSE)
setkey(DT)
test(1162.22, is.sorted(DT), TRUE)
# FR #351 - last on length=0 arguments
x <- character(0)
test(1163, last(x), character(0))
# Test 1164 was a non-ASCII test, now in DtNonAsciiTests
# Bug fix for #73 - segfault when rbindlist on empty data.tables
x <- as.data.table(BOD)
y <- copy(x)
test(1165.1, attr(ans<-x[Time>100], "reference"), "A1.4, p. 270") # DT[] retains attribute as from v1.12.0
attr(ans, "reference") <- NULL
test(1165.2, ans, rbindlist(list(x[Time > 100], y[Time > 200]))) # rbindlist drops attribute (no change)
# Bug fix for #63 - rbind(DT, NULL) should not result in error, but BOD has an attribute as well, which won't be preserved (due to C-impl). Changing test.
setattr(x <- as.data.table(BOD), 'reference', NULL)
test(1166, x, rbind(x, NULL))
# fix for bug #60 - ordering with multiple columns in which at least one of them is a logical column
foo = data.table(a=rep(c(0L,1L,0L,1L),2), b=rep(c(TRUE,TRUE,FALSE,FALSE),2), c=1L)
test(1167, foo[, .N, by=list(b,a)], data.table(b=c(TRUE, TRUE, FALSE, FALSE), a=c(0L,1L,0L,1L), N=2L))
# fix for bug #55 - rbindlist with factor columns and empty data.tables resulted in error.
A <- data.table(x=factor(1), key='x')
B <- data.table(x=factor(), key='x')
test(1168.1, rbindlist(list(B,A)), data.table(x=factor(1)))
# fix for bug #72, it's related to rbind and factors as well - more or less similar to 1168.1 (#55).
tmp1 <- as.data.table(structure(list(Year = 2013L, Maturity = structure(1L, .Label = c("<1",
"1.0 - 1.5", "1.5 - 2.0", "2.0 - 2.5", "2.5 - 3.0", "3.0 - 4.0",
"4.0 - 5.0", ">5.0"), class = "factor"), Quality = structure(2L, .Label = c(">BBB",
"BBB", "BB", "B", "CCC", "<CCC", "NR", "CASH"), class = c("ordered",
"factor")), Ct = 2L, Wt = 1.56, CtTotRet = 1.08, TotRet = 69.2307692307692), .Names = c("Year",
"Maturity", "Quality", "Ct", "Wt", "CtTotRet", "TotRet"), class = c("data.table",
"data.frame"), row.names = c(NA, -1L)))
tmp2 <- as.data.table(structure(list(Year = 2013L, Maturity = "TOTAL", Quality = "TOTAL",
Ct = 214L, Wt = 100.001, CtTotRet = 406.26, TotRet = 406.255937440626), .Names = c("Year",
"Maturity", "Quality", "Ct", "Wt", "CtTotRet", "TotRet"), class = c("data.table",
"data.frame"), row.names = c(NA, -1L)))
# "TOTAL" is added to the end of the ordered levels and ordered factor retained
test(1168.2, as.data.frame(rbind(tmp1,tmp2)), rbind(as.data.frame(tmp1), as.data.frame(tmp2)))
# checks of "" and NA_character_ ordering.
test(1169, forderv(c(NA,"","a","NA")), INT(1,2,4,3)) # data.table does ascii ordering currently, so N comes before a
test(1170, length(forderv(c(NA,"","NA","a"))), 0L)
test(1171, forderv(c("",NA,"a","NA")), INT(2,1,4,3))
test(1172, length(forderv(NA_character_)), 0L)
test(1173, length(forderv(c(NA_character_,NA_character_))), 0L)
test(1174, length(forderv(c(NA_character_,NA_character_,NA_character_))), 0L)
test(1175, length(forderv("")), 0L)
test(1176, length(forderv(c("",""))), 0L)
test(1177, length(forderv(c("","",""))), 0L)
test(1178, forderv(c("",NA,"")), INT(2,1,3))
# Test no invalid sort order warning when key is ok and 2nd column is character/double
DT = CJ(a=rep(1:3),b=c("a","b"))
test(1179.1, key(DT), c("a","b"))
test(1179.2, setkey(DT), DT) # i.e. no warning
DT = CJ(a=rep(1:3),b=c(3.14,3.15))
test(1180.1, key(DT), c("a","b"))
test(1180.2, setkey(DT), DT) # i.e. no warning
# test for iradix (NA and negatives). Tests need large range to trigger iradix.
test(1181, forderv(INT(1,3,5000000,NA)), INT(4,1,2,3))
test(1182, forderv(INT(1,-1,5000000,NA)), INT(4,2,1,3))
test(1183, forderv(INT(-3,-7,1,-6000000,NA,3,5000000,NA,8)), INT(5,8,4,2,1,3,6,9,7))
# tests of gsum and gmean with NA
DT = data.table(x=rep(c("a","b","c","d"),each=3), y=c(1L,3L,6L), v=as.numeric(1:12))
set(DT,c(3L,8L),"y",NA)
set(DT,c(5L,9L),"v",NA)
set(DT,10:12,"y",NA)
set(DT,10:12,"v",NA)
options(datatable.optimize=1) # turn off GForce
test(1184.1, DT[, sum(v), by=x, verbose=TRUE], output="(GForce FALSE)")
test(1184.2, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)")
test(1185.1, DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x],
data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0)))
options(datatable.optimize=0) # turn off fastmean optimization to get a reference answer to match against
test(1185.2, ans <- DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], output="All optimizations.*off")
options(datatable.optimize=1) # turn on old fastmean optimization only
test(1185.3, DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], ans, output="Old mean.*changed j")
options(datatable.optimize=Inf) # turn on GForce
test(1185.4, DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], ans, output="GForce optimized j to")
test(1186, DT[, sum(v), by=x, verbose=TRUE], output="GForce optimized j to")
test(1187.1, DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x],
data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0)))
MyVar = TRUE
test(1187.2, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output="GForce optimized j to",
DT[, list(sum(y,na.rm=TRUE), mean(y,na.rm=TRUE)), by=x])
test(1187.3, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output="GForce optimized j to",
DT[, mean(y,na.rm=TRUE), by=x])
MyVar = FALSE
test(1187.4, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output="GForce optimized j to",
DT[, list(sum(y,na.rm=FALSE), mean(y,na.rm=FALSE)), by=x])
test(1187.5, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output="GForce optimized j to",
DT[, mean(y,na.rm=FALSE), by=x])
# GForce should not turn on when the .ok function isn't triggered
test(1187.6, DT[, mean(y, trim=.2), by=x, verbose=TRUE],
data.table(x = c("a", "b", "c", "d"), V1 = c(NA, 3.33333333333333, NA, NA)),
output='j unchanged', warning="'trim' is not yet optimized")
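# A minimal standalone sketch (not one of the numbered tests) of the datatable.optimize levels used above:
# level 1 applies the old-style optimizations only (verbose reports "GForce FALSE"), while Inf enables GForce
# (verbose reports "GForce optimized j to ..."). 'demo_gf' is a hypothetical throwaway table; the option is restored afterwards.
local({
  demo_gf = data.table(g=c("a","a","b"), v=c(1,2,3))
  old = options(datatable.optimize=1L)                         # old-style optimizations only, no GForce
  v1 = capture.output(demo_gf[, sum(v), by=g, verbose=TRUE])   # verbose goes to stdout, so capture it
  options(datatable.optimize=Inf)                              # full optimization, GForce on
  v2 = capture.output(demo_gf[, sum(v), by=g, verbose=TRUE])
  options(old)                                                 # restore whatever the suite had set
  stopifnot(any(grepl("GForce FALSE", v1, fixed=TRUE)),
            any(grepl("GForce optimized j to", v2, fixed=TRUE)))
})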
# test from Zach Mayer
a <- c("\"\"\"\") \" \" \" \" \"", "\"\"\"\") \" \" \" \" \"", "\"\"\"\") \" \" \" \" ,\"")
test(1188, forderv(a), INT(1,3,2))
# test as.ITime vectorization
# and that the 12 seconds here aren't silently chopped just because the first is formatted %H:%M
x = c("18:00", "18:00:12")
test(1189, as.character(as.ITime(x)), c("18:00:00", "18:00:12"))
# that CJ() orders in the same order as setkey, #50
DT = CJ(c("Corp", "CORP"), 1:3)
test(1190, setkey(DT), DT) # tests no warning here from setkey, was "key rebuilt" due to inconsistent locale sorting in v1.8.10
# non-exact recycling in j results is caught with an error rather than silently recycled
DT = data.table(a=1:2, b=1:6)
test(1191, DT[, list(b,1:2), by=a], error="Supplied 2 items for column 2 of group 1 which has 3 rows.")
# that twiddle is used consistently, and tolerance has gone.
# nice example from : http://stackoverflow.com/questions/21885290/data-table-roll-nearest-returns-multiple-results
x = 0.0275016249293408
DT = data.table(rnk = c(0, 0.0909090909090909, 0.181818181818182, 0.272727272727273),
val = c(0.0233775088495975, 0.0270831481152598, 0.0275016216267234, 0.0275016249293408))
# 2 byte rounding is about 11 s.f., so val[3] and val[4] are considered different
test(1192, DT[,.N,keyby=val], setkey(DT,val)[,.N,by=val]) # tests uniqlist uses twiddle
test(1193, DT[,.N,by=val]$N, INT(1,1,1,1))
test(1194, DT[.(x),.N], 1L) # tests bmerge uses twiddle
DT[3, val:=0.0275016249291963]
setkey(DT, NULL) # val[3] and val[4] are now equal, within 2 byte rounding
test(1195, DT[,.N,keyby=val], setkey(DT,val)[,.N,by=val])
old_rounding = getNumericRounding() # default is 0
test(1196.1, DT[,.N,by=val]$N, INT(1,1,1,1))
test(1196.2, DT[.(x),.N], 1L)
setNumericRounding(2L)
test(1197.1, DT[,.N,by=val]$N, INT(1,1,2))
test(1197.2, DT[.(x),.N], 2L)
setNumericRounding(old_rounding)
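# A small standalone sketch of the numeric rounding behaviour exercised above; 'demo_round' holds two hypothetical
# values that differ only around the 12th significant figure (the same pair as val[3]/val[4] after the := above).
# With the default of 0 bytes rounded they group separately; rounding the last 2 bytes (~11 s.f.) makes them group together.
local({
  demo_round = data.table(val = c(0.0275016249291963, 0.0275016249293408))
  old = getNumericRounding()
  setNumericRounding(0L)
  n_full = demo_round[, .N, by=val]$N          # c(1L, 1L): treated as distinct
  setNumericRounding(2L)
  n_rounded = demo_round[, .N, by=val]$N       # 2L: treated as equal
  setNumericRounding(old)
  stopifnot(identical(n_full, c(1L,1L)), identical(n_rounded, 2L))
})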
DT = data.table(id=INT(1,2,1), val1=3:1, val2=3:1, val3=list(2:3,4:6,7:10)) # 5380
test(1199.1, DT[, sum(.SD), by=id, .SDcols=2:3], data.table(id=1:2, V1=INT(8,4))) #875 made the .SD case work
test(1199.2, DT[, sum(.SD), by=id], error="only defined on a data frame with all numeric variables")
test(1199.3, DT[, sum(val3), by=id], error="Type 'list' not supported by GForce sum [(]gsum[)]. Either.*or turn off")
# Selecting columns copies them, to keep the same behaviour as R <= 3.0.2 and R-devel, for now.
# Otherwise e.g. setkey would change the original columns too. TO DO: could allow shallow copy, perhaps.
DT = data.table(a=1:3, b=6:4, c=7:9)
test(1200, address(DT[,"b",with=FALSE]$b) != address(DT$b))
test(1201, address(DT[,c("b","c"),with=FALSE]$c) != address(DT$c))
test(1202, address(DT[,list(b)]$b) != address(DT$b))
test(1203, address(DT[,list(b,c)]$c) != address(DT$c))
test(1204, address(DT[1:3,"b",with=FALSE]$b) != address(DT$b))
test(1205, address(DT[TRUE,"b",with=FALSE]$b) != address(DT$b))
DT = data.table(a=6:1, b=1:2)
test(1206, DT[order(b,a)], data.table(a=INT(2,4,6,1,3,5),b=INT(1,1,1,2,2,2)))
# Test joining to Inf, -Inf and mixed non-finites, and grouping
old_rounding = getNumericRounding()
DT = data.table(A=c(1,2,-Inf,+Inf,3,-1.1,NaN,NA,3.14,NaN,2.8,NA), B=1:12, key="A")
for (i in 0:1) { # tests 1207 & 1208
setNumericRounding(if (i==0L) 0L else 2L)
test(1207+i+0.1, DT[.(c(NA_real_,Inf)),B], INT(8,12,4))
test(1207+i+0.2, DT[.(c(Inf,NA_real_)),B], INT(4,8,12))
test(1207+i+0.3, DT[.(c(NaN,NA_real_)),B], INT(7,10,8,12))
test(1207+i+0.4, DT[.(c(NA_real_,NaN)),B], INT(8,12,7,10))
test(1207+i+0.5, DT[,sum(B),by=A]$V1, INT(20,17,3,6,1,2,11,5,9,4))
test(1207+i+0.6, DT[,sum(B),by=list(g=abs(trunc(A)))], data.table(g=c(NA,NaN,Inf,1,2,3),V1=INT(20,17,7,7,13,14)))
test(1207+i+0.7, DT[,sum(B),keyby=list(g=abs(trunc(A)))], data.table(g=c(NA,NaN,1,2,3,Inf),V1=INT(20,17,7,13,14,7),key="g"))
# test(1207+i+0.8, DT[.(-200.0),roll=TRUE]$B, 3L) # TO DO: roll to -Inf. Also remove -Inf and test rolling to NaN and NA
}
setNumericRounding(old_rounding)
# that fread reads unescaped (but balanced) quotes in the middle of fields ok, #2694
test(1215,
fread('N_ID VISIT_DATE REQ_URL REQType\n175931 2013-03-08T23:40:30 http://aaa.com/rest/api2.do?api=getSetMobileSession&data={"imei":"60893ZTE-CN13cd","appkey":"android_client","content":"Z0JiRA0qPFtWM3BYVltmcx5MWF9ZS0YLdW1ydXoqPycuJS8idXdlY3R0TGBtU 2'),
data.table(N_ID=175931L, VISIT_DATE="2013-03-08T23:40:30", REQ_URL='http://aaa.com/rest/api2.do?api=getSetMobileSession&data={"imei":"60893ZTE-CN13cd","appkey":"android_client","content":"Z0JiRA0qPFtWM3BYVltmcx5MWF9ZS0YLdW1ydXoqPycuJS8idXdlY3R0TGBtU', REQType=2L)
)
test(1216.1, identical('A,B,C\n1.2,Foo"Bar,"a"b\"c"d"\nfo"o,bar,"b,az""\n', # \ before "c checked to be superfluous, an aside in #2755
'A,B,C\n1.2,Foo"Bar,"a"b"c"d"\nfo"o,bar,"b,az""\n'))
test(1216.2, fread('A,B,C\n1.2,Foo"Bar,"a"b"c"d"\nfo"o,bar,"b,az""\n'),
data.table(A = c("1.2", "fo\"o"), B = c("Foo\"Bar", "bar"), C = c("a\"b\"c\"d", "b,az\"")),
warning=w<-"resolved improper quoting")
test(1216.3, fread('A,B,C\n1.2,Foo"Bar,"a"b\\"c"d"\nfo"o,bar,"b,az""\n'),
data.table(A = c("1.2", "fo\"o"), B = c("Foo\"Bar", "bar"), C = c("a\"b\\\"c\"d", "b,az\"")),
warning=w)
test(1216.4, fread('A,B,C\n1.2,Foo"Bar,"a"b"c"d""\nfo"o,bar,"b,az""\n'),
data.table(A = c("1.2", "fo\"o"), B = c("Foo\"Bar", "bar"), C = c("a\"b\"c\"d\"", "b,az\"")),
warning=w)
test(1216.5, fread('A,B,C\n1.2,Foo"Bar,"a"b"c"d""\nfo"o,bar,"b,"az""\n'),
data.table(A=c('1.2','fo"o'), B=c('Foo"Bar','bar'),C=c('a"b"c"d"','b,"az"')),
warning=w)
test(1217, fread('"One,Two","Three",Four\n12,3,4\n56,7,8\n'), # quoted column names including the separator
data.table("One,Two"=c(12L,56L),Three=c(3L,7L),Four=c(4L,8L)))
# joining from empty character, #45
DT = data.table(a=1:3, b=c("a","b","c"), key="b")
test(1218, DT[ DT[FALSE] ], data.table(a=integer(), b=character(), i.a=integer(), key="b"))
# set() multiple columns
DT = data.table(a=1:3,b=4:6,c=7:9)
newVals = data.table(10:12,13:15)
test(1219, set(DT,j=2:3,value=newVals), data.table(a=1:3,b=10:12,c=13:15))
newVals = list(16:18,19:21)
test(1220, set(DT,j=2:3,value=newVals), data.table(a=1:3,b=16:18,c=19:21))
# Test non-join key columns used in j work again (spotted straight away by Michele on datatable-help when v1.9.2 was released).
# Introduced at commit 1030. Very extensive new tests 1136* still all pass (great stuff Arun).
DT = data.table(a=1:2,b=letters[1:6],key="a,b")
test(1221, DT[.(1),b], c("a","c","e"))
###########################################################################################
# extensive testing of forderv with decreasing order of sorting (total of >700 tests so far, without NaN/NA)
###########################################################################################
# - Generate a random seed each time; the randomness allows catching errors quicker
# - But save the seed so that we can generate the same data back if any error occurs
seed = as.integer(Sys.time()) # sample(9999L, 1L) temporary fix, because all the set.seed(.) used above makes this sample() step deterministic (always seed=9107)
seedInfo = paste("forder decreasing argument test: seed = ", seed," ", sep="")
# no NaN (because it's hard to match with base::order); tested below in 1988.4-8
set.seed(seed)
foo <- function(n) apply(matrix(sample(letters, n*8L, TRUE), ncol=8L), 1, paste, collapse="") # collapse each row of 8 letters into one 8-character string
i1 = as.integer(sample(c(-100:100), 1e3, TRUE))
i2 = as.integer(sample(c(-100:100, -1e6, 1e6), 1e3, TRUE))
d1 = as.numeric(sample(c(-100:100,Inf,-Inf), 1e3, TRUE))
d2 = as.numeric(rnorm(1e3))
c1 = sample(c(letters), 1e3, TRUE)
c2 = sample(foo(200), 1e3, TRUE)
DT = data.table(i1, i2, d1, d2, c1, c2)
# randomise col order as well
colorder=sample(ncol(DT))
setcolorder(DT, names(DT)[colorder])
seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="")
ans = vector("list", length(names(DT)))
test_no = 1223.0
oldnfail = nfail
for (i in seq_along(names(DT))) {
cj = as.matrix(do.call(CJ, split(rep(c(1L,-1L), each=i), 1:i)))
ans[[i]] = combn(names(DT), i, function(x) {
tmp = apply(cj, 1, function(y) {
test_no <<- signif(test_no+.001, 7)
ll = as.call(c(as.name("order"),
lapply(seq_along(x), function(j) {
if (y[j] == 1L)
as.name(x[j])
else {
if (class(DT[[x[j]]]) =="character")
as.call(c(as.name("-"), as.call(list(as.name("xtfrm"), as.name(x[j])))))
else
as.call(list(as.name("-"), as.name(x[j])))
}
})
))
test(test_no, forderv(DT, by=x, order=y), with(DT, eval(ll)))
})
dim(tmp)=NULL
list(tmp)
})
}
ans = NULL
if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce
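# A minimal sketch of the forderv() interface exercised by the randomised loop above; 'demo_ord' is a hypothetical
# two-column table. order= takes 1L for ascending and -1L for descending per column, which the loop mirrors in base
# by negating the column (via -xtfrm() for character).
local({
  demo_ord = data.table(a=c(2L,1L,2L,1L), b=c("x","y","y","x"))
  o = forderv(demo_ord, by=c("a","b"), order=c(1L,-1L))             # a ascending, b descending
  stopifnot(identical(o, with(demo_ord, base::order(a, -xtfrm(b)))))
})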
# fix for bug #44 - unique on null data.table should return null data.table
test(1224, unique(data.table(NULL)), data.table(NULL))
# forderv should return 'integer(0)' when 'x' is not atomic and of 0 length (to be consistent with base::order)
test(1225.1, forderv(list()), integer(0))
test(1225.2, forderv(data.table(NULL)), integer(0))
# fix for bug #48 - data.table(null list, data.frame, data.table) should return null data.table
test(1226.1, data.table(list()), null.data.table())
test(1226.2, data.table(data.frame(NULL)), null.data.table())
test(1226.3, data.table(data.table(NULL)), null.data.table())
test(1226.4, data.table(data.frame()), null.data.table())
test(1226.5, data.table(data.table()), null.data.table())
# fix for bug #59 - POSIXlt issue.
DT1 = data.frame(id=1:3, d=strptime(c("06:02:36", "06:02:48", "07:03:12"), "%H:%M:%S"))
setDT(DT1)
test(1227, data.table(id=1:3, d=strptime(c("06:02:36", "06:02:48", "07:03:12"), "%H:%M:%S")), DT1, warning="POSIXlt column type detected and converted to")
# fix for bug #64 - retaining class of original data.table after passing through `[.data.table`
DT <- data.table(a=1:2,b=3:4)
setattr(DT, "class", c("newclass", class(DT)))
test(1228.1, class(DT), class(DT[a>1]))
test(1228.2, class(DT), class(DT[, list(b)]))
test(1228.3, class(DT), class(DT[, "b", with=FALSE]))
test(1228.4, class(DT), class(DT[, sum(b), by=a]))
test(1228.5, class(DT), class(DT[a>1, sum(b), by=a]))
test(1228.6, class(DT), class(DT[a>1, c:=sum(b), by=a]))
# test 1229 was non-ASCII, now in package DtNonAsciiTests
# Test that ad hoc by detects if ordered and dogroups switches to memcpy if contiguous, #1050
DT = data.table(a=1:3,b=1:6,key="a")
options(datatable.optimize=1) # turn off GForce, to test dogroups
test(1230, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups")
setkey(DT,NULL)
test(1231, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups")
test(1232, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups")
test(1233, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups")
test(1234, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized
setkey(DT,a)
test(1235, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups")
test(1236, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups")
test(1237, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups")
options(datatable.optimize=Inf)
# check that key is not preserved when length of fastorder is > 0
DT <- data.table(x=1:5, y=6:10, key="x")
test(1238.1, key(setorder(DT, x)), "x")
test(1238.2, key(setorder(DT, -x)), NULL)
# Fix for bug #54 - setkey fails when non-key columns are of type list.
DT <- data.table(x=5:1, y=as.list(1:5))
test(1239.1, setkey(DT, x), setattr(data.table(x=1:5, y=as.list(5:1)), 'sorted', 'x'))
DT <- data.table(x=5:1, y=as.list(1:5))
test(1239.2, setorder(DT, x), data.table(x=1:5, y=as.list(5:1)))
# Fix for bug #43 - order of as.data.table.table is different when doing as.data.table(with(DT, table(x,y)))
set.seed(123)
DT <- data.table(XX = sample(LETTERS[1:5], 1000, replace = TRUE), yy = sample(1:5, 1000, replace = TRUE))
ans1 <- as.data.table(DT[, table(XX, yy)])
ans2 <- as.data.table(table(DT$XX, DT$yy))
setnames(ans1, 'N', 'Freq')
setnames(ans2, names(ans1))
test(1240.1, ans1, setDT(as.data.frame(with(DT, table(XX, yy)), stringsAsFactors=FALSE)))
test(1240.2, ans2, ans1)
# R 3.3.0 started to use data.table's radix sort by default for order() on integer/factors.
# Therefore we check against the non-data.table method ('shell') for correctness (otherwise we'd be
# checking data.table code against itself) as well as checking data.table's ported code in R;
# i.e. a three-way match.
if (base::getRversion() < "3.3.0") {
base_order <- base::order
} else {
base_order <- function(..., na.last=TRUE, method=c("shell","radix")) {
ans1 = base::order(..., na.last=na.last, method="shell")
if (!is.na(na.last) || base::getRversion()>"3.3.3") {
ans2 = base::order(..., na.last=na.last, method="radix")
if (!identical(ans1,ans2)) stop("Base R's order(,method='shell') != order(,method='radix')")
} else {
# Only when na.last=NA in just R 3.3.0-3.3.3 we don't check shell==radix
# because there was a problem in base R's port of data.table code then when :
# 1) 2 or more vectors were passed to base::order(,method="radix")
# AND 2) na.last=NA
# AND 3) there is a subgroup of size exactly 2
# AND 4) one of those 2 items in the subgroup is NA and the other is not NA
# See tests 1728.3 and 1728.13.
}
ans1
}
}
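# A small usage note for the base_order() shim defined above (illustrative only, not a numbered test): it behaves
# like base::order() but on R >= 3.3.0 also cross-checks method="shell" against method="radix", giving an
# independent reference to compare forderv() against.
local({
  x = c(3L, NA, 1L, 2L)
  stopifnot(identical(base_order(x), base::order(x)),               # same answer as plain base::order
            identical(forderv(x, na.last=TRUE), base_order(x)))     # and agrees with data.table's forderv
})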
# Test for optimisation of 'order' to 'forder'. Copied to benchmarks.Rraw too.
set.seed(45L)
DT = data.table(x=sample(1e2, 1e5, TRUE), y=sample(1e2, 1e5, TRUE))
test(1241, DT[order(x,-y)], # optimized to forder()
DT[base_order(x,-y)]) # not optimized
DT = data.table(a=1:3, b=4:6)
myCol = "a"
test(1242.1, DT[2,myCol:=6L,with=FALSE], data.table(a=INT(1,6,3), b=4:6), warning="with=FALSE together with := was deprecated in v1.9.4 released Oct 2014. Please")
test(1242.2, DT[2,(myCol):=7L], data.table(a=INT(1,7,3), b=4:6))
# consistency of output type of mult, #340
DT = data.table(id=rep(1:2,each=2), var=rnorm(4), key="id")
test(1243, DT[.(1:2), list(var)][c(2,4)], DT[.(1:2), list(var), mult="last"])
test(1244, DT[.(1:2), var], DT$var)
test(1245, DT[.(1:2), var][c(2,4)], DT[.(1:2), var, mult="last"])
#############################################
# FR #347 - fromLast argument to duplicated
#############################################
seed = as.integer(Sys.time())
seedInfo = paste("duplicated fromLast test: seed = ", seed, " ", sep="")
set.seed(seed)
DT <- data.table(w=sample(-5:5, 100, TRUE),
x=as.numeric(sample(-5:5, 100, TRUE)),
y=sample(paste("id", 1:10, sep=""), 100, TRUE),
z=sample(c(TRUE, FALSE), 100, TRUE))
colorder=sample(ncol(DT))
setcolorder(DT, names(DT)[colorder])
seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="")
test_no = 1246.0
oldnfail = nfail
for (i in seq_along(names(DT))) {
cc = combn(names(DT), i)
apply(cc, 2L, function(jj) {
test_no <<- signif(test_no+.01, 7) # first without key
test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE))
test_no <<- signif(test_no+.01, 7)
setkeyv(DT, jj) # with key
test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE))
})
}
if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce
# with NA
DT <- data.table(w=sample(c(-5:5,NA_integer_), 100, TRUE),
x=as.numeric(sample(c(-5:5, NA), 100, TRUE)),
y=sample(c(NA, paste("id", 1:10, sep="")), 100, TRUE),
z=sample(c(NA, TRUE, FALSE), 100, TRUE))
colorder=sample(ncol(DT))
setcolorder(DT, names(DT)[colorder])
seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="")
oldnfail = nfail
for (i in seq_along(names(DT))) {
cc = combn(names(DT), i)
apply(cc, 2L, function(jj) {
test_no <<- signif(test_no+.01, 7) # first without key
test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE))
test_no <<- signif(test_no+.01, 7)
setkeyv(DT, jj) # with key
test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE))
})
}
if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce
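# A minimal sketch of the fromLast= behaviour being compared against duplicated.data.frame in the randomised
# loops above; 'demo_dupl' is a hypothetical table.
local({
  demo_dupl = data.table(id=c(1L,1L,2L), v=c("a","a","b"))
  stopifnot(identical(duplicated(demo_dupl),                c(FALSE, TRUE, FALSE)),   # first occurrence kept
            identical(duplicated(demo_dupl, fromLast=TRUE), c(TRUE, FALSE, FALSE)))   # last occurrence kept
})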
# FR #350 - anyDuplicated.data.table
set.seed(45L)
dt <- data.table(x=sample(3,10,TRUE), y=sample(letters[1:3], 10,TRUE))
test(1247.1, anyDuplicated(dt), anyDuplicated.data.frame(dt))
test(1247.2, anyDuplicated(dt, fromLast=TRUE), anyDuplicated.data.frame(dt, fromLast=TRUE))
test(1247.3, anyDuplicated(dt, by="y"), anyDuplicated.data.frame(dt[, "y", with=FALSE]))
test(1247.4, anyDuplicated(dt, by="y", fromLast=TRUE), anyDuplicated.data.frame(dt[, "y", with=FALSE], fromLast=TRUE))
# Fix for #39 - j-expression y * eval(parse(..)) should work without needing "("
DT <- data.table(x = seq(1,10,1), y = seq(2,20,2))
test(1248.1, DT[, y := y * eval(parse(text="1*2"))], data.table(x=seq(1,10,1), y=seq(4,40,4)))
# the fix in 1248.1 was not complete; it resurfaced as bug #30. Fixed now, tests added below:
DT <- data.table(id=1:5, var=letters[1:5])
ans <- copy(DT)
idPrefix <- "va" # if this variable were named 'id' then the paste(id) below would see the 'id' _column_.
test(1248.2, DT[, eval(parse(text=paste(idPrefix,"r",sep="")))], letters[1:5])
test(1248.3, DT[, id2:=eval(parse(text=paste(idPrefix,"r",sep="")))], ans[, id2 := var])
# test to make sure DT[order(...)] works fine when it's already sorted (forgot the case where forder returns integer(0) before)
DT <- data.table(x=rep(1:4, each=5), y=1:20)
test(1249.1, DT[order(x)], DT)
test(1249.2, DT[order(y)], DT)
test(1249.3, DT[order(x,y)], DT)
# Fix for #38 - duplicated 'by=FALSE' inconsistency
set.seed(1L)
DT <- data.table(x=sample(3,10,TRUE), y=sample(2,10,TRUE), key="x")
test(1250.1, duplicated(DT, by=NULL), duplicated.data.frame(DT))
test(1250.2, duplicated(DT, by=FALSE), error="argument specifying columns must be character or numeric")
test(1250.3, duplicated(DT, by=TRUE), error="argument specifying columns must be character or numeric")
# more tests for DT[order(...)] - now testing 'decreasing=FALSE/TRUE' argument
set.seed(1L)
DT <- data.table(x=sample(3,10,TRUE), y=sample(2,10,TRUE))
test(1251.01, DT[order(x,y,decreasing=TRUE)], DT[order(-x,-y)])
test(1251.02, DT[order(x,-y,decreasing=TRUE)], DT[order(-x,y)])
# test complex calls; see the note in setkey.R under 'forder' for differences between forder and order for 'list' inputs (base is inconsistent here).
ix = with(DT, order(x+y))
test(1251.03, DT[order(x+y)], DT[ix])
ix = with(DT, order(-x-y))
test(1251.04, DT[order(-x-y)], DT[ix])
ix = with(DT, order(x+y, decreasing=TRUE))
test(1251.05, DT[order(x+y, decreasing=TRUE)], DT[ix])
ix = with(DT, order(4*x-5*y, decreasing=TRUE))
test(1251.06, DT[order(4*x-5*y, decreasing=TRUE)], DT[ix])
ix = with(DT, order(1-DT$x, decreasing=TRUE))
test(1251.07, DT[order(1-DT$x, decreasing=TRUE)], DT[ix])
test(1251.08, DT[order(x, list(-y), decreasing=TRUE)],
error = "Column 2 is length 1 which differs from length of column 1.*10")
test(1251.09, DT[base::order(x, list(-y), decreasing=TRUE)],
error = "argument lengths differ") # data.table's error is more helpful than base's
# more "edge cases" to ensure we're consistent with base
test(1251.10, DT[order("a")], DT[1L])
test(1251.11, DT[order("b", "a")], DT[1L])
test(1251.12, DT[order(list("b", "a"))], DT[1L])
test(1251.13, DT[order(list("b"), list("a"))], error="The first item passed to [f]order is a plain list but there are more items. It should be a data.table or data.frame.")
##############################################################
# extensive tests for order optimisation within `[.data.table`
##############################################################
seed = as.integer(Sys.time())
# This choice of seed by Arun was very good as it revealed problems that a fixed seed would not.
# Test 1844 is now added to consistently run the rare cases discovered here (depending on the seed) to cover all lines in
# forder consistently to save pull requests failing coverage tests randomly, issue #2346
seedInfo = paste("forder decreasing argument test: seed = ", seed," ", sep="")
set.seed(seed)
# these variables try to simulate groups of length 1, 2, < 200, > 200 so as to cover all the different internal implementations
foo <- function(n) apply(matrix(sample(letters, n*8L, TRUE), ncol=8L), 1, paste, collapse="") # collapse each row of 8 letters into one 8-character string
i1 = as.integer(sample(rep(c(-3:3, NA_integer_), c(1, 2, 190, 300, 7, 190, 210, 100))))
i2 = as.integer(sample(rep(c(-2:2, -1e6, 1e6, NA_integer_), c(1, 2, 190, 300, 7, 190, 210, 100))))
d1 = as.numeric(sample(rep(c(-2:2,Inf,-Inf, NA_real_, 5, -1e3), c(1, 190, 2, 300, 7, 50, 50, 100, 150, 150))))
c1 = sample(rep(c(letters[1:5], NA_character_, "z"), c(1, 2, 190, 7, 300, 200, 300)))
c2 = sample(c(foo(200), NA_character_), 1e3, TRUE)
DT = data.table(i1, i2, d1, c1, c2)
# randomise col order as well
colorder=sample(ncol(DT))
setcolorder(DT, names(DT)[colorder])
seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="")
ans = vector("list", length(names(DT)))
test_no = 1252
oldnfail = nfail
for (i in seq_along(names(DT))) {
cj = as.matrix(do.call(CJ, split(rep(c(1L,-1L), each=i), 1:i)))
ans[[i]] = combn(names(DT), i, function(x) {
tmp = apply(cj, 1, function(y) {
test_no <<- signif(test_no+.001, 7)
ll = as.call(c(as.name("base_order"),
lapply(seq_along(x), function(j) {
if (y[j] == 1L)
as.name(x[j])
else {
if (class(DT[[x[j]]]) =="character")
as.call(c(as.name("-"), as.call(list(as.name("xtfrm"), as.name(x[j])))))
else
as.call(list(as.name("-"), as.name(x[j])))
}
})
))
ans1 = forderv(DT, by=x, order=y, na.last=TRUE) # adding tests for both nalast=TRUE and nalast=NA
test(test_no, ans1, with(DT, eval(ll)))
test_no <<- signif(test_no+.001, 7)
ll <- as.call(c(as.list(ll), na.last=NA))
ans1 = forderv(DT, by=x, order=y, na.last=NA) # nalast=NA here.
test(test_no, ans1[ans1 != 0], with(DT, eval(ll)))
})
dim(tmp)=NULL
list(tmp)
})
}
ans = NULL
if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce
###############
old_rounding = getNumericRounding()
# turning off tolerance for UPCs (> 11 s.f. stored in numeric), #342
DT <- data.table(upc = c(301426027592, 301426027593, 314775802939, 314775802940, 314775803490, 314775803491, 314775815510, 314775815511, 314933000171, 314933000172),
year = 2006:2007)
setNumericRounding(2L)
test(1253, DT[,.N,by=upc]$N, rep.int(2L,5L))
setNumericRounding(0)
test(1254, DT[,.N,by=upc], data.table(upc=DT$upc, N=1L))
test(1255, unique(DT, by="upc"), DT)
setNumericRounding(2)
test(1256, DT[,.N,by=upc]$N, rep.int(2L,5L))
DT = data.table(upc=rep(c(360734147771, 360734147770), each=3), year=rep(2009:2011, times=2))
setNumericRounding(0)
test(1257, DT[,.N,by=upc], data.table(upc=c(360734147771, 360734147770), N=3L))
test(1258, DT[,.N,by=upc][order(upc)], data.table(upc=c(360734147770, 360734147771), N=3L))
setNumericRounding(1)
test(1259, DT[,.N,by=upc], data.table(upc=c(360734147771, 360734147770), N=3L))
test(1260, DT[,.N,by=upc][order(upc)], data.table(upc=c(360734147770, 360734147771), N=3L))
test(1261, getNumericRounding(), 1L)
# the limit of double precision (16 s.f.) ...
if (test_longdouble) {
test(1262, length(unique(c(1.2345678901234560, 1.2345678901234561, 1.2345678901234562, 1.2345678901234563))), 2L)
# 2 (not 4) is the double precision limit, which base::unique() relies on in this test
# under valgrind this can return 3 instead of 2 due to floating point precision limitations.
# changing the last two values to 1.2345678901234563 and 1.2345678901234564 returns 2.
}
DT = data.table(id=c(1.234567890123450, 1.234567890123451, 1.234567890123452, 1.234567890123453)) # one less digit is limit
test(1263, length(unique(DT$id)), 4L)
test(1264, DT[,.N,by=id]$N, 4L) # 1 byte rounding isn't enough
setNumericRounding(0)
test(1265, DT[,.N,by=id]$N, INT(1,1,1,1))
test(1266, getNumericRounding(), 0L)
setNumericRounding(old_rounding)
# fread reading NA in logical columns, #567
DF = data.frame(I=1:3, L=c(TRUE,FALSE,NA), R=3.14)
write.csv(DF,f<-tempfile(),row.names=FALSE)
test(1267.1, fread(f)$L, c(TRUE, FALSE, NA))
test(1267.2, fread(f), as.data.table(read.csv(f)))
unlink(f)
### FR #2722 tests begin here ###
#################################
# FR #2722 optimise j=c(lapply(.SD,sum, ...)) - any number of such lapply(.SD, ...) calls can occur, in any order
set.seed(45L)
dt = data.table(a=sample(2,10,TRUE), b=sample(3,10,TRUE), c=sample(4,10,TRUE), d=sample(5,10,TRUE))
dt2 = data.table(x=c(1,1,1,2,2,2), y=1:6)
options(datatable.optimize=0L)
# auto-naming behavior is different for no-optimization case; just check optimization is off
test(1268.01, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = 'All optimizations are turned off')
test(1268.02, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = 'All optimizations are turned off')
test(1268.03, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off")
test(1268.04, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off")
test(1268.05, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off")
# newly added tests for #861 -- optimise, but no GForce
test(1268.06, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off")
# don't optimise .I in c(...)
test(1268.07, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="All optimizations are turned off")
options(datatable.optimize=1L)
test(1268.08, ans1 <- dt[ , c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output="Old mean optimization.*(GForce FALSE)")
test(1268.09, ans2 <- dt[, c(lapply(.SD, mean), .N), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE")
test(1268.10, ans3 <- dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output = 'Old mean optimization.*GForce FALSE')
test(1268.11, ans4 <- dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE")
test(1268.12, ans5 <- dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE")
test(1268.13, ans6 <- dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE")
test(1268.14, ans7 <- dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="Old mean optimization.*GForce FALSE")
options(datatable.optimize=Inf)
test(1268.15, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], ans1,
output="GForce optimized j to 'list(gmean(b), gmean(c), gmean(d), gsum(b), gsum(c), gsum(d))'")
test(1268.16, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], ans2,
output = "lapply optimization changed j from 'c(lapply(.SD, mean), .N)' to 'list(mean(b), mean(c), mean(d), .N)'")
test(1268.17, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], ans3,
output = "lapply optimization changed j from 'c(list(c), lapply(.SD, mean))' to 'list(c, mean(b), mean(c), mean(d))")
test(1268.18, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], ans4,
output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'")
test(1268.19, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], ans5,
output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'")
test(1268.20, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], ans6,
output = "lapply optimization changed j from 'c(list(sum(d), .I), lapply(.SD, mean))' to 'list(sum(d), .I, mean(b), mean(c), mean(d))'")
test(1268.21, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], ans7,
output = "lapply optimization is on, j unchanged as 'c(.I, lapply(.SD, mean))'")
test(1268.22, dt[, c(as.list(c), lapply(.SD, mean)), by=a],
error = "j doesn't evaluate to the same number of columns for each group")
### FR #2722 tests end here ###
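# A minimal standalone sketch (outside the numbered tests) of the optimisation exercised in 1268.*: under full
# optimisation, j=c(lapply(.SD, sum), .N) is rewritten so each .SD column is aggregated separately (GForce where
# applicable) and the .SD column names are kept. 'demo_sd' is a hypothetical table.
local({
  demo_sd = data.table(g=c(1L,1L,2L), b=1:3, c=4:6)
  old = options(datatable.optimize=Inf)          # ensure full optimization for the sketch
  ans = demo_sd[, c(lapply(.SD, sum), .N), by=g]
  options(old)
  stopifnot(identical(ans, data.table(g=1:2, b=c(3L,3L), c=c(9L,6L), N=c(2L,1L))))
})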
# Wide range numeric and integer64, to test all bits
old_rounding = getNumericRounding()
x = sample( c(seq(-1e100, 1e100, length.out=1e5), c(seq(-1e-100,1e-100,length.out=1e5))) )
setNumericRounding(0)
test(1269, forderv(x), base::order(x))
setNumericRounding(2) # not affected by rounding
test(1270, forderv(x), base::order(x))
if (test_bit64) {
x = as.integer64(2)^(0:62)
x = sample(c(x,-x,0))
if (!inherits(try(bit64::order(x),silent=TRUE), "try-error")) # if for old version of bit64
test(1271, forderv(x), bit64::order(x))
DT = data.table( a=as.integer64(2)^45 + 1:3, b=1:6 )
test(1272, DT[,sum(b),by=a], data.table(a=DT$a[1:3], V1=INT(5,7,9)))
test(1273, unique(DT, by="a"), DT[1:3])
test(1274, duplicated(DT, by="a"), rep(c(FALSE,TRUE),each=3))
setkey(DT,a)
test(1275, DT[.(as.integer64(35184372088834))], DT[3:4])
test(1276, unique(DT, by=key(DT)), DT[c(1,3,5)])
test(1277, duplicated(DT, by=key(DT)), rep(c(FALSE,TRUE),3))
}
setNumericRounding(old_rounding)
# distinguishing small numbers from 0.0 as from v1.9.2, test from Rick
# http://stackoverflow.com/questions/22290544/grouping-very-small-numbers-e-g-1e-28-and-0-0-in-data-table-v1-8-10-vs-v1-9-2
old_rounding = getNumericRounding()
test_no = 1278.001
for (dround in c(0,2)) {
setNumericRounding(dround) # rounding should not affect the result here because, although small, each value is exact at 1 s.f.
for (i in c(-30:-1,1:30)) {
DT = data.table(c(1 * (10^i),2,9999,-1,0,1))
test(test_no, nrow(DT[, .N, by=V1]), 6L)
test_no = test_no + 0.001
}
}
setNumericRounding(old_rounding)
# rounding of milliseconds, workaround, TO DO: #485
# http://stackoverflow.com/questions/22356957/rounding-milliseconds-of-posixct-in-data-table-v1-9-2-ok-in-1-8-10
old_rounding = getNumericRounding()
DT = data.table(timestamp=as.POSIXct(
c("2013-01-01 17:51:00.707",
"2013-01-01 17:51:59.996",
"2013-01-01 17:52:00.059",
"2013-01-01 17:54:23.901",
"2013-01-01 17:54:23.913",
"2013-01-01 17:54:23.914")))
setNumericRounding(2)
test(1279, duplicated(DT), rep(c(FALSE,TRUE), c(4,2)))
setNumericRounding(1)
test(1280, duplicated(DT), rep(FALSE, 6))
setNumericRounding(old_rounding)
# FR #339, keep.rownames argument for setDT, just for data.frames:
DF <- data.frame(x=1:5, y=10:6)
rownames(DF) <- letters[1:5]
test(1281, setDT(DF, keep.rownames=TRUE), data.table(rn=letters[1:5], x=1:5, y=10:6))
# Bug #42 fix - BY doesn't retain names:
DT <- data.table(fruit=c("apple","peach","pear"))
test(1282, DT[, ans := .BY$fruit, by=fruit], data.table(fruit=DT$fruit, ans=DT$fruit))
# bug #34 - get() doesn't see i's columns, when i is a data.table:
set.seed(1L)
dt1 <- data.table(a=rep(1:2, each=2), c=sample(10,4))
dt2 <- data.table(b=rep(2:3), c=sample(20,2), d=sample(20,2))
setkey(dt1, a)
setkey(dt2, b)
# without by
test(1283.1, dt1[dt2, list(a=a, c=get('c'), i.c=get('i.c'))], dt1[dt2, list(a=a, c=c, i.c=i.c)])
test(1283.2, dt1[dt2, list(a=a, d=get('d'))], dt1[dt2, list(a=a, d=d)])
# with by
test(1283.3, dt1[dt2, list(a=a, c=get('c'), i.c=get('i.c')), by=.EACHI], dt1[dt2, list(a=a, c=c, i.c=i.c), by=.EACHI])
test(1283.4, dt1[dt2, list(a=a, d=get('d')), by=.EACHI], dt1[dt2, list(a=a, d=d), by=.EACHI])
# fix for bug #27 - missed cases like dt[order(abs(x))].
dt <- data.table(x=c(1L,-2L,3L))
test(1284.1, dt[order(abs(x))], dt)
test(1284.2, dt[order(-abs(x))], dt[3:1])
# fix for bug #28 - unique/duplicated on empty data.table returned NA
dt <- data.table(x=numeric(0), y=character(0), key="x")
test(1285.1, duplicated(dt, by=key(dt)), duplicated.data.frame(dt))
test(1285.2, unique(dt, by=key(dt)), dt)
# BUG #24 fix
a <- data.table(BOD, key="Time")
b <- data.table(BOD, key="Time")[Time < 0] # zero row data.table
ans <- merge(b, a, all=TRUE)
test(1287, ans, data.table(Time=a$Time, demand.x=NA_real_, demand.y=a$demand, key="Time"))
# more rbindlist tests - duplicate columns with "fill=TRUE"
ll <- list(data.table(x=1, y=-1, x=-2), data.table(y=10, y=20, y=30, x=-10, a="a", b=Inf, c=factor(1)))
test(1288.01, rbindlist(ll, use.names=TRUE, fill=FALSE), error = "Item 2 has 7 columns, inconsistent with item 1 which has 3 columns")
# modified after fixing #725
test(1288.02, rbindlist(ll, use.names=TRUE, fill=TRUE), # dups were grouped before 1.12.2; now order of dups is retained; #3455
data.table(x=c(1,-10), y=c(-1,10), x=c(-2, NA), y=c(NA,20), y=c(NA,30), a=c(NA, "a"), b=c(NA, Inf), c=factor(c(NA, 1))))
# check that output names are consistent when binding two empty dts, and when binding an empty dt with a non-empty one
dt1 <- data.table(x=1:5, y=6:10)
dt2 <- dt1[x > 5]
setnames(dt3 <- copy(dt2), c("A", "B"))
test(1288.03, names(rbindlist(list(dt2,dt3), use.names=FALSE)), c("x", "y")) # use.names=FALSE to avoid new warning in v1.12.2; PR#3455
test(1288.04, names(rbindlist(list(dt3,dt2), use.names=FALSE)), c("A", "B"))
test(1288.05, names(rbindlist(list(dt1,dt3), use.names=FALSE)), c("x", "y"))
test(1288.06, names(rbindlist(list(dt3,dt1), use.names=FALSE)), c("A", "B"))
# check fix for bug #26
DT <- data.table(x=c(1,2,3))
test(1288.07, rbind(DT, DT, data.table()), rbind(DT, data.table(), DT))
# factor on fill=TRUE with NA column..
DT1 = data.table(A=1:3,B=letters[1:3])
DT2 = data.table(B=letters[4:5],C=factor(1:2))
l = list(DT1,DT2)
test(1288.08, rbindlist(l, use.names=TRUE, fill=TRUE), data.table(A=c(1:3,NA_integer_,NA_integer_), B=letters[1:5], C=factor(c(NA,NA,NA,1,2))))
# adding more tests after modifying for better backwards compatibility:
# rbindlist and rbind both work even when some elements of the list are not named at all, as long as fill=FALSE; use.names=TRUE errors when no item of the input list has any names.
# fill=TRUE implies use.names=TRUE, so at least one item of the list must have names.
ll <- list(list(1:3, 4:6), list(5:7, 8:10))
test(1288.09, rbindlist(ll), data.table(V1=c(1:3, 5:7), V2=c(4:6, 8:10)))
test(1288.10, rbindlist(ll, use.names=TRUE), error="use.names=TRUE but no item of input list has any names")
ll <- list(list(a=1:3, b=4:6), list(5:7, 8:10))
test(1288.11, rbindlist(ll, use.names=TRUE), data.table(a=c(1:3, 5:7), b=c(4:6, 8:10)))
ll <- list(list(1:3, 4:6), list(a=5:7, b=8:10))
test(1288.12, rbindlist(ll, use.names=TRUE), data.table(a=c(1:3, 5:7), b=c(4:6, 8:10)))
ll <- list(list(a=1:3, 4:6), list(5:7, b=8:10))
test(1288.13, rbindlist(ll, use.names=TRUE), error="Column 2 ['b'] of item 2 is missing in item 1. Use fill=TRUE to fill with NA")
ll <- list(list(a=1:3, 4:6), list(5:7, b=8:10))
test(1288.14, rbindlist(ll, fill=TRUE), data.table(a=c(1:3, rep(NA_integer_,3L)), V1=c(4:6,5:7), b=c(rep(NA_integer_, 3L), 8:10)))
ll <- list(list(1:3, 4:6), list(5:7, 8:10))
test(1288.15, rbindlist(ll, fill=TRUE), error="use.names=TRUE but no item of input list has any names")
ll <- list(list(1:3, 6:8), list(a=4:5, b=9:10))
test(1288.16, rbindlist(ll), data.table(a=1:5, b=6:10))
test(1288.17, rbindlist(ll, fill=TRUE), data.table(a=1:5, b=6:10))
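# A small standalone sketch of the use.names/fill rules described above; 'demo_a' and 'demo_b' are hypothetical
# list items. use.names=TRUE matches columns by name (not position) and fill=TRUE pads missing columns with NA.
local({
  demo_a = list(x=1:2, y=3:4)
  demo_b = list(y=7:8, x=5:6, z=c("a","b"))
  ans = rbindlist(list(demo_a, demo_b), use.names=TRUE, fill=TRUE)
  stopifnot(identical(ans, data.table(x=c(1:2,5:6), y=c(3:4,7:8), z=c(NA,NA,"a","b"))))
})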
# fix for #25
dt = data.table(x=1L, y=1:10)
test(1289.1, dt[, z := c(rep(NA,5), y), by=x], error="Supplied 15 items to be assigned to group 1 of size 10 in column 'z'")
test(1289.2, names(dt), c("x","y"))
dt = data.table(x=1:2, y=1:10)
test(1289.3, dt[, z := c(rep(NA,5), y), by=x], error="Supplied 10 items to be assigned to group 1 of size 5 in column 'z'")
test(1289.4, names(dt), c("x","y"))
########################################
# Extensive testing for "duplicate" names
########################################
# Rules: if column positions are given directly in 'j', exactly those columns are operated on. If column names are given and a name occurs more than
# once, then when removing, all occurrences are removed, and when selecting, only the first occurrence is kept. (A small standalone illustration follows the case tests below.)
# 1) when i,j,by are all absent (or) just 'i' is present then ALL duplicate columns are returned.
# 2) When 'with=FALSE' and 'j' is a character and 'notj' is TRUE, all instances of the column to be removed will be removed.
# 3) When 'with=FALSE' and 'j' is a character and 'notj' is FALSE, only the first column will be recognised in presence of duplicate columns.
# 4) When 'with=FALSE' and 'j' is numeric and 'notj' is TRUE, just those indices will be removed.
# 5) When 'with=FALSE' and 'j' is numeric and 'notj' is FALSE, all columns for indices given, if valid, are returned. (FIXES #22)
# 6) When .SD is in 'j', but '.SDcols' is not present, ALL columns are subset'd - FIXES BUG #86.
# 7) When .SD and .SDcols are present and .SDcols is numeric, columns corresponding to the given indices are returned.
# 8) When .SD and .SDcols are present and .SDcols is character, duplicate column names will only return the first column, each time.
# 9) When .SD and .SDcols are present and .SDcols is numeric, and it's -SDcols, then just those columns are removed.
# 10) When .SD and .SDcols are present and .SDcols is character and -SDcols, then all occurrences of that column are removed.
# 11) When no .SD and no .SDcols and no with=FALSE, duplicate column names return only the first column each time.
# 12) With 'get("col")', it's the same as with all character types.
# 13) A logical expression in 'j'.
# 14) Finally, no tests, but using 'by' with duplicate columns and aggregating may not return the intended result, since it may operate on column names in some cases.
# All points are tested with this example:
DT <- data.table(x=1:2, y=3:4, x=5:6, x=7:8, y=9:10, z=11:12)
DT1 <- data.table(x=1L, y=3L, x=5L, x=7L, y=9L, z=11L)
DT2 <- data.table(x=2L, y=4L, x=6L, x=8L, y=10L, z=12L)
ll <- list(x=1:2, y=3:4, x=5:6, x=7:8, y=9:10, z=11:12)
# case (1)
test(1290.01, DT[1], DT1)
test(1290.02, DT[], DT)
test(1290.03, DT[(TRUE)], DT)
# case (2)
test(1290.04, DT[, !"x", with=FALSE], as.data.table(ll[c(2,5,6)]))
test(1290.05, DT[, !"y", with=FALSE], as.data.table(ll[c(1,3,4,6)]))
test(1290.06, DT[, !c("x", "x"), with=FALSE], as.data.table(ll[c(2,5,6)]))
test(1290.07, DT[, !c("y", "y"), with=FALSE], as.data.table(ll[c(1,3,4,6)]))
# case (3)
test(1290.09, DT[, "x", with=FALSE], as.data.table(ll[1]))
test(1290.10, DT[, "y", with=FALSE], as.data.table(ll[2]))
test(1290.11, DT[, c("x", "x"), with=FALSE], as.data.table(ll[c(1,1)]))
test(1290.12, DT[, c("y", "y"), with=FALSE], as.data.table(ll[c(2,2)]))
# case (4)
test(1290.13, DT[, !3, with=FALSE], as.data.table(ll[c(1,2,4,5,6)]))
test(1290.14, DT[, !c(1,1,3,4), with=FALSE], as.data.table(ll[c(2,5,6)]))
test(1290.15, DT[, !2, with=FALSE], as.data.table(ll[c(1,3,4,5,6)]))
test(1290.16, DT[, !c(2,5,2), with=FALSE], as.data.table(ll[c(1,3,4,6)]))
# case (5)
test(1290.17, DT[, 3, with=FALSE], as.data.table(ll[3]))
test(1290.18, DT[, c(1,1,3,4), with=FALSE], as.data.table(ll[c(1,1,3,4)]))
test(1290.19, DT[, 2, with=FALSE], as.data.table(ll[2]))
test(1290.20, DT[, c(2,5,2), with=FALSE], as.data.table(ll[c(2,5,2)]))
# case (6)
test(1290.21, DT[, .SD], as.data.table(ll))
test(1290.22, DT[, .SD[1]], DT[1])
test(1290.23, DT[, .SD[1, !3, with=FALSE]], as.data.table(DT[1, !3, with=FALSE]))
# case (7)
test(1290.24, DT[, .SD, .SDcols=c(1,1,3,4)], as.data.table(ll[c(1,1,3,4)]))
# case (8)
test(1290.25, DT[, .SD, .SDcols=c("x", "x", "y")], as.data.table(ll[c(1,1,2)]))
# case (9)
test(1290.26, DT[, .SD, .SDcols=-c(1,2)], as.data.table(ll[c(-(1:2))]))
# case (10)
test(1290.27, DT[, .SD, .SDcols=-c("x")], as.data.table(ll[c(2,6)]))
# case (11)
test(1290.28, DT[, x], ll[[1]])
test(1290.29, DT[, list(x,x,y,y,y)], as.data.table(ll[c(1,1,2,2,2)]))
test(1290.30, DT[, list(x,x,y)], as.data.table(ll[c(1,1,2)]))
# case (12)
test(1290.31, DT[, get("x")], ll[[1]])
test(1290.32, DT[, list(get("x"))], setnames(as.data.table(ll[1]), "V1"))
test(1290.33, DT[, list(get("x"), get("y"))], setnames(as.data.table(ll[1:2]), c("V1", "V2")))
# case (13)
test(1290.34, DT[, names(DT) == "x", with=FALSE], as.data.table(ll[c(1,3,4)]))
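# A minimal standalone illustration of rules (2), (3) and (5) above; 'demo_dup' is a hypothetical table with a
# duplicated column name.
local({
  demo_dup = data.table(x=1:2, y=3:4, x=5:6)                               # two columns named 'x'
  stopifnot(identical(names(demo_dup[, "x",  with=FALSE]), "x"),           # by name: only the first 'x' (rule 3)
            identical(names(demo_dup[, !"x", with=FALSE]), "y"),           # negated name: all 'x' columns dropped (rule 2)
            identical(names(demo_dup[, c(1,3), with=FALSE]), c("x","x")))  # by position: exactly those columns (rule 5)
})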
# Bug #49: DT[, bla := character(0), by=.] didn't add a new column when DT is an empty DT.
dt1 = data.table(a=character(0),b=numeric(0))
ans1 = data.table(a=character(0), b=numeric(0), c=numeric(0))
ans2 = data.table(a=character(0), b=numeric(0), c=numeric(0), d=integer(0))
test(1291.1, dt1[, c:=max(b), by='a'], ans1, warning="no non-missing arguments to max")
test(1291.2, dt1[, d := integer(0), by=a], ans2)
# Bug #21
test(1292.1, data.table(x=1:2, y=3:4)[, -(1:2), with=FALSE], null.data.table())
test(1292.2, data.table(x=1:2)[, -1, with=FALSE], null.data.table())
test(1292.3, data.table(x=1:2, y=3:4)[, !c("x","y"), with=FALSE], null.data.table())
test(1292.4, data.table(x=1:2)[, !c("x"), with=FALSE], null.data.table())
# Bug #37 - print.data.table and digits option:
DT <- structure(list(fisyr = 1995:1996, er = list(c(1, 3), c(1, 3)),
eg = c(0.0197315833926059, 0.0197315833926059), esal = list(
c(2329.89763779528, 2423.6811023622), c(2263.07456978967,
2354.16826003824)), fr = list(c(4, 4), c(4, 4)), fg =
c(0.039310363070415,
0.039310363070415), fsal = list(c(2520.85433070866, 2520.85433070866
), c(2448.55449330784, 2448.55449330784)), mr = list(c(5,
30), c(5, 30)), mg = c(0.0197779376457164, 0.0197779376457164
), msal = list(c(2571.70078740157, 4215.73622047244),
c(2497.94263862333,
4094.82600382409))), .Names = c("fisyr", "er", "eg", "esal",
"fr", "fg", "fsal", "mr", "mg", "msal"), class = c("data.table",
"data.frame"), row.names = c(NA, -2L))
ans1 = capture.output(print(DT, digits=4, row.names=FALSE))
ans2 = c(" fisyr er eg esal fr fg fsal mr mg msal",
" 1995 1,3 0.01973 2330,2424 4,4 0.03931 2521,2521 5,30 0.01978 2572,4216",
" 1996 1,3 0.01973 2263,2354 4,4 0.03931 2449,2449 5,30 0.01978 2498,4095")
test(1293, ans1, ans2)
## Fixes bug #35
## Also improves upon bug fix #2551 to provide better warnings and at better places:
dt <- data.table(a=1:3, b=c(7,8,9), c=c(TRUE, NA, FALSE), d=as.list(4:6), e=c("a", "b", "c"))
test(1294.01, dt[, a := 1]$a, rep(1L, 3L))
test(1294.02, dt[, a := 1.5]$a, rep(1L, 3L),
warning="1.5.*double.*position 1 truncated.*integer.*column 1 named 'a'")
test(1294.03, dt[, a := NA]$a, rep(NA_integer_, 3L))
test(1294.04, dt[, a := "a"]$a, rep(NA_integer_, 3L),
warning=c("Coercing 'character' RHS to 'integer'.*column 1 named 'a'",
"NAs introduced by coercion"))
test(1294.05, dt[, a := list(list(1))]$a, rep(1L, 3L),
warning="Coercing 'list' RHS to 'integer' to match.*column 1 named 'a'")
test(1294.06, dt[, a := list(1L)]$a, rep(1L, 3L))
test(1294.07, dt[, a := list(1)]$a, rep(1L, 3L))
test(1294.08, dt[, a := TRUE]$a, rep(1L, 3L))
test(1294.09, dt[, b := 1L]$b, rep(1,3))
test(1294.10, dt[, b := NA]$b, rep(NA_real_,3))
test(1294.11, dt[, b := "bla"]$b, rep(NA_real_, 3),
warning=c("Coercing 'character' RHS to 'double' to match.*column 2 named 'b'",
"NAs introduced by coercion"))
test(1294.12, dt[, b := list(list(1))]$b, rep(1,3),
warning="Coercing 'list' RHS to 'double' to match.*column 2 named 'b'")
test(1294.13, dt[, b := TRUE]$b, rep(1,3))
test(1294.14, dt[, b := list(1)]$b, rep(1,3))
test(1294.15, dt[, c := 1]$c, rep(TRUE, 3))
test(1294.16, dt[, c := 1L]$c, rep(TRUE, 3))
test(1294.17, dt[, c := NA]$c, rep(NA, 3))
test(1294.18, dt[, c := list(1)]$c, rep(TRUE, 3))
test(1294.19, dt[, c := list(list(1))]$c, rep(TRUE, 3),
warning="Coercing 'list' RHS to 'logical' to match.*column 3 named 'c'")
test(1294.20, dt[, c := "bla"]$c, rep(NA, 3),
warning="Coercing 'character' RHS to 'logical'")
test(1294.21, dt[, d := 1]$d, rep(list(1), 3))
test(1294.22, dt[, d := 1L]$d, rep(list(1L), 3))
test(1294.23, dt[, d := TRUE]$d, rep(list(TRUE), 3))
test(1294.24, dt[, d := "bla"]$d, rep(list("bla"), 3))
test(1294.25, dt[, d := list(list(1))]$d, rep(list(1), 3))
test(1294.26, dt[, e := 1]$e, rep("1", 3))
test(1294.27, dt[, e := 1L]$e, rep("1", 3))
test(1294.28, dt[, e := TRUE]$e, rep("TRUE", 3))
test(1294.29, dt[, e := list(list(1))]$e, rep("1", 3), # e.g. revdep NNS does this; PR #3925
warning="Coercing 'list' RHS to 'character' to match.*column 5 named 'e'")
test(1294.30, dt[, e := "bla"]$e, rep("bla", 3))
test(1294.31, dt[, e := list("bla2")]$e, rep("bla2", 3))
if (test_bit64) {
dt[, f:=as.integer64(10:12)]
test(1294.50, dt[, f:=1]$f, as.integer64(rep(1,3)))
test(1294.51, dt[, f:=NA]$f, as.integer64(rep(NA,3)))
test(1294.52, dt[, f:=list(list(1))]$f,
error="Cannot coerce 'list' RHS to 'integer64' to match.*column 6 named 'f'")
}
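# A minimal sketch of the := type-coercion behaviour covered by 1294.*; 'demo_coerce' is a hypothetical table.
# A compatible RHS is coerced to the existing column type (lossy coercions warn, as asserted above); here a
# logical RHS is silently coerced to integer.
local({
  demo_coerce = data.table(i=1:3)
  demo_coerce[, i := TRUE]
  stopifnot(identical(demo_coerce$i, rep(1L, 3L)))
})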
# FR #343, when LHS evaluates to integer(0), provide warning and return dt, not an error.
dt = data.table(a = 1:5, b1 = 1:5, b2 = 1:5)
test(1295, dt[, grep("c", names(dt)) := NULL], dt, warning="length(LHS)==0; no columns to delete or assign RHS to")
# Updating logical column in one-row DT (corruption of new R 3.1 internal globals for TRUE, FALSE and NA)
DT = data.table(a=1:6, b=c(TRUE,FALSE))
test(1296, DT[,list(b,sum(b)),by=a], data.table(a=1:6, b=c(TRUE,FALSE), V2=c(1L,0L))) # was error "the ... list does not contain 2 elements"
DT = DT[1L]
set(DT,1L,"b",FALSE) # passing 1L as i here is needed to avoid column plonk, so changes the logical singleton in place
test(1297, as.integer(TRUE[1]), 1L) # In R 3.1, TRUE[1] returns the global TRUE but TRUE doesn't yet (parses as new vector)
test(1298, as.integer(TRUE), 1L)
# original example, verbatim from James Sams:
upc_table = data.table(upc=1:100000, upc_ver_uc=rep(c(1,2), times=50000), is_PL=rep(c(TRUE, FALSE, FALSE, TRUE), each=25000), product_module_code=rep(1:4, times=25000), ignore.column=2:100001)
test(1299, upc_table[, .N, by=list(upc, upc_ver_uc)][,max(N)], 1L) # all size 1 groups
test(1300, upc_table[, list(is_PL, product_module_code), keyby=list(upc, upc_ver_uc)][,upc[1:3]], 1:3L) # was warning "internal TRUE value has been modified"
rm(list="upc_table")
gc()
# Same test but for singleton small integers, which R-devel also plans to globalise internally.
DT = data.table(a=1:6, b=0:1)
test(1301, DT[,list(b,sum(b)),by=a], data.table(a=1:6, b=c(0L,1L), V2=c(0L,1L)))
DT = DT[1L]
set(DT,1L,"b",3L)
test(1302, 0L[1L], 3L-3L)
test(1303, 0L, 3L-3L)
# FR #334. Test to just make sure that GForce and dogroups with .N are giving the same results.
set.seed(2L)
dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10))
options(datatable.optimize = 1L)
ans1 <- dt[, list(.N, sum(y)), by=x]
options(datatable.optimize = 2L)
ans2 <- dt[, list(.N, sum(y)), by=x]
test(1304.1, ans1, ans2)
dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10), key="x")
options(datatable.optimize = 1L)
ans1 <- dt[, list(.N, sum(y)), by=x]
options(datatable.optimize = 2L)
ans2 <- dt[, list(.N, sum(y)), by=x]
test(1304.2, ans1, ans2)
# FR #338
DT <- data.table(x=1:5, y=6:10)
test(1305.01, setDF(DT), data.frame(x=1:5, y=6:10))
# setDF should return the input unchanged when it is already a data.frame, not error.
df <- data.frame(x=1:5, y=6:10)
test(1305.02, setDF(df), df) # setDF works on data.frame
# setDF also works on lists with equal lengths, #1132
df <- list(a=1:5, b=6:10)
test(1305.03, data.frame(df), setDF(df))
df <- list(1:5, 6:10)
test(1305.04, setDF(as.data.table(df)), setDF(df))
test(1305.05, setDF(1:5), error="setDF only accepts")
test(1305.06, setDF(list(1, 2:3)), error="All elements in argument")
# Tests .7 - .13 for FR #1320: setDF accepts rownames argument
dt <- data.table(a=1:5, b=6:10)
df <- data.frame(a=1:5, b=6:10)
lst <- list(a=1:5, b=6:10)
df2 <- data.frame(a=1:5, b=6:10)
rownames(df2) <- LETTERS[1:5]
test(1305.07, setDF(dt, rownames=LETTERS[1:5]), df2)
test(1305.08, setDF(df, rownames=LETTERS[1:5]), df2)
test(1305.09, setDF(lst,rownames=LETTERS[1:5]), df2)
# setDF returns an error for each type if rownames incorrect length
dt <- data.table(a=1:5, b=6:10)
df <- data.frame(a=1:5, b=6:10)
lst <- list(a=1:5, b=6:10)
test(1305.10, setDF(dt, rownames="a"), error='rownames incorrect length')
test(1305.11, setDF(df, rownames="a"), error='rownames incorrect length')
test(1305.12, setDF(lst,rownames="a"), error='rownames incorrect length')
# setDF returns an error when rownames contains duplicates
test(1305.13, setDF(dt, rownames=rep("a",5)), error='rownames contains duplicates')
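# A small sketch of the rownames= argument tested in 1305.07-13; 'demo_sdf' is a hypothetical table. setDF
# converts to data.frame by reference and, when rownames= is supplied, sets the row names on the result.
local({
  demo_sdf = data.table(a=1:3, b=4:6)
  setDF(demo_sdf, rownames=c("r1","r2","r3"))
  stopifnot(is.data.frame(demo_sdf), identical(rownames(demo_sdf), c("r1","r2","r3")))
})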
# .SD retains as much of head(key) as appropriate.
# by= always keeps groups in data appearance order, so it is which columns are grouped and selected that drives how much of the key is retained
DT = data.table(a=1:3,b=1:6,c=1:6,key="a,b")
test(1306, DT[1:2,key(.SD)], c("a","b"))
test(1307, DT[2:1,key(.SD)], NULL)
test(1308, DT[,key(.SD),by=a], data.table(a=integer()))
test(1309, DT[,key(.SD),by=b], data.table(b=DT$b, V1="a"))
test(1310, DT[,key(.SD),by=c%%2L], data.table(c=c(1L,1L,0L,0L), V1=c("a","b","a","b")))
test(1311, DT[,list(list(key(.SD))),by=a,.SDcols=1:2], data.table(a=1:3, V1=list(c("a","b")),key="a")) # .SDcols as Arun found
# That setkey can't operate on locked tables such as .SD. Added in v1.9.3.
DT = data.table(a=1:3,b=6:1)
test(1312, DT[,setkey(.SD),by=a], error="Setting a physical key on .SD is reserved for possible future use")
# was warning "Already keyed by this key but had invalid row order" due to the key not being cleared after the previous group. A solution could have been to put back the original key on populating .SD for each group. But instead we reserve it for future use and push the user towards doing it a different more efficient way (see Arun's speedups in the datatable-help thread).
# gmin and gmax extensive testing (because there are tricky cases)
DT <- data.table(x=rep(1:6, each=3), y=INT(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -2147483647, -2147483647, -2147483647, 2147483647, 2147483647, 2147483647))
# make sure GForce is running
options(datatable.optimize=3L)
# for integers
test(1313.01, DT[, min(y), by=x], DT[, base::min(y), by=x])
test(1313.02, DT[, max(y), by=x], DT[, base::max(y), by=x])
test(1313.03, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x])
test(1313.04, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x])
# testing all NA - GForce automatically converts to numeric; optimize=1L errors due to the change from integer to numeric (like median)
DT[x==6, y := INT(NA)]
test(1313.05, DT[, min(y), by=x], DT[, base::min(y), by=x])
test(1313.06, DT[, max(y), by=x], DT[, base::max(y), by=x])
test(1313.07, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-2147483647,Inf)), warning="No non-missing")
test(1313.08, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-2147483647,-Inf)), warning="No non-missing")
# for numeric
DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA))
test(1313.09, DT[, min(y), by=x], DT[, base::min(y), by=x])
test(1313.10, DT[, max(y), by=x], DT[, base::max(y), by=x])
test(1313.11, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x])
test(1313.12, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x])
# testing all NA - GForce automatically converts to numeric; optimize=1L errors due to the change from integer to numeric (like median)
DT[x==6, y := NA_real_]
test(1313.13, DT[, min(y), by=x], DT[, base::min(y), by=x])
test(1313.14, DT[, max(y), by=x], DT[, base::max(y), by=x])
test(1313.15, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,Inf)), warning="No non-missing")
test(1313.16, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,-Inf)), warning="No non-missing")
# for date (attribute check.. especially after issues/689 !!!)
DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400))
test(1313.17, DT[, list(y=min(y)), by=x], DT[c(1,6)])
test(1313.18, DT[, list(y=max(y)), by=x], DT[c(5,10)])
DT[c(1,6), y := NA]
test(1313.19, DT[, list(y=min(y)), by=x], DT[c(1,6)])
test(1313.20, DT[, list(y=max(y)), by=x], DT[c(1,6)])
test(1313.21, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)])
test(1313.22, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)])
# for character
set.seed(1L)
DT <- data.table(x=rep(1:6, each=3), y=sample(c("", letters[1:3], NA), 18, TRUE))
test(1313.23, DT[, min(y), by=x], DT[, base::min(y), by=x])
test(1313.24, DT[, max(y), by=x], DT[, base::max(y), by=x])
test(1313.25, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x])
test(1313.26, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x])
DT[x==6, y := NA_character_]
test(1313.27, DT[, min(y), by=x], DT[, base::min(y), by=x])
test(1313.28, DT[, max(y), by=x], DT[, base::max(y), by=x])
test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c("a","a","c","","a",NA)), warning="No non-missing")
test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c("b","a","c","a","c",NA)), warning="No non-missing")
# bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once
dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names =
c("x", "y", "z"), class = "data.frame", row.names = c(NA, -2L)), key = "x,y")
dt2 <- data.table(structure(list(x = c(7L, 7L, 33L, 33L, 33L, 33L), y = structure(c(15884, 15917, 15884, 15884, 15917, 15917), class = "Date"), w = c(-0.118303, 0.141225, -0.03137, -0.02533, 0.045967, 0.043694)), .Names = c("x", "y", "w"), class = "data.frame", row.names = c(NA, -6L)), key = "x,y")
test(1317.1, dt1[dt2, roll=TRUE, nomatch=0L], data.table(x=c(7L,33L,33L), y=as.Date(c("2013-07-31", "2013-07-31", "2013-07-31")), z=c(dt1$z[1:2], dt1$z[2]), w=c(dt2$w[2], dt2$w[5:6]), key="x,y"))
# also test where 'i' is not sorted.
set.seed(1L)
dt2 <- dt2[sample(nrow(dt2))] # key should be gone
test(1317.2, dt1[dt2, roll=TRUE, nomatch=0L], data.table(x=c(7L,33L,33L), y=as.Date(c("2013-07-31", "2013-07-31", "2013-07-31")), z=c(dt1$z[1:2], dt1$z[2]), w=c(dt2$w[1], dt2$w[c(2,6)])))
# bug fix for #472 : "parse" in j
set.seed(100)
nrow <- 100L
DT <- data.table(aa = sample(letters[1:5], nrow, replace = TRUE), bb = rnorm(nrow))
sumExpr <- parse(text = "sum(bb, na.rm = TRUE)")
meanExpr <- parse(text = "mean(bb, na.rm = TRUE)")
test(1318.1, DT[, eval(sumExpr), by = aa], DT[, sum(bb, na.rm=TRUE), by=aa])
test(1318.2, DT[, eval(meanExpr), by = aa], DT[, mean(bb, na.rm=TRUE), by=aa])
test(1318.3, DT[, list(mySum = eval(sumExpr), myMean = eval(meanExpr)), by = aa], DT[, list(mySum=sum(bb, na.rm=TRUE), myMean=mean(bb, na.rm=TRUE)), by=aa])
# get DT[order(.)] to make sense. In v1.12.4 these tests were changed to not be 100% consistent with base in
# cases where the base R behaviour doesn't make sense, #696
DT <- data.table(a = 1:4, b = 8:5, c=letters[4:1])
test(1319.1, DT[order(DT[, "b", with=FALSE])], DT[base::order(DT[, "b", with=FALSE])])
test(1319.2, DT[order(DT[, "c", with=FALSE])], DT[base::order(DT[, "c", with=FALSE])])
test(1319.3, DT[order(DT[, c("b","c"), with=FALSE])], DT[4:1]) # DT[base::order(DT[, c("b","c"), with=FALSE])])
test(1319.4, DT[order(DT[, c("c","b"), with=FALSE])], DT[4:1]) # DT[base::order(DT[, c("c","b"), with=FALSE])])
test(1319.5, DT[order(DT[, "b", with=FALSE], DT[, "a", with=FALSE])], error="Column 1 passed to [f]order is type 'list', not yet supported")
test(1319.6, DT[order(list(DT$a))], DT)
test(1319.7, DT[order(list(DT$a), list(DT$b))], error="The first item passed to [f]order is a plain list but there are more items. It should be a data.table or data.frame.")
test(1319.8, DT[order(list(DT$a, DT$b))], DT) # error="Column 1 of by= (1) is type 'list', not yet supported")
# FR #703. Not so extensive testing because test 1223 already tests for everything else extensively. Only integer64 here.
# this'll be the test for both DT[order(.)] and setorder(.) as both internally use forder/forderv
if (test_bit64) {
set.seed(45L)
DT <- data.table(x=as.integer64(c(-50, 0, 50, 1e18, 1e-18)), y=sample(5))
ans1 <- forder(DT, x, na.last=TRUE, decreasing=FALSE)
ans2 <- forder(DT, x, na.last=FALSE, decreasing=FALSE)
ans3 <- forder(DT, x, na.last=TRUE, decreasing=TRUE)
ans4 <- forder(DT, x, na.last=FALSE, decreasing=TRUE)
test(1320.01, ans1, as.integer(c(1,2,5,3,4)))
test(1320.02, ans2, as.integer(c(1,2,5,3,4)))
test(1320.03, ans3, as.integer(c(4,3,2,5,1)))
test(1320.04, ans4, as.integer(c(4,3,2,5,1)))
set.seed(45L)
DT <- data.table(x=as.integer64(c(-50, 0, NA, 50, 1e18, NA, 1e-18)), y=sample(7))
ans1 <- forder(DT, x, na.last=TRUE, decreasing=FALSE)
ans2 <- forder(DT, x, na.last=FALSE, decreasing=FALSE)
ans3 <- forder(DT, x, na.last=TRUE, decreasing=TRUE)
ans4 <- forder(DT, x, na.last=FALSE, decreasing=TRUE)
test(1320.05, ans1, as.integer(c(1,2,7,4,5,3,6)))
test(1320.06, ans2, as.integer(c(3,6,1,2,7,4,5)))
test(1320.07, ans3, as.integer(c(5,4,2,7,1,3,6)))
test(1320.08, ans4, as.integer(c(3,6,5,4,2,7,1)))
# missed test - checking na.last=NA!
set.seed(45L)
DT <- data.table(x=as.integer64(c(-50, 0, NA, 50, 1e18, NA, 1e-18)), y=sample(7))
ans1 <- forder(DT, x, na.last=NA, decreasing=FALSE)
ans2 <- forder(DT, x, na.last=NA, decreasing=TRUE)
test(1320.09, ans1, as.integer(c(0,0,1,2,7,4,5)))
test(1320.10, ans2, as.integer(c(0,0,5,4,2,7,1)))
}
# fread newlines inside quoted fields
test(1321, fread('A,B,C\n1,"foo\nbar",3\n4,baz,6'), data.table(A=c(1L,4L), B=c("foo\nbar","baz"), C=c(3L,6L)))
test(1322, fread('A,B,C\n1,"foo
bar",3\n4,baz,6'), data.table(A=c(1L,4L), B=c("foo\nbar","baz"), C=c(3L,6L)))
# NB: don't remove the newline after foo in test 1322 above, that's what's being tested.
test(1323, fread('col1,col2\n5,"4\n3"'), data.table(col1=5L, col2="4\n3")) # no warning as last field is finished ok
test(1324, fread('A,B,C\n1,4,"foo"\n2,5,"bar'), data.table(A=1:2,B=4:5,C=c('foo','"bar')))
test(1325, fread('A,B,C\n1,4,"foo"\n2,5,"bar"'), data.table(A=1:2,B=4:5,C=c("foo",'bar')))
test(1326, fread('A,B,C\n1,4,"foo"\n2,5,bar"'), data.table(A=1:2,B=4:5,C=c("foo",'bar"')))
test(1327, fread('A,B,C\n1,4,"foo"\n2,5,""bar""'), data.table(A=1:2,B=4:5,C=c("foo",'"bar"')), warning="resolved improper quoting")
cat('A,B\n2,"Joe \\",Bloggs"', file = f<-tempfile())
test(1328, fread(f), data.table(A=2L, B='Joe \\",Bloggs'))
cat('A,B\n2,"Joe \\",Bloggs"\n', file = f<-tempfile())
test(1328.2, fread(f), data.table(A=2L, B='Joe \\",Bloggs'))
unlink(f)
test(1329, fread(), error="empty")
# add test that escaped escapes at the end of a quoted field are handled
test(1330, fread('A,B\nfoo,1\nAnalyst\\,2\nbar,3'), data.table(A=c('foo','Analyst\\','bar'), B=1:3))
test(1331.1, fread('A,B\nfoo,1\nAnalyst\\ ,2\nbar,3'), data.table(A=c('foo','Analyst\\','bar'), B=1:3)) # strip.white=TRUE
test(1331.2, fread('A,B\nfoo,1\nAnalyst\\ ,2\nbar,3', strip.white=FALSE), data.table(A=c('foo','Analyst\\ ','bar'), B=1:3))
test(1332, fread('A,B\nfoo,1\n"Analyst\\",2\nbar,3'), data.table(A=c('foo','Analyst\\','bar'), B=1:3))
test(1332.2, fread("ab,x\n cd,x ", sep = ",", strip.white = FALSE, header = FALSE), data.table(V1=c("ab", " cd"), V2=c("x", "x "))) # Issue 2376
# double \\ in this file means one in the input, so the above " is escaped by a single '\' but still read ok
test(1333.1, fread('A,B\nfoo,1\n"Analyst\\" ,2\nbar,3'), data.table(A = c("foo", "Analyst\\", "bar"), B = 1:3))
test(1333.2, fread('A,B\nfoo,1\n"Analyst\\" ,2\nbar,3', strip.white=FALSE), data.table(A = c("foo", "Analyst\\", "bar"), B = 1:3)) # it's a quoted field with space afterwards; strip.white only applies to non-quoted strings
test(1334, fread('A,B\nfoo,1\n"Analyst\\" ,",2\nbar,3'), data.table(A=c('foo', 'Analyst\\" ,', 'bar'), B=1:3))
test(1335, fread('A,B\nfoo,1\n"Analyst\\\\",2\nbar,3'), data.table(A=c('foo','Analyst\\\\','bar'), B=1:3))
# data from 12GB file in comments on http://stackoverflow.com/a/23858323/403310 ...
# note that read.csv gets this wrong and puts jacoleman high school into the previous field, then fills the rest of the line silently.
cat('A,B,C,D,E,F
"12",0,"teacher private nfp\\\\\\\\"",""jacoleman high school","",""
"TX",77406,"business analyst\\\\\\\\\\\\\\","the boeing co","",""
"CA",94116,"na\\none","retired","",""
', file = f<-tempfile()) # aside: notice the \\ before n of none as well
test(1336.1, fread(f),
data.table(A = c("12", "TX", "CA"),
B = c(0L, 77406L, 94116L),
C = c("teacher private nfp\\\\\\\\\"", "business analyst\\\\\\\\\\\\\\", "na\\none"),
D = c("\"jacoleman high school", "the boeing co", "retired"),
E = NA,
F = NA),
warning="resolved improper quoting")
cat('A,B,C,D,E,F
"12",0,"teacher private nfp\\\\\\\\"","jacoleman high school","",""
"TX",77406,"business analyst\\\\\\\\\\\\\\","the boeing co","",""
"CA",94116,"na\\none","retired","",""
', file = f)
test(1336.2, fread(f),
data.table(A=c("12","TX","CA"),
B=c(0L,77406L,94116L),
C=c('teacher private nfp\\\\\\\\"','business analyst\\\\\\\\\\\\\\','na\\none'),
D=c('jacoleman high school','the boeing co','retired'),
E=NA,
F=NA),
warning="resolved improper quoting")
unlink(f)
# file names ending with \ (quite common)
# http://stackoverflow.com/questions/24375832/fread-and-column-with-a-trailing-backslash
cat('file,size\n"windows\\user\\",123\n', file = f<-tempfile())
test(1337, fread(f), data.table(file='windows\\user\\',size=123L))
test(1338, fread(f), as.data.table(read.csv(f,stringsAsFactors=FALSE)))
unlink(f)
# TO DO: handle this by checking for balanced embedded quotes
# cat('http,size\n"www.blah?x="one",y="two","three"",123\n', file = f<-tempfile())
# read.csv(f) -- unusually, seems to be a case it doesn't handle
# test(1339, fread(f), data.table(http='www.blah?x="one",y="two","three"',size=123L))
# unlink(f)
# FR #706 - setorder and setorderv now have a 'na.last=TRUE/FALSE' argument. Unlike `DT[order(.)]`, na.last can't be NA because setorder reorders by reference rather than subsetting. Simple tests.
set.seed(45L)
DT <- data.table(x=sample(c(-2:2, NA_integer_), 20, TRUE), y=sample(c(-1:1, NA, Inf, -Inf, NaN), 20, TRUE))
test(1340.01, setorder(copy(DT), x, na.last=TRUE ), DT[order( x, na.last=TRUE)])
test(1340.02, setorder(copy(DT), x, na.last=FALSE), DT[order( x, na.last=FALSE)])
test(1340.03, setorder(copy(DT), -x, na.last=TRUE ), DT[order(-x, na.last=TRUE)])
test(1340.04, setorder(copy(DT), -x, na.last=FALSE), DT[order(-x, na.last=FALSE)])
test(1340.05, setorder(copy(DT), y, na.last=TRUE ), DT[order( y, na.last=TRUE)])
test(1340.06, setorder(copy(DT), y, na.last=FALSE), DT[order( y, na.last=FALSE)])
test(1340.07, setorder(copy(DT), -y, na.last=TRUE ), DT[order(-y, na.last=TRUE)])
test(1340.08, setorder(copy(DT), -y, na.last=FALSE), DT[order(-y, na.last=FALSE)])
test(1340.09, setorderv(copy(DT), "x", 1L, na.last=TRUE ), DT[order( x, na.last=TRUE)])
test(1340.10, setorderv(copy(DT), "x", 1L, na.last=FALSE), DT[order( x, na.last=FALSE)])
test(1340.11, setorderv(copy(DT), "x", -1L, na.last=TRUE ), DT[order(-x, na.last=TRUE)])
test(1340.12, setorderv(copy(DT), "x", -1L, na.last=FALSE), DT[order(-x, na.last=FALSE)])
test(1340.13, setorderv(copy(DT), "y", 1L, na.last=TRUE ), DT[order( y, na.last=TRUE)])
test(1340.14, setorderv(copy(DT), "y", 1L, na.last=FALSE), DT[order( y, na.last=FALSE)])
test(1340.15, setorderv(copy(DT), "y", -1L, na.last=TRUE ), DT[order(-y, na.last=TRUE)])
test(1340.16, setorderv(copy(DT), "y", -1L, na.last=FALSE), DT[order(-y, na.last=FALSE)])
test(1340.17, setorder(copy(DT), x, na.last=NA), error="na.last must be logical TRUE/FALSE")
test(1340.18, setorderv(copy(DT), "x", na.last=NA), error="na.last must be logical TRUE/FALSE")
# bug #481 - DT[, list(list(.)), by=.] on R v3.1.0
set.seed(1L)
f <- function(x) list(x)
DT <- data.table(x=sample(3,10,TRUE), y=as.numeric(sample(10)))
test(1341.1, DT[, list(list(y)), by=x], data.table(x=unique(DT$x), V1=list(c(3,5,9), c(2,6,4,1), c(10,7,8))))
test(1341.2, DT[, list(list(.I)), by=x], data.table(x=unique(DT$x), V1=list(c(1,5,10), c(2,3,8,9), c(4,6,7))))
test(1341.3, DT[, list(f(y)), by=x], data.table(x=unique(DT$x), V1=list(c(3,5,9), c(2,6,4,1), c(10,7,8))))
# test for list(list(.)) with :=
test(1341.4, copy(DT)[, z := list(list(y)), by=x], copy(DT)[, z := list(list(copy(y))), by=x])
test(1341.5, copy(DT)[, z := list(list(.I)), by=x], copy(DT)[, z := list(list(copy(.I))), by=x])
test(1341.6, copy(DT)[, z := list(f(y)), by=x], copy(DT)[, z := list(f(copy(y))), by=x])
# test regression on over-allocation (selfref) on unique() which uses new subsetDT()
bla <- data.table(x=c(1,1,2,2), y=c(1,1,1,1))
test(1342, unique(bla)[, bla := 2L], data.table(x=c(1,2),y=1,bla=2L))
# blank and NA fields in logical columns
test(1343.1, fread("A,B\n1,TRUE\n2,\n3,False"), data.table(A=1:3, B=c("TRUE","","False")))
test(1343.2, fread("A,B\n1,True\n2,\n3,false"), data.table(A=1:3, B=c("True","","false")))
test(1343.3, fread("A,B\n1,TRUE\n2,\n3,FALSE"), data.table(A=1:3, B=c(TRUE,NA,FALSE)))
test(1343.4, fread("A,B\n1,True\n2,\n3,False"), data.table(A=1:3, B=c(TRUE,NA,FALSE)))
test(1343.5, fread("A,B\n1,true\n2,\n3,false"), data.table(A=1:3, B=c(TRUE,NA,FALSE)))
test(1343.6, fread("A,B\n1,true\n2,NA\n3,"), data.table(A=1:3, B=c(TRUE,NA,NA)))
test(1344.1, fread("A,B\n1,2\n0,3\n,1\n", logical01=FALSE), data.table(A=c(1L,0L,NA), B=c(2L,3L,1L)))
test(1344.2, fread("A,B\n1,2\n0,3\n,1\n", logical01=TRUE), data.table(A=c(TRUE,FALSE,NA), B=c(2L,3L,1L)))
# .N now available in i
DT = data.table(a=1:3,b=1:6)
test(1348, DT[.N], DT[6])
test(1349, DT[.N-1:3], DT[5:3])
test(1350, DT[.N+1], DT[NA])
# Adding test to catch any future regressions - #734
dt = data.table(id = rep(c('a','b'), each=2), val = rep(c(1,2,3), times=c(1,2,1)))
setkey(dt, id, val)
test(1351.1, dt[J("a"), val], c(1,2))
test(1351.2, dt[J('a'), range(val)], c(1,2))
# New feature: .() in j and .() in by
DT = data.table(a=1:3, b=1:6, c=LETTERS[1:6])
test(1352.1, DT[,.(b)], DT[,list(b)])
test(1352.2, DT[,.(b,c)], DT[,c("b","c"),with=FALSE])
test(1352.3, DT[,.(sum(b)),by=a], DT[,sum(b),by=a])
test(1352.4, DT[,.(MySum=sum(b)), by=a], data.table(a=1:3, MySum=c(5L,7L,9L)))
test(1352.5, DT[,sum(b),by=.(a)], DT[,sum(b),by=a])
test(1352.6, DT[,sum(b),by=.(a%%2)], DT[,sum(b),by=a%%2])
test(1352.7, DT[,sum(b),by=.(Grp=a%%2)], DT[,sum(b),by=list(Grp=a%%2)])
test(1352.8, DT[,sum(b),by=.(a%%2,c)], DT[,sum(b),by=list(a%%2,c)])
# test that := NULL together with i is now an error
DT = data.table(a=1:3, b=1:6)
test(1353.1, DT[2, b:=NULL], error="When deleting columns, i should not be provided")
test(1353.2, DT[2, c("a","b"):=list(42, NULL)], error="When deleting columns, i should not be provided")
# order optimisation caused trouble due to chaining because of 'substitute(x)' usage in [.data.table.
set.seed(1L)
X = data.table(id=1:10, val1=sample(3,10,TRUE))
Y = data.table(val1=1:4, val2=8:5, key="val1")
setkey(X, val1)
test(1354, X[Y, val2 := i.val2, allow.cartesian=TRUE][, val1 := NULL][order(id)], data.table(id=1:10, val2=as.integer(c(8,7,7,6,8,6,6,7,7,8))))
# Fix for #475, setDT(CO2) should error, as it's trying to modify the object whose binding is locked.
# CO2 is not locked in R 2.14.1 but is in R >= 3.1.0. R NEWS isn't clear when that change happened, so just test there is an error when it is locked.
if (bindingIsLocked("CO2",as.environment("package:datasets"))) {
test(1355.1, setDT(CO2), error="Cannot convert 'CO2' to data.table by reference because binding is locked.")
} else {
test(1355.2, setDT(CO2), CO2)
}
# Fix for #698. not join doesn't need to check for allow.cartesian=TRUE.
DT1 <- data.table(x=rep(1:3, each=3L), y=1:9, key="x")
DT2 <- data.table(x=rep(c(3L,1L), each=10), z=1L)
test(1356, DT1[!DT2], data.table(x=2L, y=4:6, key="x"))
# Fix for #745. as.data.table.matrix shouldn't convert character to factor
m <- matrix(letters[1:4], ncol=2)
test(1357, as.data.table(m), data.table(V1=letters[1:2], V2=letters[3:4]))
# Fix for #471. A[A[A]] contains duplicate names in 1.9.3
A <- data.table(foo = 1:2, bar = 3:4)
setkey(A, foo)
test(1358.1, names(A[A[A]]), c("foo", "bar", "i.bar", "i.bar.1"))
test(1358.2, names(A[A[A[A]]]), c("foo", "bar", "i.bar", "i.bar.2", "i.bar.1"))
# Fix for #743. 0 and -0 and the sign bit issue
A <- data.table(x=c(0,0,-1,1,-1,0,-0,1,-1,1,0,1), y=1:12)
test(1359.1, A[, .N, by=x], data.table(x=c(0,-1,1), N=c(5L,3L,4L)))
dt1 <- data.table(x2 = 0L)
dt2 <- data.table(x2 =-(11-11)/10)
test(1359.2, as.integer(merge(dt2, dt1, by="x2")$x2), as.integer(merge(dt1, dt2, by="x2")$x2))
# Fix for #744: X[Y, c(...), by=.EACHI] segfaulted because 'i' was used as the for-loop variable, masking the original 'i' input.
dt <- data.table(id = c("A", "A", "B", "B", "C"), val1=1:5, val2=6:10, key = "id")
sample <- c("A", "B")
test(1360.1, dt[sample, c(.N), by = .EACHI], dt[sample, list(V1=.N), by=.EACHI])
test(1360.2, copy(dt)[sample, N := c(.N), by = .EACHI], copy(dt)[sample, N := .N, by = .EACHI])
# Fix for #500 - `lapply` call shouldn't redirect to `[.data.frame`.
L <- list(data.table(BOD), data.table(BOD))
test(1361, lapply(L, "[", Time==3L), list(L[[1L]][Time == 3L], L[[2L]][Time == 3L]))
# Feature #735, first two cases: 1) .SD, and 2) DT[, c(.SD, lapply(.SD, ...)), by=...] optimisation:
# Don't set options(datatable.verbose=TRUE) here because the "running test 1362.1 ..." messages would scroll any errors out of the last 13 lines of output shown by CRAN checks
DT <- data.table(x=c(1,1,1,2,2), y=1:5, z=6:10)
test(1362.01, DT[, .SD, by=x, verbose=TRUE],
output="lapply optimization changed j from '.SD' to 'list(y, z)'")
test(1362.02, DT[, c(.SD), by=x, verbose=TRUE],
output="lapply optimization changed j from 'c(.SD)' to 'list(y, z)'")
test(1362.03, DT[, c(.SD, lapply(.SD, sum)), by=x, verbose=TRUE],
output="lapply optimization changed j from 'c(.SD, lapply(.SD, sum))' to 'list(y, z, sum(y), sum(z))'")
test(1362.04, DT[, c(lapply(.SD, sum), .SD), by=x, verbose=TRUE],
output="lapply optimization changed j from 'c(lapply(.SD, sum), .SD)' to 'list(sum(y), sum(z), y, z)'")
test(1362.05, DT[, c(list(y), .SD, lapply(.SD, sum)), by=x, verbose=TRUE],
output="lapply optimization changed j from 'c(list(y), .SD, lapply(.SD, sum))' to 'list(y, y, z, sum(y), sum(z))'")
# 3) .SD[1] and 4) .SD[1L]
test(1362.06, DT[, c(.SD[1L]), by=x, verbose=TRUE],
output="lapply optimization changed j from 'c(.SD[1L])' to 'list(y[1L], z[1L])'")
test(1362.07, DT[, c(.SD[1L], lapply(.SD, sum)), by=x, verbose=TRUE],
output="lapply optimization changed j from 'c(.SD[1L], lapply(.SD, sum))' to 'list(y[1L], z[1L], sum(y), sum(z))'")
test(1362.08, DT[, c(.SD[.N]), by=x, verbose=TRUE],
output="lapply optimization changed j from 'c(.SD[.N])' to 'list(y[.N], z[.N])'")
test(1362.09, DT[, .SD[1], by=x, verbose=TRUE],
output="lapply optimization changed j from '.SD[1]' to 'list(y[1], z[1])'")
test(1362.10, DT[, c(.SD[1]), by=x, verbose=TRUE],
output="lapply optimization changed j from 'c(.SD[1])' to 'list(y[1], z[1])'")
test(1362.11, DT[, c(.SD[1], lapply(.SD, sum)), by=x, verbose=TRUE],
output="lapply optimization changed j from 'c(.SD[1], lapply(.SD, sum))' to 'list(y[1], z[1], sum(y), sum(z))'")
test(1362.12, DT[, head(.SD, 1), by=x, verbose=TRUE],
output="lapply optimization changed j from 'head(.SD, 1)' to 'list(head(y, 1), head(z, 1))'")
# make sure .I is named as I when no name is given
test(1362.13, names(DT[, c(list(.I, mean(y)), lapply(.SD, sum)), by=x]), c("x", "I", "V2", "y", "z"))
# and if a name is given, it's retained
test(1362.14, names(DT[, c(list(bla=.I, mean(y)), lapply(.SD, sum)), by=x]), c("x", "bla", "V2", "y", "z"))
# Add test to ensure that mean() gets replaced with fastmean when GForce won't be used.
test(1362.15, DT[, c(list(.I, mean(y)), lapply(.SD, mean)), by=x, verbose=TRUE],
output="Old mean optimization changed j from 'list(.I, mean(y), mean(y), mean(z))' to 'list(.I, .External(Cfastmean, y, FALSE), .External(Cfastmean, y, FALSE), .External(Cfastmean, z, FALSE))'")
DT[, w:=c(3i,4i,5i,6i,7i)]
test(1362.16, DT[, .(mean(w), y*2), by=x], error="fastmean was passed type complex, not numeric or logical")
# setDT(DT), when the input is already a data.table, checks selfrefok() and, if not ok, runs alloc.col again.
DT = list(data.frame(x=1:5, y=6:10))
invisible(lapply(DT, setDT))
DT = DT[[1L]]
test(1363.1, selfrefok(DT), 1L)
foo <- function(x) setDT(x)
df = data.frame(x=1, y=2)
foo(df)
test(1363.2, selfrefok(df), 0L)
setDT(df)
test(1363.3, selfrefok(df), 1L)
# setdiff, partly #547. Internal as of now, and named setdiff_ because a name starting with "set" could be confused with the set* functions.
# maybe provide a %diff% operator that internally calls setdiff_? Usage: x %diff% y (see the sketch below)
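# A minimal, hypothetical sketch of what such an operator could look like. It is not part of the
# API; the operator name and the 'byCols' helper are illustrative only and assume comparison on
# the columns common to both tables:
# `%diff%` = function(x, y) {
#   byCols = intersect(names(x), names(y))
#   setdiff_(x, y, by.x=byCols, by.y=byCols)
# }
# x %diff% y   # unique rows of x (projected onto byCols) that have no match in y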
X = data.table(a=c(1,1,1,1,3,3,2,2,2))[, `:=`(b=factor(a), c=as.character(a), d = as.integer(a), e=1:9)]
Y = data.table(a=c(3,4), b=factor(3:4), c=c("3","4"), d=3:4, e=c(TRUE, FALSE), f=c(5L,7L))
test(1364.01, setdiff_(X, Y, "a", "a"), data.table(a=c(1,2)))
test(1364.02, setdiff_(X, Y, c("a", "e"), c("a", "f")), X[!5, list(a,e)])
test(1364.03, setdiff_(X, Y, "a", "e"), error="When x's column ('a') is integer or numeric, the corresponding column in y ('e')")
test(1364.04, setdiff_(X, Y, "b", "b"), data.table(b=factor(c(1,2), levels=c(1,2,3))))
test(1364.05, setdiff_(X, Y, c("b", "e"), c("b", "f")), X[!5, list(b,e)])
test(1364.06, setdiff_(X, Y, "b", "c"), data.table(b=factor(c(1,2), levels=c(1,2,3))))
test(1364.07, setdiff_(X, Y, "c", "c"), data.table(c=as.character(c(1,2))))
test(1364.08, setdiff_(X, Y, c("c", "e"), c("c", "f")), X[!5, list(c,e)])
test(1364.09, setdiff_(X, Y, "c", "b"), data.table(c=c("1", "2")))
test(1364.10, setdiff_(X, Y, "d", "d"), data.table(d=1:2))
test(1364.11, setdiff_(X, Y, c("d", "e"), c("d", "f")), X[!5, list(d,e)])
test(1364.12, setdiff_(X, Y, "d", "e"), error="When x's column ('d') is integer or numeric, the corresponding column in y ('e')")
test(1364.13, setdiff_(X, Y, "b", "a"), error="When x's column ('b') is factor, the corresponding column in y ('a')")
test(1364.14, setdiff_(X, Y, "c", "a"), error="When x's column ('c') is character, the corresponding column in y ('a') ")
test(1364.15, setdiff_(X, Y), error="length(by.x) != length(by.y)")
test(1364.16, setdiff_(X[, list(a)], Y[, list(a)]), data.table(a=c(1,2)))
setDF(X)
test(1364.17, setdiff_(X, Y), error = 'x and y must both be data.tables')
setDT(X)
setDF(Y)
test(1364.18, setdiff_(X, Y), error = 'x and y must both be data.tables')
setDT(Y)
test(1364.19, setdiff_(X[0L], Y), X[0L])
test(1364.20, setdiff_(X, Y, by.x = 'f'), error = 'specify non existing column*.*f')
#test(1364.21, setdiff_(X, Y, by.x = c('f', 'g')), error = 'by.x values [f, g] not present') # now only the first non-existing column is printed, for efficiency
test(1364.22, setdiff_(X, Y[0L], by.x = 'a'),
data.table(a = c(1, 3, 2), b = factor(c(1L, 3L, 2L)),
c = c("1", "3", "2"), d = c(1L, 3L, 2L), e = c(1L, 5L, 7L)))
# not join along with by=.EACHI, #604
DT <- data.table(A=c(1,1,1,2,2,2,2,3,3,4,5,5))[, `:=`(B=as.integer(A), C=rep(c("c", "e", "a", "d"),3L), D=factor(rep(c("c", "e", "a", "d"),3L)), E=1:12)]
setkey(DT, A)
test(1365.1, DT[!J(c(2,5)), sum(E), by=.EACHI], DT[J(c(1,3,4)), sum(E), by=.EACHI])
setkey(DT, B)
test(1365.2, DT[!J(c(4:5)), list(.N, sum(E)), by=.EACHI], DT[J(1:3), list(.N, sum(E)), by=.EACHI])
setkey(DT, C)
test(1365.3, copy(DT)[!"c", f:=.N, by=.EACHI], copy(DT)[c("a","d","e"), f:=.N, by=.EACHI])
setkey(DT, D)
test(1365.4, DT[!J(factor("c")), .N, by=.EACHI], DT[J(factor(c("a","d","e"))), .N, by=.EACHI])
test(1365.5, DT[!"c", lapply(.SD, sum), by=.EACHI, .SDcols=c("B","E")], data.table(D=factor(c("a","d","e")), B=INT(8,10,7), E=INT(21,24,18), key="D"))
test(1365.6, DT[c("a","d","e"), lapply(.SD, sum), by=.EACHI, .SDcols=c("B", "E")], data.table(D=c("a","d","e"), B=INT(8,10,7), E=INT(21,24,18), key="D"))
# uniqlengths doesn't error on 0-length input
test(1366, uniqlengths(integer(0), 0L), integer(0))
# na.last=NA puts 0's for NAs even when they are not at the beginning, and also when there are values very close to NA_integer_ for integers or to -Inf for numerics, for example. The logic was moved to the end in forder.c so that NAs are replaced with 0's after the ordering has been taken care of completely.
x = c(-2147483000L, NA_integer_, 1L)
test(1367.1, forderv(x, na.last=NA), c(0L,1L,3L))
x = c(NA, Inf, 0, 1, -1, -Inf, NaN)
test(1367.2, forderv(x, na.last=NA), c(0L, 0L, 6L, 5L, 3L, 4L, 2L))
# Fix for integer overflow segfault in setRange
x = c(-2147483647L, NA_integer_, 2L)
test(1367.3, forderv(x), c(2L, 1L, 3L))
x = c(2147483647L, NA_integer_, -2L)
test(1367.4, forderv(x), c(2L, 3L, 1L))
# tests for frankv. testing on vectors alone so that we can compare with base::rank
# no seed set on purpose
dt = data.table(AA=sample(c(-2:2), 50, TRUE),
BB=sample(c(-2,-1,0,1,2,Inf,-Inf), 50, TRUE),
CC=sample(c(letters[1:5]), 50, TRUE),
DD=sample(c(-2:2), 50, TRUE),
EE=sample(as.logical(c(-2:2)), 50, TRUE))
if (test_bit64) dt[, DD := as.integer64(DD)]
test_no = 1368.0
for (i in seq_along(dt)) {
col = dt[[i]]
for (j in list(TRUE, FALSE, "keep")) {
# ensure consistency with base::rank ties.methods as advertised
for (k in eval(formals(base::rank)$ties.method)) {
if (k == "random") set.seed(45L)
if (class(col) == "integer64") {
r1 = rank(as.integer(col), ties.method=k, na.last=j)
r2 = rank(-xtfrm(as.integer(col)), ties.method=k, na.last=j)
}
else {
r1 = rank(col, ties.method=k, na.last=j)
r2 = rank(-xtfrm(col), ties.method=k, na.last=j)
}
if (k == "random") set.seed(45L)
r3 = frankv(col, ties.method=k, na.last=j)
r4 = frankv(col, order=-1L, ties.method=k, na.last=j)
test_no = test_no+.0001
test(test_no, r1, r3)
test_no = test_no+.0001
test(test_no, r2, r4)
}
}
}
# test na.last=NA here separately.
dt = data.table(AA=sample(c(-2:2, NA), 50, TRUE),
BB=sample(c(-2,-1,0,1,2,Inf,-Inf, NA, NaN), 50, TRUE),
CC=sample(c(letters[1:5], NA), 50, TRUE),
DD=sample(c(-2:2, NA), 50, TRUE),
EE=sample(as.logical(c(-2:2, NA)), 50, TRUE))
if (test_bit64) dt[, DD := as.integer64(DD)]
test_no = 1369.0
for (i in seq_along(dt)) {
col = dt[[i]]
# ensure consistency with base::rank ties.methods as advertised
for (k in eval(formals(base::rank)$ties.method)) { # 'random' on now with tweak to match base in #4243
if (k == "random") set.seed(45L)
if (class(col) == "integer64") {
r1 = rank(as.integer(col), ties.method=k, na.last=NA)
r2 = rank(-xtfrm(as.integer(col)), ties.method=k, na.last=NA)
}
else {
r1 = rank(col, ties.method=k, na.last=NA)
r2 = rank(-xtfrm(col), ties.method=k, na.last=NA)
}
if (k == "random") set.seed(45L)
r3 = frankv(col, ties.method=k, na.last=NA)
r4 = frankv(col, order=-1L, ties.method=k, na.last=NA)
test_no = test_no+.0001
test(test_no, r1, r3)
test_no = test_no+.0001
test(test_no, r2, r4)
}
}
# tests for is_na, which is the equivalent of rowSums(is.na(dt)) > 0L
# not exported yet, but we could!
## UPDATE: also added tests for "any_na", the internal version of anyNA,
## which also includes an implementation for bit64::integer64. The
## real need is to merge factors correctly in joins, where we need
## a fast check for NAs; we can't rely on R 3.1+ for anyNA.
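## A small illustration of the equivalence exercised below (kept as a comment; the loop over
## column combinations provides the real coverage):
## tmp = list(a=c(1, NA, 3), b=c("x", "y", NA))
## is_na(tmp)                                  # FALSE TRUE TRUE
## rowSums(is.na(as.data.table(tmp))) > 0L     # the same logical vector
## any_na(tmp)                                 # TRUE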
dt = list(AA=sample(c(NA,-2:2), 50, TRUE),
BB=sample(c(NA,-2,-1,0,NaN,1,2,Inf,-Inf), 50, TRUE),
CC=sample(c(NA,letters[1:5]), 50, TRUE),
DD=sample(c(NA,-2:2), 50, TRUE),
EE=sample(as.logical(c(NA,-2:2)), 50, TRUE))
if (test_bit64) dt[["DD"]] = as.integer64(dt[["DD"]])
test_no = 1370.0
ans = as.list(na.omit(as.data.table(dt)))
for (i in seq_along(dt)) {
combn(names(dt), i, function(cols) {
ans1 = is_na(dt[cols])
ans2 = rowSums(is.na(as.data.table(dt[cols]))) > 0L
test_no <<- test_no+.0001
test(test_no, ans1, ans2)
# update: tests for any_na
test_no <<- test_no+.0001
test(test_no, any_na(dt[cols]), TRUE)
test_no <<- test_no+.0001
test(test_no, any_na(ans[cols]), FALSE)
TRUE
})
}
## The function is_na now gains a 'by' argument to specify the columns. Tests have not been added for that yet.
## However, tests have been added for 'na.omit.data.table', which uses is_na internally, so we don't need to repeat them here.
## See tests 1394.*
# extensive testing of overlap joins:
# first test all argument check errors...
x = data.table(chr=c("Chr1", "Chr1", "Chr2", "Chr2", "Chr2"), start=c(5,10, 1, 25, 50), end=c(11,20,4,52,60))
y = data.table(chr=c("Chr1", "Chr1", "Chr2"), start=c(1, 15,1), end=c(4, 18, 55), val=1:3)
# no by.x and by.y error
test(1371.1, foverlaps(x, y, type="any"), error="'y' must be keyed (i.e., sorted, and, marked as sorted).")
setkey(y, chr, end, start)
test(1371.2, foverlaps(x, y, by.y=1:3, type="any"), error="The first 3 columns of y's key must be identical to the columns specified in by.y.")
setkey(y, chr, start, end)
setnames(y, c("chr", "pos1", "pos2", "val"))
setcolorder(y, c("chr", "val", "pos1", "pos2"))
test(1371.3, foverlaps(x,y,by.x=1:3, nomatch=0L), data.table(chr=x$chr[2:5], y[c(2,3,3,3), -1, with=FALSE], x[2:5, 2:3, with=FALSE]))
test(1371.4, foverlaps(x, y, type="any", by.x=c("chr", "start", "end"), by.y=c("chr", "pos1", "pos2"), which=TRUE, nomatch=0L),
data.table(xid=2:5, yid=INT(2,3,3,3)))
# 1371.4 same as :
# fo(gr(x), gr(y[, c(1,3,4), with=FALSE]), type="any", select="all")
# where :
# gr <- function(x) {
# GRanges(Rle(x[[1]]), IRanges(start=x[[2]], end=x[[3]]))
# }
# fo <- function(gr1, gr2, ...) {
# olaps = findOverlaps(gr1, gr2, ...)
# if (is.vector(olaps)) return(olaps)
# ans = setDT(list(xid=queryHits(olaps), yid=subjectHits(olaps)))
# setorder(ans)
# ans
# }
# ----- DO NOT REMOVE THIS ---------------------------------------------------------------------
# CONTAINS CODE TO REGENERATE TEST1372.RData IF NECESSARY IN THE FUTURE
# cc(FALSE)
# require(GenomicRanges)
# gr <- function(x) {
# GRanges(Rle(x[[1]]), IRanges(start=x[[2]], end=x[[3]]))
# }
# fo <- function(gr1, gr2, ...) {
# olaps = findOverlaps(gr1, gr2, ...)
# if (is.vector(olaps)) return(olaps)
# ans = setDT(list(xid=queryHits(olaps), yid=subjectHits(olaps)))
# setorder(ans)
# ans
# }
# types=c("any", "within", "start", "end", "equal")
# mults=c("all", "first", "last")
# maxgap=-1L; minoverlap=0L
# verbose=FALSE; which=TRUE
# set.seed(123)
# this = 1L
# times = 3L
# ans = list() # vector("list", times*length(types)*length(mults))
# for (run in seq_len(times)) {
# n1 = max(50L, sample(1e2L, 1, FALSE))
# n2 = max(50L, sample(1e2L, 1, FALSE))
# N = max(100L, sample(1e3L, 1, FALSE))
# i1 = sample(N, n1, TRUE)
# i2 = sample(N, n1, TRUE)
# start = pmin(i1,i2)
# end = pmax(i1,i2)
# chr = sort(sample(paste("Chr", 1:2, sep=""), length(start), TRUE))
# i = setDT(list(chr=chr, start=start, end=end))
# i1 = sample(N, n2, TRUE)
# i2 = sample(N, n2, TRUE)
# start = pmin(i1,i2)
# end = pmax(i1,i2)
# chr = sort(sample(paste("Chr", 1:2, sep=""), length(start), TRUE))
# x = setDT(list(chr=chr, start=start, end=end))
# x_eq_idx <- sample(nrow(i), 4L)
# x = rbind(x, i[sample(x_eq_idx, 10L, TRUE)]) # for type='equal' matches
# setkey(x); setkey(i)
# for (type in types) {
# for (mult in mults) {
# idx <- paste(type, mult, run, sep="_")
# ans[[idx]] <- fo(gr(i), gr(x), type=type, select=mult)
# this = this+1L
# }
# }
# }
# save(ans, file="~/Dropbox/Public/github/Rdatatable/datatable/inst/tests/test1372-new.RData")
# ----------------------------------------------------------------------------------------------
types=c("any", "within", "start", "end", "equal") # add 'equal' as well
mults=c("all", "first", "last")
maxgap=-1L; minoverlap=0L # defaults have changed in IRanges/GenomicRanges::findOverlaps
verbose=FALSE; which=TRUE
test_no = 1372.0
load(testDir("test1372.Rdata")) # Regenerated on 17/02/2019 to include type='equal'. The variable 'ans' holds all the results from running GenomicRanges separately using the code above; it is a list with names of the format type_mult_run
set.seed(123)
this = 1L
times = 3L
for (run in seq_len(times)) {
n1 = max(50L, sample(1e2L, 1, FALSE))
n2 = max(50L, sample(1e2L, 1, FALSE))
N = max(100L, sample(1e3L, 1, FALSE))
i1 = sample(N, n1, TRUE)
i2 = sample(N, n1, TRUE)
start = pmin(i1,i2)
end = pmax(i1,i2)
chr = sort(sample(paste("Chr", 1:2, sep=""), length(start), TRUE))
i = setDT(list(chr=chr, start=start, end=end))
i1 = sample(N, n2, TRUE)
i2 = sample(N, n2, TRUE)
start = pmin(i1,i2)
end = pmax(i1,i2)
chr = sort(sample(paste("Chr", 1:2, sep=""), length(start), TRUE))
x = setDT(list(chr=chr, start=start, end=end))
x_eq_idx <- sample(nrow(i), 4L)
x = rbind(x, i[sample(x_eq_idx, 10L, TRUE)]) # for type='equal' matches
setkey(x); setkey(i)
for (type in types) {
for (mult in mults) {
# data.table overlap join
nomatch = if(mult == "all") NULL else NA_integer_
thisans = foverlaps(i, x, mult=mult, type=type, nomatch=nomatch, which=which, verbose=verbose)
test_no = test_no+.01
# cat("test =", test_no, ", run = ", run, ", type = ", type, ", mult = ", mult, "\n", sep="")
idx = paste(type, mult, run, sep="_")
# ans[[idx]] contains fo(gr(i), gr(x), type=type, select=mult)
test(test_no, thisans, ans[[idx]])
this = this+1L
}
}
}
test(1372.91, load(testDir("test1372-1.Rdata")), c("i","x")) # 3432
test(1372.92, foverlaps(i, x, mult="all", type="equal", nomatch=NULL, which=TRUE),
data.table(xid=INT(11,35,35,45,45,45,46,46,46,46),
yid=INT(20,39,40,52,53,54,55,56,57,58)))
# fix for bug in address - #824
# was temporarily disabled in issue #2619
x = c(1L,5L,3L)
address(x) ## shouldn't increment NAM field
out = capture.output(.Internal(inspect(x)))
test(1373, grepl("(NAM|REF)\\(1\\)", out), TRUE) # 'NAM'<R4 'REF'>=R4; #4058
# fix for bug #762 - key'd data.table with a non-existing column in 'by' was not handled properly.
DT <- data.table(x=1:5, z=5:1, key="z")
y <- c(1,3,2,3,2)
test(1374.1, DT[, list(x=sum(x)), by=y], data.table(y=c(1,3,2), x=c(5L, 6L, 4L)))
y <- c(1,2,2,3,3)
test(1374.2, DT[, list(x=sum(x)), by=y], data.table(y=c(1,2,3), x=c(5L, 7L, 3L)))
# order in i combined with := in j, updates those rows in that order
# order in i without := in j, returns new object in that order, which is then updated
# Similarly, subset in i with := in j, updates that subset
DT = as.data.table(iris)
DT[,Species:=as.character(Species)]
test(1375.1, DT[,mean(Petal.Width),by=Species][order(-V1),Species:=toupper(Species)]$Species, c("SETOSA","VERSICOLOR","VIRGINICA"))
test(1375.2, DT[,mean(Petal.Width),by=Species][order(-V1)][,Species:=toupper(Species)]$Species, c("VIRGINICA","VERSICOLOR","SETOSA"))
test(1375.3, DT[,mean(Petal.Width),by=Species][V1>1,Species:=toupper(Species)]$Species, c("setosa","VERSICOLOR","VIRGINICA"))
# Secondary keys a.k.a indexes ...
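# A minimal self-contained sketch of how an index is stored ('ex' below is a throwaway table not
# used by any test): setindex() saves the ordering vector as a "__<col>" element of the "index"
# attribute, and indices() lists the index names; see also allIndicesValid() near test 1419.
invisible(local({
  ex = data.table(a=1:3, b=c(3L,1L,2L))
  setindex(ex, b)
  list(index_names = indices(ex),                           # "b"
       stored_as   = names(attributes(attr(ex, "index"))))  # "__b"
}))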
DT = data.table(a=1:10,b=10:1)
test(1376.01, indices(DT), NULL)
test(1376.02, DT[b==7L,verbose=TRUE], DT[4L], output="Creating new index 'b'")
test(1376.03, indices(DT), "b")
test(1376.04, DT[b==8L,verbose=TRUE], DT[3L], output="Optimized subsetting with index 'b'")
test(1376.05, DT[a==7L,verbose=TRUE], DT[7L], output="Creating new index") # add 2nd secondary key
test(1376.06, indices(DT), c("b","a")) # 2 secondary keys of single columns
test(1376.07, DT[a==7L,verbose=TRUE], DT[7L], output="Optimized subsetting with index 'a'")
setkey(DT,b)
test(1376.08, indices(DT), NULL)
test(1376.09, list(DT[a==2L], indices(DT)), list(DT[9L],"a")) # create indices for next test
setindex(DT,NULL)
test(1376.10, list(key(DT), indices(DT)), list("b", NULL))
options(datatable.auto.index = FALSE)
test(1376.11, list(DT[a==2L], indices(DT)), list(DT[9L],NULL))
options(datatable.auto.index = TRUE)
test(1376.12, list(DT[a==2L], indices(DT)), list(DT[9L],"a"))
# When i is FALSE and a column is being added by reference, for consistency with cases when i is not FALSE
# we should still add the column. But we need to know what type it should be, so the user supplied RHS of :=
# needs to work on empty input to tell us the column type. Package vardpoor in example(vardchanges) used to
# rely on DT[FALSE,...] not adding the column and not evaluating RHS but it no longer does that so we can
# make this consistent now. If that usage is required then the user should use if(FALSE) DT[...] instead.
DT = data.table(a=1:3, b=4:6)
ans = copy(DT)[, foo:=NA_real_]
test(1377.1, copy(DT)[FALSE, foo:=7], ans)
test(1377.2, copy(DT)[0, foo:=7], ans)
test(1377.3, copy(DT)[, foo := Reduce(function(x,y)paste(x,y,sep="__"), .SD), .SDcols=c("a","b")],
data.table(a=1:3, b=4:6, foo=c("1__4","2__5","3__6")))
err = "Some items of .SDcols are not column names"
# .SDcols should always be checked even if RHS (which uses .SDcols) isn't eval'd due to i==FALSE
test(1377.4, copy(DT)[, bar := Reduce(function(x,y)paste(x,y,sep="__"), .SD), .SDcols=c("a","zz")],
error=err)
test(1377.5, copy(DT)[FALSE, bar := Reduce(function(x,y)paste(x,y,sep="__"), .SD), .SDcols=c("a","zz")],
error=err)
test(1377.6, DT, data.table(a=1:3, b=4:6)) # check that the original hasn't been changed by these tests
test(1377.7, copy(DT)[FALSE, bar:=stop("eval'd")], error="eval'd")
DT[,bar:=NA] # create the column so that the RHS doesn't need to be eval'd to know the type. We don't allow type changes anyway.
# Now no need to eval RHS (and therefore find error), as relied on by package treemap
# in example(random.hierarchical.data) in the do.call of fun=="addRange" where it's called on
# an empty subset and LB <- x[[1]][1] results in NA which causes seq(LB, UB, ...) to error.
test(1377.8, copy(DT)[FALSE, bar:=stop("eval'd")], DT)
#====================================
# fread issue with http download on Windows, thanks to Steve Miller for highlighting.
# any file would do but this one is http://www.russell.com/common/indexes/csvs/russellmicrocapvalueindex_hist.csv
# it happens to have a \r embedded in the first (quoted) column as well but that's not the issue
# can't pass in the http: address directly because this runs on CRAN and any http: site might be unavailable
# therefore, this doesn't actually test mode="wb", but it's as close as we can get
# NB: As of v1.10.5, fread copes ok with any number of \r before the \n
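# For reference, the real-world pattern being simulated would look something like the sketch below
# (commented out, not run here, since CRAN machines may not have internet access; the URL is the
# one mentioned above):
# tmp = tempfile(fileext=".csv")
# download.file("http://www.russell.com/common/indexes/csvs/russellmicrocapvalueindex_hist.csv",
#               tmp, mode="wb")   # mode="wb" so that Windows doesn't translate line endings
# fread(tmp)[19, `Value With Dividends`]
# unlink(tmp)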
test(1378.1, fread(file=testDir("russellCRLF.csv"))[19,`Value With Dividends`], 357.97)
f = paste0("file://",testDir("russellCRLF.csv"))
# simulates an http:// request as far as download.file() and unlink() go, without needing internet
# download.file() in fread() changes the input data from \r\n to \n, on Windows.
test(1378.2, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97)
f = paste("file://",testDir("russellCRCRLF.csv"),sep="")
# actually has 3 \r in the file, download.file() from file:// changes that to \r\r\n, so we can simulate download.file from http: in text mode.
test(1378.3, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97)
#====================================
options(datatable.fread.datatable = FALSE)
test(1379.1, fread("A,B\n1,3\n2,4\n"), data.frame(A=1:2,B=3:4))
test(1379.2, fread("A,B\n1,3\n2,4\n",data.table=TRUE), data.table(A=1:2,B=3:4))
options(datatable.fread.datatable = TRUE)
test(1379.3, fread("A,B\n1,3\n2,4\n",data.table=FALSE), data.frame(A=1:2,B=3:4))
options(datatable.fread.datatable = TRUE)
# Test that the RHS of == is coerced to x's type before bmerge in auto indexing. Package vardpoor does this in example(linqsr)
DT = data.table(a=c(0,0,1,1,0,0), b=1:6) # 'a' type double here, as it is in vardpoor
test(1380, DT[a==TRUE], DT[3:4])
# Fix #847, as.data.table.list and character(0) issue
x <- data.table(a=character(0), b=character(0), c=numeric(0))
setkey(x, a, b)
test(1381, x[J("foo", character(0)), nomatch=0L], x, warning="Item 2 has 0 rows but longest item has 1; filled with NA")
# Fix for #813 and #758
DT = data.table(x = 1:2)
test(1382.1, DT[c(FALSE, FALSE), list(x, 3:4)], data.table(x=integer(0), V2=integer(0)))
DT <- data.table(id = c("a", "a", "b", "b"), var = c(1.1, 2.5, 6.3, 4.5), key="id")
test(1382.2, DT["c", list(id, check = any(var > 3)), nomatch=0L], data.table(id=character(0), check=logical(0), key="id"))
test(1382.3, DT[c(FALSE), id], character(0))
DT <- DT[1:3]; setkey(DT, id)
test(1382.4, DT[c("c", "b"), list(id, check = any(var > 3)), nomatch=0L], data.table(id="b", check=TRUE, key="id"))
# Fix for #742 - allow.cartesian should be ignored if `i` has no duplicates.
DT <- data.table(id=rep(letters[1:2], 2), var = rnorm(4), key="id")
test(1383.1, DT[letters[1:3], list(var)], DT[1:5, list(var)])
# Fix for #800 - allow.cartesian should be ignored if jsub[1L] has `:=`.
DT=data.table(id=c(1,1), date=c(1992,1991), value=c(4.1,4.5), key="id")
test(1383.2, copy(DT)[DT, a:=1], DT[, a := 1])
# Somehow DT[col==max(col)] was never tested, broken by auto-indexing new in v1.9.4, #858
DT = data.table(a = c(1,1,1,2,2,2,3,3,3), b = rnorm(9))
test(1384, DT[a == max(a)], DT[7:9])
# Dups on RHS of == or %in%
DT = data.table(id = paste("id",1:5,sep=""))
id.sub = c("id1", "id2", "id3", "id3", "id4") # deliberate dup
test(1385.1, DT[id %in% id.sub], DT[1:4])
test(1385.2, DT[id == id.sub], DT[1:3])
# reserved class attributes conflict with auto index names, #
DT = data.table(class=c('a','b'), x=c(1,2))
test(1386, DT[class=='a'], DT[1])
# Fix for #774 - parsing a$b() in 'j'
DT = data.table(x=1:5, y=6:10)
ll = list(foo = function() 1L)
test(1387.1, copy(DT)[, z := ll$foo()], copy(DT)[, z:=1L])
test(1387.2, copy(DT)[, z := ll[[1L]]()], copy(DT)[, z:=1L])
# Fix for #811 - ITime formatted negative integers to the wrong result.
x = c(1L, -1L, -3700L)
class(x) = "ITime"
test(1388, as.character(x), c("00:00:01", "-00:00:01", "-01:01:40"))
# Fix for #880. Another eval(parse(.)) issue.
DT <- as.data.table(iris)
DT[, foo := "Species"]
test(1389, copy(DT)[,bar := eval(parse(text=foo[1]), envir=.SD)], copy(DT)[, bar := Species])
# Fix for foverlaps() floating point interval (double) types. Should increment them by machine tolerance, not by 1L
DT1 = data.table(start=c(0.88), end=c(0.88))
DT2 = data.table(start=c(0.26, 0.5, 0.55, 0.7), end=c(0.61, 0.88, 0.88-.Machine$double.eps^0.5, 0.89))
setkey(DT2)
test(1390.1, foverlaps(DT1, DT2, which=TRUE), data.table(xid=1L, yid=c(2L, 4L)))
DT1 = data.table(start=c(0.3,0.5), end=c(0.3,0.5))
DT2 = data.table(start=c(0.4), end=c(0.4))
setkey(DT2)
test(1390.2, foverlaps(DT1, DT2, which=TRUE), data.table(xid=1:2, yid=as.integer(c(NA, NA))))
tt = c( as.POSIXct('2011-10-11 07:49:36'), as.POSIXct('2011-10-11 07:49:37'))
DT1 = data.table(start=tt, end=tt)
DT2 = data.table(start=tt[1], end=tt[1])
setkey(DT2)
test(1390.3, foverlaps(DT1, DT2, which=TRUE), data.table(xid=1:2, yid=as.integer(c(1L, NA))))
tt = c( as.POSIXct('2011-10-11 07:49:36.3'), as.POSIXct('2011-10-11 07:49:37.4'), as.POSIXct('2011-10-11 07:49:37.5'))
DT1 = data.table(start=tt, end=tt)
DT2 = data.table(start=tt[2], end=tt[2])
setkey(DT2)
test(1390.4, foverlaps(DT1, DT2, which=TRUE), data.table(xid=1:3, yid=as.integer(c(NA, 1L, NA))))
tt = c( as.POSIXct('2011-10-11 07:49:36.0003'), as.POSIXct('2011-10-11 07:49:36.0199'), as.POSIXct('2011-10-11 07:49:36.0399'))
DT1 = data.table(start=tt, end=tt)
DT2 = data.table(start=tt[2], end=tt[2])
setkey(DT2)
test(1390.5, foverlaps(DT1, DT2, which=TRUE), data.table(xid=1:3, yid=as.integer(c(NA, 1, NA))))
# Fix for #891. 'subset' and duplicate names.
# duplicate column names rule - if column numbers are given, extract exactly those columns; if names are given, always extract the first matching column
DT = data.table(V1=1:5, V2=6:10, V3=11:15)
setnames(DT, c("V1", "V2", "V1"))
test(1391.1, subset(DT, select=c(3L,2L)), DT[, c(3L, 2L), with=FALSE])
test(1391.2, subset(DT, select=c("V2", "V1")), DT[, c("V2", "V1"), with=FALSE])
# Test faster version of na.omit() using is_na.
DT = data.table(x=sample(c(1:2, NA), 30, TRUE), y=sample(c(1:5, NA, NaN), 30, TRUE))
test(1392.1, na.omit(DT), DT[!is.na(x) & !is.na(y)])
# added 'invert=', a logical argument which, when TRUE, returns the rows that have any NAs instead.
test(1392.2, na.omit(DT, invert=TRUE), DT[is.na(x) | is.na(y)])
# Fix for #899. Mix of ordered and normal factors where normal factors in more than 1 data.table has identical levels.
DT1 = data.table(A = factor(INT(7,8,7,8,7)), B = factor(6:10), C = 0)
DT2 = data.table(D = ordered(1:5), A = factor(INT(1:2,1:2,1L)), C = 0)
DT3 = data.table(A = factor(INT(7:8)), C = 0)
ans = data.table(A=factor(INT(7,8,7,8,7,1,2,1,2,1,7,8), levels=c("7", "8", "1", "2")), B=factor(INT(6:10, rep(NA,7))), C=0, D=ordered(INT(rep(NA,5), 1:5, rep(NA,2))))
test(1393.1, rbindlist(list(DT1, DT2, DT3), fill = TRUE), ans)
# test for #591 (R-Forge #2491)
ans[, ID := rep(1:3, c(5,5,2))]
setcolorder(ans, c("ID", LETTERS[1:4]))
test(1393.2, rbindlist(list(DT1, DT2, DT3), fill = TRUE, idcol="ID"), ans)
# Tests for na.omit.data.table (faster version + with a 'cols=' new argument)
col = c(1:2, NA_integer_)
DT = data.table(a=sample(col, 20, TRUE), b=as.numeric(sample(col,20,TRUE)), c=as.logical(sample(col,20,TRUE)), d=as.character(sample(col,20,TRUE)))
# can't use complete.cases on bit64... will have to test integer64 separately.
# if (test_bit64) {
# DT[, e := as.integer64(sample(col,20,TRUE))]
# }
test_no = 1394
for (i in seq_along(DT)) {
combn(names(DT), i, function(cols) {
ans1 = na.omit(DT, cols=cols)
ans2 = DT[complete.cases(DT[, cols, with=FALSE])]
test_no <<- test_no+.001
test(test_no, ans1, ans2)
0L
})
}
# dropping secondary keys on update or delete
DT = data.table(a=1:3, b=4:6)
test(1396, DT[a==2, verbose=TRUE], DT[2], output="Creating new index 'a'")
test(1397, DT[b==6, verbose=TRUE], DT[3], output="Creating new index 'b'")
test(1398, DT[b==6, verbose=TRUE], DT[3], output="Optimized subsetting with index 'b'")
test(1399, indices(DT), c("a","b"))
test(1400, DT[2, a:=4L, verbose=TRUE], data.table(a=c(1L,4L,3L),b=4:6), output=".*Dropping index 'a' due to an update on a key column")
test(1401, indices(DT), "b")
test(1402, DT[,b:=NULL,verbose=TRUE], data.table(a=c(1L,4L,3L)), output=".*Dropping index 'b' due to an update on a key column")
test(1403, indices(DT), NULL)
DT = data.table(x=1:5)
test(1404, DT[, y := x <= 2L], data.table(x=1:5, y=c(TRUE,TRUE,FALSE,FALSE,FALSE)))
test(1405, DT[y == TRUE, .N, verbose=TRUE], 2L, output="Creating new index")
test(1406, DT[, y := x <= 3L, verbose=TRUE], data.table(x=1:5, y=c(TRUE,TRUE,TRUE,FALSE,FALSE)), output=".*Dropping index")
test(1407, DT[y == TRUE, .N], 3L)
DT = data.table(x=1:5, y=10:6)
test(1408, DT[x==3,verbose=TRUE], DT[3], output="Creating")
test(1409, indices(DT), "x")
set(DT,1:3,1L,-10L)
test(1410, indices(DT), NULL)
test(1411, DT[x==5], DT[5])
setorder(DT, y)
test(1412, indices(DT), NULL)
test(1413, DT[x==5], DT[1])
DT = data.table(foo=1:3, bar=4:6, baz=9:7)
setindex(DT,foo,bar,baz)
test(1414, indices(DT), c("foo__bar__baz"))
test(1415, DT[2,bar:=10L,verbose=TRUE], output=".*Shortening index 'foo__bar__baz' to 'foo' due to an update on a key column") # test middle
test(1416, indices(DT), 'foo')
setindex(DT,foo,bar,baz)
test(1417, DT[2,baz:=10L,verbose=TRUE], output=".*Shortening index 'foo__bar__baz' to 'foo__bar' due to an update on a key column") # test last
setindex(DT,bar,baz)
test(1418.1, DT[2,c("foo","bar"):=10L,verbose=TRUE], output=".*Dropping index.* due to an update on a key column") # test 2nd to 1st
setindex(DT,bar,baz)
test(1418.2, DT[2,c("foo","baz"):=10L,verbose=TRUE], output=".*Dropping index 'bar__baz' due to an update on a key column") # test 2nd to 2nd
## testing key retainment on assign (#2372)
DT <- data.table(x1 = c(1,1,1,1,1,2,2,2,2,2),
x2 = c(1,1,2,2,2,1,1,2,2,2),
x3 = c(1,2,1,1,2,1,1,1,1,2),
y = rnorm(10),
key = c("x1", "x2", "x3"))
thisDT <- copy(DT)[2, x1 := 3]
test(1419.01, key(thisDT), NULL)
thisDT <- copy(DT)[2, x2 := 3]
test(1419.02, key(thisDT), "x1")
test(1419.03, forderv(thisDT, c("x1")), integer(0))
thisDT <- copy(DT)[2, x2 := 3]
test(1419.04, key(thisDT), "x1")
test(1419.05, forderv(thisDT, c("x1")), integer(0))
thisDT <- copy(DT)[3, x3 := 3]
test(1419.06, key(thisDT), c("x1", "x2"))
test(1419.07, forderv(thisDT, c("x1", "x2")), integer(0))
thisDT <- copy(DT)[3, c("x1", "x3") := .(3,3)]
test(1419.08,key(thisDT), NULL)
thisDT <- copy(DT)[3, c("x2", "x3") := .(3,3)]
test(1419.09, key(thisDT), "x1")
# skip test numbers ending in 0 because if 1419.10 failed it would print as 1419.1, indistinguishable from test 1419.1
test(1419.10, forderv(thisDT, c("x1")), integer(0))
setkey(DT, NULL)
thisDT <- copy(DT)[3, x3 := 3]
test(1419.11, key(thisDT), NULL)
## same tests for empty DT
## forderv tests can be skipped for empty DT
DT <- DT[0]
thisDT <- copy(DT)[, x3 := 3]
test(1419.12, key(thisDT), NULL)
setkeyv(DT, c("x1", "x2", "x3"))
thisDT <- copy(DT)[, x1 := 3]
test(1419.13, key(thisDT), NULL)
thisDT <- copy(DT)[, x2 := 3]
test(1419.14, key(thisDT), "x1")
thisDT <- copy(DT)[, x2 := 3]
test(1419.15, key(thisDT), "x1")
thisDT <- copy(DT)[, x3 := 3]
test(1419.16, key(thisDT), c("x1", "x2"))
thisDT <- copy(DT)[, c("x1", "x3") := .(3,3)]
test(1419.17, key(thisDT), NULL)
thisDT <- copy(DT)[, c("x2", "x3") := .(3,3)]
test(1419.18, key(thisDT), "x1")
## testing secondary index retainment on assign (#2372)
allIndicesValid <- function(DT){
## checks that the ordering of all indices is correct
for(idx in seq_along(indices(DT))){
index <- attr(attr(DT, "index"), paste0("__", indices(DT)[idx], collapse = ""))
if(!length(index)) index <- seq_len(nrow(DT))
if(length(forderv(DT[index], indices(DT, vectors = TRUE)[[idx]]))){
## index is not properly ordered
return(FALSE)
}
if(any(duplicated(names(attributes(attr(DT, "index")))))){
## index names are not unique
return(FALSE)
}
}
return(TRUE)
}
## on data.table where indices are not integer(0)
DT <- data.table(a = c(1,1,1,2,1,2,2,2,2,2),
aaa = c(2,1,2,2,2,1,1,2,2,2),
b = c(1,2,1,1,2,1,1,1,1,2),
ab = rnorm(10))
test(1419.21, indices(copy(DT)[1, a:=1]), NULL)
setindex(DT, a)
setindex(DT, a, aaa)
setindex(DT, ab, aaa)
setindex(DT)
test(1419.22, allIndicesValid(DT), TRUE)
thisDT <- copy(DT)[1, a:=1][, aaa := 1][, ab := 1]
test(1419.23, indices(thisDT), NULL)
test(1419.24, allIndicesValid(thisDT), TRUE)
thisDT <- copy(DT)[, b := 2]
test(1419.25, indices(thisDT), c("a", "a__aaa", "ab__aaa"))
test(1419.26, allIndicesValid(thisDT), TRUE)
thisDT <- copy(DT)[, ab := 2]
test(1419.27, indices(thisDT), c("a", "a__aaa"))
test(1419.28, allIndicesValid(thisDT), TRUE)
thisDT <- copy(DT)[, aaa := 2]
test(1419.29, indices(thisDT), c("a"))
test(1419.31, allIndicesValid(thisDT), TRUE)
thisDT <- copy(DT)[, c("aaa", "b") := 2]
test(1419.32, indices(thisDT), c("a"))
test(1419.33, allIndicesValid(thisDT), TRUE)
## on data.table where indices are integer(0)
DT <- data.table(a = c(1,1,1,1,1,2,2,2,2,2),
aaa = c(1,1,2,2,2,1,1,2,2,2),
b = c(1,2,1,2,3,1,2,1,2,3),
ab = 1:10)
test(1419.34, indices(copy(DT)[1, a:=1]), NULL)
setindex(DT, a)
setindex(DT, a, aaa)
setindex(DT, ab, aaa)
setindex(DT)
test(1419.35, allIndicesValid(DT), TRUE)
thisDT <- copy(DT)[1, a:=1][, aaa := 1][, ab := 1]
test(1419.36, indices(thisDT), NULL)
test(1419.37, allIndicesValid(thisDT), TRUE)
thisDT <- copy(DT)[, b := 2]
test(1419.38, indices(thisDT), c("a", "a__aaa", "ab__aaa"))
test(1419.39, allIndicesValid(thisDT), TRUE)
thisDT <- copy(DT)[, ab := 2]
test(1419.41, indices(thisDT), c("a", "a__aaa", "a__aaa__b"))
test(1419.42, allIndicesValid(thisDT), TRUE)
thisDT <- copy(DT)[, aaa := 2]
test(1419.43, indices(thisDT), c("a", "ab"))
test(1419.44, allIndicesValid(thisDT), TRUE)
thisDT <- copy(DT)[, c("aaa", "b") := 2]
test(1419.45, indices(thisDT), c("a", "ab"))
test(1419.46, allIndicesValid(thisDT), TRUE)
## on empty DT
DT <- DT[0]
setindex(DT, NULL)
test(1419.47, indices(copy(DT)[, a:=1]), NULL)
setindex(DT, a)
setindex(DT, a, aaa)
setindex(DT, ab, aaa)
setindex(DT)
test(1419.48, allIndicesValid(DT), TRUE)
thisDT <- copy(DT)[, a:=1][, aaa := 1][, ab := 1]
test(1419.49, indices(thisDT), NULL)
test(1419.51, allIndicesValid(thisDT), TRUE)
thisDT <- copy(DT)[, b := 2]
test(1419.52, indices(thisDT), c("a", "a__aaa", "ab__aaa"))
test(1419.53, allIndicesValid(thisDT), TRUE)
thisDT <- copy(DT)[, ab := 2]
test(1419.54, indices(thisDT), c("a", "a__aaa", "a__aaa__b"))
test(1419.55, allIndicesValid(thisDT), TRUE)
thisDT <- copy(DT)[, aaa := 2]
test(1419.56, indices(thisDT), c("a", "ab"))
test(1419.57, allIndicesValid(thisDT), TRUE)
thisDT <- copy(DT)[, c("aaa", "b") := 2]
test(1419.58, indices(thisDT), c("a", "ab"))
test(1419.59, allIndicesValid(thisDT), TRUE)
## setkey on same col as existing index, #2889
DT <- data.table(a =c(4,1,3,9,2,1,8,7,6,5),
aaa = c(1,1,2,2,2,1,1,2,2,2))
setindex(DT, a)
test(1419.60, allIndicesValid(DT), TRUE)
setindex(DT, NULL)
setkey(DT, a)
test(1419.61, DT$a, c(1,1,2,3,4,5,6,7,8,9))
setkey(DT, NULL)
setindex(DT, a)
test(1419.62, setkey(DT, a, verbose=TRUE), data.table(a=c(1,1:9), aaa=c(1,1,2,2,1,2,2,2,1,2), key="a"),
output="setkey on columns [a] using existing index 'a'") # checks also that the prior index a is dropped (because y is keyed with no index)
# setkey picks correct index of multiple indexes (e.g. exact=TRUE is used in internals)
DT = data.table(a = c(3,3,4,4,5,6,1,1,7,2),
aaa = c(1,1,2,2,2,1,1,2,2,2),
bbb = c(1,1,2,0,1,1,1,0,1,1))
setkey(DT, a)
test(1419.63, DT$a, c(1,1,2,3,3,4,4,5,6,7))
setkey(DT, NULL)
test(1419.64, setkey(DT, a, verbose=TRUE), output="forder took")
setkey(DT, NULL)
setindex(DT, aaa, a)
setindex(DT, aaa) # this 'aaa' index, not the previous 'aaa__a', should be used by setkey(DT,aaa); i.e. ensure no partial matching
test(1419.65, allIndicesValid(DT), TRUE)
test(1419.66, setkey(DT, aaa, verbose=TRUE), data.table(a=c(1,3,3,6,1,2,4,4,5,7), aaa=c(1,1,1,1,2,2,2,2,2,2), bbb=c(1,1,1,1,0,1,2,0,1,1), key="aaa"),
output="setkey on columns [aaa] using existing index 'aaa'") # checks that all indexes are dropped (aaa_a too)
# setnames updates secondary key
DT = data.table(a=1:5,b=10:6)
setindex(DT,b)
test(1420, indices(DT), "b")
setnames(DT,"b","foo")
test(1421, indices(DT), "foo")
test(1422, DT[foo==9, verbose=TRUE], DT[2], output="Optimized subsetting with index 'foo'")
setindex(DT,a,foo)
test(1423, indices(DT), c("foo","a__foo")) # tests as well that order of attributes is retained although we don't use that property currently.
test(1424, indices(setnames(DT,"foo","bar")), c("bar","a__bar"))
test(1425, indices(setnames(DT,"a","baz")), c("bar","baz__bar"))
test(1426, DT[baz==4L, verbose=TRUE], output="Creating new index 'baz'")
test(1427, indices(DT), c("bar","baz__bar", "baz"))
test(1428, DT[bar==9L, verbose=TRUE], output="Optimized subsetting with index 'bar'")
test(1429, indices(setnames(DT,"bar","a")), c("baz", "a", "baz__a"))
# Finalised == and %in% optimization in i
DT = data.table(a=1:3,b=c(0,2,3,0,0,2))
test(1430, DT[a==1:2], error="RHS of == is length 2 which is not 1 or nrow (6). For robustness, no recycling is allowed (other than of length 1 RHS). Consider %in% instead.")
test(1431, DT[a %in% 1:2], DT[c(1,2,4,5)])
test(1432, DT[a==b], DT[2:3])
test(1433, DT[a %in% b], DT[c(2,3,5,6)])
test(1434, DT[a==b+1], DT[c(1,4,6)])
test(1435, DT[b==max(a)], DT[3])
test(1436, DT[a==2,verbose=TRUE], DT[c(2,5)], output="Coercing double column i.a (which contains no fractions) to type integer to match type of x.a")
DT[,a:=factor(letters[a])]
test(1437.01, DT[a==factor("b"),verbose=TRUE], DT[c(2,5)], output="Creating new index 'a'")
## test that the lookup env for the RHS is correct. In the internal env, notjoin is FALSE in this case.
notjoin <- TRUE
DT <- data.table(x = TRUE)
test(1437.02, DT[x == notjoin], DT)
## column names 'sorted' and 'unique' could cause problems because of CJ with do.call in .prepareFastSubset
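## (CJ's own arguments are named 'sorted' and 'unique' -- its signature is CJ(..., sorted=TRUE,
## unique=FALSE) -- so a naive do.call(CJ, list(sorted=TRUE, unique=FALSE)) would match those
## values to CJ's arguments instead of creating columns with those names; hence this test.)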
DT <- data.table(sorted = TRUE, unique = FALSE)
test(1437.03, DT[sorted == TRUE & unique == FALSE], DT)
## test no reordering in groups
DT <- data.table(x = c(1,1,1,2), y = c(3,2,1,1))
setindex(DT, x, y)
test(1437.04, DT[x==1], setindex(data.table(x = c(1,1,1), y = c(3,2,1)), x,y))
## test that key order makes no difference
DT <- data.table(x = c(2,1,1,1), y = c(3,2,1,1), z = c(1,1,2,2))
test(1437.05, setkey(setkey(DT, x,y,z)[x==1&y==1&z==2], NULL),setkey(setkey(DT, y,x,z)[x==1&y==1&z==2], NULL))
setkey(DT, NULL)
setorder(DT, -x, -y, z)
DT2 <- copy(DT)
setindex(DT, x,y,z)
setindex(DT2, z,y,x)
test(1437.06, setindex(DT[x==1&y==1&z==2], NULL),setindex(DT2[x==1&y==1&z==2], NULL))
## test that query order makes no difference
set.seed(1)
DT <- data.table(x = c(1,1,1,2,2,2), y = c(1,1,2,2,3,3), z = rnorm(6))
test(1437.07, copy(DT)[x==2 & y==3], copy(DT)[y==3 & x==2])
## test that optimization really takes place for the supported operators, also when connected with &
test(1437.08, DT[x==2, verbose = TRUE], output = "Optimized subsetting")
test(1437.09, DT[x %in% c(2,3), verbose = TRUE], output = "Optimized subsetting")
DT[, a:= c("A", "Q", "W", "C", "X", "Q")]
test(1437.10, DT[a %chin% c("A", "B"), verbose = TRUE], output = "Optimized subsetting")
test(1437.11, DT[a %chin% c("A", "B") & x == 3 & y %in% c(1,2), verbose = TRUE], output = "Optimized subsetting")
## multiple selections on the same column are not optimized and yield correct result
test(1437.12, DT[a %chin% c("A", "B") & a == "A", verbose = TRUE], output = "^ x y z a\n1: 1 1 -0.6264538 A")
test(1437.13, DT[a %chin% c("A", "B") & a == "C"], DT[0])
## queries with 'or' connection are not optimized and yield the correct result.
test(1437.14, DT[a %chin% c("A", "B") | x == 2, verbose = TRUE], output = "^ x y z a\n1: 1 1 -0.6264538 A\n2: 2 2 1.5952808 C\n3: 2 3 0.3295078 X\n4: 2 3 -0.8204684 Q")
test(1437.15, DT[a %chin% c("A", "B") | x == 2], DT[c(1, 4, 5, 6)])
## notjoin queries combined with other conditions are not optimized and yield the correct result.
test(1437.16, DT[!a %chin% c("A", "B") & x == 2, verbose = TRUE], output = "^ x y z a\n1: 2 2 1.5952808 C\n2: 2 3 0.3295078 X\n3: 2 3 -0.8204684 Q")
test(1437.17, DT[!a %chin% c("A", "B") & x == 2], DT[c(4, 5, 6)])
## queries with j are optimized (Correct results are tested extensively below)
test(1437.18, DT[x == 2, .(test = x+y), verbose = TRUE], output = "Optimized subsetting")
test(1437.19, DT[x == 2, test := x+y, verbose = TRUE], output = "Optimized subsetting")
## optimize option level 3 is required to get optimized subsetting
options(datatable.optimize = 2L)
test(1437.21, DT[x == 2, verbose = TRUE], output = "^ x y")
options(datatable.optimize = Inf)
test(1437.22, DT[x == 2, verbose = TRUE], output = "Optimized subsetting")
## NaN on the right hand side is treated correctly. NA on the right hand side does not reach .prepareFastSubset, so it is not tested here
DT <- data.table(x = c(1L:10L, NA_integer_, NA_integer_), y = c(1:10, NA_real_, NaN))
test(1437.23, DT[y == NaN], DT[0])
test(1437.24, DT[y == NA], DT[0])
test(1437.25, DT[x == 5 & y == NaN], DT[0])
## notjoin treats NA's correctly
test(1437.26, DT[!x == 3], DT[-c(3,11,12)])
## column comparisons are not affected
DT <- data.table(x = c(1:10, 1:10, 1:10), y = c(1:20, -1:-10))
test(1437.27, DT[x == y], DT[1:10])
## subsets with logical vectors are optimized correctly.
DT <- data.table(x = c(TRUE, TRUE, TRUE, FALSE), y = c(FALSE, TRUE, FALSE, TRUE))
test(1437.28, DT[x & y, verbose = TRUE], output = "Optimized subsetting")
test(1437.29, DT[x & y], DT[2])
## subsets where only columns are given (no comparison operator) but the columns are not logical are not optimized.
DT <- data.table(x = c(1, 0, 2, 4), y = c(0, 2, 1, 3))
test(1437.31, DT[x & y, verbose = TRUE], output = "^ x y")
test(1437.32, DT[x & y], DT[3:4])
## test that optimization is switched off if CJ would result in more than 1e4 rows of i (#2635)
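## A quick illustration of the i-table sizes involved (the optimization builds the candidate i via
## a cross-join, CJ, of the RHS values, as the .prepareFastSubset comments above note; the local()
## below is illustrative only and is not used by any test):
invisible(local({
  nrow(CJ(x=0:99, y=0:98))    # 9900 rows  -> below the 1e4 limit, still optimized
  nrow(CJ(x=0:100, y=0:101))  # 10302 rows -> exceeds 1e4, optimization disabled
}))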
# single column query with more than 1e4 rows is optimized
test(1437.33, DT[x %in% 0:1e5, verbose = TRUE], output = "Optimized subsetting")
test(1437.34, DT[x %in% 0:1e5], DT)
# multi column query with less than 1e4 rows in i is optimized
test(1437.35, DT[x %in% 0:99 & y %in% 0:98, verbose = TRUE], output = "Optimized subsetting")
test(1437.36, DT[x %in% 0:99 & y %in% 0:98], DT)
# multi column query with more than 1e4 rows in i is not optimized
test(1437.37, DT[x %in% 0:100 & y %in% 0:101, verbose = TRUE], output = "Subsetting optimization disabled because the cross-product of RHS values exceeds 1e4, causing memory problems.")
test(1437.38, DT[x %in% 0:100 & y %in% 0:101], DT)
## do extensive tests of optimized vs non-optimized queries for identical results.
## very much inspired by the tests for non-equi joins (1641ff)
set.seed(13545)
n <- 1e3 ## rows of the data.table. I checked that with these rows and the seed, all queries (except for == NA) have at least one match
## create test DT
DT <- data.table(intCol = sample(c(1:3, NA), n, replace = TRUE),
doubleCol = sample(c(1.86, 1000.345, 2.346, NA, NaN), n, replace = TRUE),
boolCol = sample(c(TRUE, FALSE, NA), n, replace = TRUE),
charCol = sample(c(LETTERS[1:3], NA), n, replace = TRUE),
groupCol = sample(c("a", "b", "c"), n, replace = TRUE),
sortedGroupCol = c(rep(1L, floor(n/3)), rep(2L, floor(n/3)), rep(3L, ceiling(n/3)))) ## sorted grouping column is important to test #2713
if (test_bit64) DT[, int64Col := as.integer64(sample(1:3, n, replace = TRUE))]
## get list with unique values excluding NA
vals <- lapply(DT, function(x) {out <- unique(x); out[!is.na(out)]})
## define possible queries for each column.
queries <- list(intCol = c(paste0("intCol == ", sample(vals$intCol, 1)),
paste0("intCol %in% c(", paste0(c(NA,sample(vals$intCol, 3, replace = TRUE)), collapse = ","), ")")),
doubleCol = c(paste0("doubleCol == ", sample(vals$doubleCol, 1)),
paste0("doubleCol %in% c(NA, NaN, ", paste0(sample(vals$doubleCol, 3, replace = TRUE), collapse = ","), ")")),
boolCol = c(paste0("boolCol == ", sample(vals$boolCol, 1))), ## %in% query makes no sense for bools, therefore not tested
charCol = c(paste0("charCol == '", sample(vals$charCol, 1), "'"),
paste0("charCol %in% c(NA, ", paste0("'", sample(vals$charCol, 3, replace = TRUE), "'", collapse = ","), ")"),
paste0("charCol %chin% c(NA, ", paste0("'", sample(vals$charCol, 3, replace = TRUE), "'", collapse = ","), ")"))
)
if (test_bit64)
queries$int64Col = c(paste0("int64Col == ", sample(vals$int64Col, 1)),
paste0("int64Col %in% c(NA_integer64_, as.integer64(", paste0(sample(vals$int64Col, 3, replace = TRUE), collapse = ","), "))"))
## create all combinations of up to three queries, connected by "&".
## Includes all combinations, also with a subset of columns, but no permutations
all <- data.table(intCol = character(0),
doubleCol = character(0),
boolCol = character(0),
charCol = character(0),
int64Col = character(0))
for(thisLength in 1:3){
## get all query combinations with up to three columns
combs <- as.list(as.data.table(combn(length(queries),thisLength)))
for(comb in combs) all <- rbind(all, do.call(CJ, queries[comb]), fill = TRUE)
}
all[is.na(all)] <- "missing"
## construct the query string in i
all[, query := paste(intCol, doubleCol, boolCol, charCol, int64Col, sep = "&")]
all[, query := gsub("&missing$", "", gsub("missing&", "", query))]
## define exemplary test queries for j
jQueries <- c(".(test = intCol + doubleCol, test2 = paste0(boolCol, charCol))",
"c('test1', 'test2') := list(pmax(intCol, doubleCol), !boolCol)")
## define example 'by' values
bys <- c("groupCol", "sortedGroupCol", character(0))
## test each query string
test_no <- 1438.0000
if (.Machine$sizeof.pointer>4) { # temporarily disabled for 32bit, #2767
for(t in seq_len(nrow(all))){
## test the query with missing j
thisQuery <- all$query[t]
options("datatable.optimize" = 3L)
ansOpt <- DT[eval(parse(text = thisQuery))]
options("datatable.optimize" = 2L)
ansRef <- DT[eval(parse(text = thisQuery))]
test_no <- test_no + 0.0001
test(test_no, ansOpt, ansRef)
## repeat the test with 'which = TRUE'
options("datatable.optimize" = 3L)
ansOpt <- DT[eval(parse(text = thisQuery)), which = TRUE]
options("datatable.optimize" = 2L)
ansRef <- DT[eval(parse(text = thisQuery)), which = TRUE]
test_no <- test_no + 0.0001
test(test_no, ansOpt, ansRef)
## repeat the test with the j queries
for(thisJquery in jQueries) {
## do it with and without existing "by"
for(thisBy in bys){
options("datatable.optimize" = 3L)
ansOpt <- DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy]
options("datatable.optimize" = 2L)
ansRef <- DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy]
test_no <- test_no + 0.0001
test(test_no, ansOpt, ansRef)
}
}
}
}
options(datatable.optimize = Inf)
# fread dec=',' e.g. France
test(1439, fread("A;B\n1;2,34\n", dec="12"), error="nchar(dec) == 1L is not TRUE")
test(1440, fread("A;B\n8;2,34\n", dec="1"), data.table(A=8L, B="2,34"))
test(1441, fread("A;B\n8;2,34\n", dec=","), data.table(A=8L, B=2.34))
test(1442, fread("A;B\n1;2,34\n", sep=".", dec="."), error="sep == dec ('.') is not allowed")
test(1443, fread("A;B\n1;2,34\n", dec=",", sep=","), error="sep == dec (',') is not allowed")
# sep=".", issue #502
input = paste( paste("192.168.4.", 1:10, sep=""), collapse="\n")
test(1444.1, fread(input, sep=".", dec="*"), ans<-data.table(V1=192L,V2=168L,V3=4L,V4=1:10))
test(1444.2, fread(input, sep=".", dec=","), ans)
test(1444.3, fread(paste(paste("192. 168. 4. ", 1:10, sep = ""), collapse="\n"), sep=".", dec=","), ans)
test(1444.4, fread(paste(paste("Hz.BB.GHG.", 1:10, sep = ""), collapse="\n"), sep=".", dec=","),
data.table(V1="Hz",V2="BB",V3="GHG",V4=1:10))
# doubled quote inside a quoted field followed by an embedded newline
test(1445, fread(testDir("doublequote_newline.csv"))[7:10], data.table(A=c(1L,1L,2L,1L), B=c("a","embedded \"\"field\"\"\nwith some embedded new\nlines as well","not this one","a")))
# the example from #489 directly :
test(1446, fread('A,B,C\n233,"AN ""EMBEDDED"" QUOTE FIELD",morechars\n'), data.table(A=233L, B='AN ""EMBEDDED"" QUOTE FIELD', C='morechars'))
# # unescaped quoted subregion followed by newline
# # this test is commented out for now as the logic now redirects to normal checks
# test(1447, fread('A,B,C\n233,"an unescaped "embedded"
# region followed by newline",morechars\n'))
# when detecting types ...
test(1448.1, fread('A,B\n1,"embedded""\nquote"\n2,should be ok\n'),
data.table(A=1:2,B=c('embedded""\nquote','should be ok')))
test(1448.2, fread('A,B\n1,"embedded""
quote"\n2,should be ok\n'),
data.table(A=1:2,B=c('embedded""
quote','should be ok')))
if (test_bit64 && test_R.utils) {
# quoted multiline (scrambled data thanks to #810)
DT = data.table(
GPMLHTLN = as.integer64(c("3308386085360", "3440245203140", "1305220146734")),
BLYBZ = c(0L,4L,6L),
ZBJBLOAJAQI = c("LHCYS AYE ZLEMYA IFU HEI JG FEYE", "", ""),
JKCRUUBAVQ = c("", ".\\YAPCNXJ\\004570_850034_757\\VWBZSS_848482_600874_487_PEKT-6-KQTVIL-7_30\\IRVQT\\HUZWLBSJYHZ\\XFWPXQ-WSPJHC-00-0770000855383.KKZ", "")
)
test(1449.1, fread(testDir("quoted_multiline.csv.bz2"))[c(1L, 43:44), c(1L, 22:24)], DT)
test(1449.2, fread(testDir("quoted_multiline.csv.bz2"), integer64='character', select = 'GPMLHTLN')[c(1L, 43:44)][[1L]], DT[ , as.character(GPMLHTLN)])
}
# Fix for #927
DT = data.table(x=1L, y=2L)
test(1450, DT[, set(.SD, j="x", value=10L)], error=".SD is locked. Updating .SD by reference using := or set")
# Tests for shallow copy taking cols argument - not exported yet.
DT = setDT(lapply(1:5, sample, 10, TRUE))
ans1 = sapply(DT, address)
fans2 = function(DT, cols=NULL) sapply(shallow(DT, cols), address)
test(1451.1, ans1, fans2(DT)) # make sure default/old functionality is intact
test(1451.2, ans1[3:4], fans2(DT, 3:4)) # using integer column numbers
test(1451.3, ans1[c(5,2)], fans2(DT, c(5,2))) # using numeric column numbers
test(1451.4, ans1[c(4,2,4)], fans2(DT,c(4,2,4))) # using duplicate column numbers
test(1451.5, ans1[3:2], fans2(DT, c("V3", "V2"))) # using column names
test(1451.6, ans1[c(3,3)], fans2(DT, c("V3", "V3"))) # using duplicate column names
test(1451.7, shallow(DT, integer(0)), null.data.table()) # length-0 input works as intended as well.
test(1451.8, shallow(DT, character(0)), null.data.table()) # length-0 input works as intended as well.
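# Illustrative sketch (not a numbered test): shallow() copies only the outer list of column
# pointers, so the column vectors themselves are shared (same address) while the data.table
# object is new. Local illus_* names are ours for illustration and removed afterwards.
illus_sh_dt = data.table(a = 1:3, b = 4:6)
illus_sh = shallow(illus_sh_dt, "a")
stopifnot(address(illus_sh$a) == address(illus_sh_dt$a),    # column vector is shared
          address(illus_sh) != address(illus_sh_dt))        # but the outer object is not
rm(illus_sh_dt, illus_sh)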
test(1452, fread("notexist.csv"), error="File 'notexist.csv' does not exist.*getwd")
# Test for #802
if (test_R.utils) test(1453, fread(testDir("fread_line_error.csv.bz2"))[c(1,.N), c("V1","V24")],
data.table(V1=INT(3,32), V24=c(".Q8_2_0W_8_1_7_L-4-U-5_1YSV-S-3-5.X",".U5_5_8H_7_6_0_U-5-J-7_2GNY-J-3-5.X")),
warning=c("resolved improper quoting", "Stopped.*line 12. Expected 24 fields but found 47.*First discarded non-empty line: <<31,3-0-7 4:1:7.5 HVV,"))
# no-sep-found => sep="\n", use case for this in #738
test(1454.1, fread('"Foo"`"Bar"\n5`2\n',sep="`"), data.table(Foo=5L,Bar=2L))
test(1454.2, fread('"Foo"\n5\n',sep="`"), data.table(Foo=5L))
# Fix for #958 - Don't create secondary keys on .SD
DT <- data.table(a=c(1, 1, 1, 0, 0), b=c("A", "B", "A1", "A", "B"))
test(1455, DT[, nrow(.SD[b == 'B']), by=.(a)], data.table(a=c(1,0), V1=1L))
# chmatchdup ...
x1 = c("b", "a", "d", "a", "c", "a")
x2 = c("a", "a", "a")
x3 = c("d", "a", "a", "d", "a")
table = rep(letters[1:3], each=2)
test(1456.1, chmatchdup(x1, table), as.integer(c(3,1,NA,2,5,NA)))
test(1456.2, chmatchdup(x2, table), as.integer(c(1,2,NA)))
test(1456.3, chmatchdup(x3, table), as.integer(c(NA,1,2,NA,NA)))
test(1457.1, chmatchdup(c("x","x","x","x"), c("x","y","x","x","y","z")), INT(1,3,4,NA))
test(1457.2, base::pmatch(c("x","x","x","x"), c("x","y","x","x","y","z")), INT(1,3,4,NA))
test(1457.3, chmatchdup(c("x","x"), c("x","y")), INT(1,NA))
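# Illustrative sketch (not a numbered test): chmatchdup() behaves like chmatch()/match() except
# that duplicates in x consume successive duplicates in table; once an occurrence in table has
# been matched it is not reused.
stopifnot(identical(chmatchdup(c("a","a","a"), c("a","b","a")), INT(1,3,NA)))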
# Add tests for which_
x = sample(c(-5:5, NA), 25, TRUE)
test(1458.1, which(x > 0), which_(x > 0)) # default is TRUE
test(1458.2, which(x > 0), which_(x > 0, TRUE)) # test explicitly
test(1458.3, which(!x > 0), which_(x > 0, FALSE))
# Fix for #982. Testing subsetDT on complex/raw vectors, and added tests for other types.
DT = data.table(a=c(1:3,NA_integer_), b=c(1,2,3,NA), c=as.complex(c(1:3,NA)), d=as.raw(1:4),
e=as.list(1:4), f=c(FALSE,FALSE,TRUE,NA), g=c("a", "b", "c", NA_character_))
test(1459.01, .Call("CsubsetDT", DT, which(DT$a > 2), seq_along(DT)), setDT(as.data.frame(DT)[3, , drop=FALSE]))
test(1459.02, .Call("CsubsetDT", DT, which(DT$b > 2), seq_along(DT)), setDT(as.data.frame(DT)[3, , drop=FALSE]))
test(1459.03, .Call("CsubsetDT", DT, which(Re(DT$c) > 2), seq_along(DT)), setDT(as.data.frame(DT)[3, , drop=FALSE]))
test(1459.04, .Call("CsubsetDT", DT, which(DT$d > 2), seq_along(DT)), setDT(as.data.frame(DT)[3:4, , drop=FALSE]))
test(1459.05, .Call("CsubsetDT", DT, which(DT$f), seq_along(DT)), setDT(as.data.frame(DT)[3, , drop=FALSE]))
test(1459.06, .Call("CsubsetDT", DT, which(DT$g == "c"), seq_along(DT)), setDT(as.data.frame(DT)[3, , drop=FALSE]))
test(1459.07, .Call("CsubsetDT", DT, which(DT$a > 2 | is.na(DT$a)), seq_along(DT)), setDT(as.data.frame(DT)[3:4,]))
test(1459.08, .Call("CsubsetDT", DT, which(DT$b > 2 | is.na(DT$b)), seq_along(DT)), setDT(as.data.frame(DT)[3:4,]))
test(1459.09, .Call("CsubsetDT", DT, which(Re(DT$c) > 2 | is.na(DT$c)), seq_along(DT)), setDT(as.data.frame(DT)[3:4,]))
test(1459.10, .Call("CsubsetDT", DT, which(DT$f | is.na(DT$f)), seq_along(DT)), setDT(as.data.frame(DT)[3:4,]))
test(1459.11, .Call("CsubsetDT", DT, which(DT$g == "c" | is.na(DT$g)), seq_along(DT)), setDT(as.data.frame(DT)[3:4,]))
test(1459.12, .Call("CsubsetDT", DT, 5L, seq_along(DT)), setDT(as.data.frame(DT)[5,]))
# Test for na.omit with list, raw and complex types
DT = data.table(x=c(1L,1L,NA), y=c(NA, NA, 1), z=as.raw(1:3), w=list(1,NA,2), v=c(1+5i, NA, NA))
test(1460.1, na.omit(DT, cols="w"), DT)
test(1460.2, na.omit(DT, cols="v"), DT[1])
test(1460.3, na.omit(DT, cols=c("v", "y")), DT[0])
test(1460.4, na.omit(DT, cols=c("z", "v")), DT[1])
test(1460.5, na.omit(DT, cols=c("w", "v")), DT[1])
# Fix for #985
DT = data.table(x=c("a", "a", "b", "b"), v1=sample(4), v2=sample(4))
test(1461.1, DT[, c(lapply(.SD, mean), lapply(.SD, sd)), by=x],
DT[, c(lapply(.SD, function(x) mean(x)), lapply(.SD, function(x) sd(x))), by = x])
# Tests for #994
DT = data.table(x=c("a", "a", "b", "b"), v1=sample(4), v2=sample(4))
cols = c("v1", "v2")
test(1462.1, DT[, mget(cols, as.environment(-1))], DT[, cols, with=FALSE]) # as.environment needed for testing on pre-R3.0.0 which we don't want to depend on yet
test(1462.2, DT[, mget(cols[1], as.environment(-1))], DT[, cols[1], with=FALSE])
test(1462.3, DT[, sum(unlist(mget(cols, as.environment(-1)))), by=x], DT[, sum(unlist(.SD)), by=x, .SDcols=cols])
# test for 'shift'
x=1:5
y=factor(x)
test(1463.01, shift(x,1L), as.integer(c(NA, 1:4)))
test(1463.02, shift(x,1:2), list(as.integer(c(NA, 1:4)), as.integer(c(NA, NA, 1:3))))
test(1463.03, shift(x,1L, 0L), as.integer(c(0L, 1:4)))
test(1463.04, shift(x,1L, type="lead"), as.integer(c(2:5, NA)))
test(1463.05, shift(x,1:2, type="lead"), list(as.integer(c(2:5, NA)), as.integer(c(3:5, NA, NA))))
test(1463.06, shift(x,1L, 0L, type="lead"), as.integer(c(2:5, 0L)))
test(1463.07, shift(y,1L), factor(c(NA,1:4), levels=1:5))
test(1463.08, shift(y,1L, type="lead"), factor(c(2:5, NA), levels=1:5))
x=as.numeric(x)
test(1463.09, shift(x,1L), as.numeric(c(NA, 1:4)))
test(1463.10, shift(x,1:2), list(as.numeric(c(NA, 1:4)), as.numeric(c(NA, NA, 1:3))))
test(1463.11, shift(x,1L, 0L), as.numeric(c(0L, 1:4)))
test(1463.12, shift(x,1L, type="lead"), as.numeric(c(2:5, NA)))
test(1463.13, shift(x,1:2, type="lead"), list(as.numeric(c(2:5, NA)), as.numeric(c(3:5, NA, NA))))
test(1463.14, shift(x,1L, 0L, type="lead"), as.numeric(c(2:5, 0L)))
if (test_bit64) {
x=as.integer64(x)
test(1463.15, shift(x,1L), as.integer64(c(NA, 1:4)))
test(1463.16, shift(x,1:2), list(as.integer64(c(NA, 1:4)), as.integer64(c(NA, NA, 1:3))))
test(1463.17, shift(x,1L, 0L), as.integer64(c(0L, 1:4)))
test(1463.18, shift(x,1L, type="lead"), as.integer64(c(2:5, NA)))
test(1463.19, shift(x,1:2, type="lead"), list(as.integer64(c(2:5, NA)), as.integer64(c(3:5, NA, NA))))
test(1463.20, shift(x,1L, 0L, type="lead"), as.integer64(c(2:5, 0L)))
}
x=as.character(x)
test(1463.21, shift(x,1L), as.character(c(NA, 1:4)))
test(1463.22, shift(x,1:2), list(as.character(c(NA, 1:4)), as.character(c(NA, NA, 1:3))))
test(1463.23, shift(x,1L, 0L), as.character(c(0L, 1:4)))
test(1463.24, shift(x,1L, type="lead"), as.character(c(2:5, NA)))
test(1463.25, shift(x,1:2, type="lead"), list(as.character(c(2:5, NA)), as.character(c(3:5, NA, NA))))
test(1463.26, shift(x,1L, 0L, type="lead"), as.character(c(2:5, 0L)))
x=c(TRUE,FALSE,TRUE,FALSE,TRUE)
test(1463.27, shift(x,1L), c(NA, x[-5L]))
test(1463.28, shift(x,1:2), list(c(NA, x[-5L]), c(NA, NA, x[-(4:5)])))
test(1463.29, shift(x,1L, 0L), c(FALSE, x[-5L]))
test(1463.30, shift(x,1L, type="lead"), c(x[-1L], NA))
test(1463.31, shift(x,1:2, type="lead"), list(c(x[-1L],NA), c(x[-(1:2)],NA,NA)))
test(1463.32, shift(x,1L, 0L, type="lead"), c(x[-(1)], FALSE))
# for list of list, #1595
x = data.table(foo = c(list(c("a","b","c")), list(c("b","c")), list(c("a","b")), list(c("a"))), id = c(1,1,2,2))
test(1463.33, x[, shift(list(foo)), by=id],
data.table(id=c(1,1,2,2), V1=list(NA, c("a", "b", "c"), NA, c("a", "b"))))
test(1463.34, x[, shift(list(foo), type="lead", fill=NA_integer_), by=id],
data.table(id=c(1,1,2,2), V1=list(c("b", "c"), NA_integer_, c("a"), NA_integer_)))
# Fix for #1009 segfault in shift
val = runif(1)
test(1463.35, shift(val, 2L), NA_real_)
test(1463.36, shift(val, 2L, type="lead"), NA_real_)
test(1463.37, shift(1:5, 1L, fill=c(1:2)), error="fill must be a vector of length")
test(1463.38, shift(mean), error="type 'closure' passed to shift(). Must be a vector, list, data.frame or data.table")
# add tests for date and factor?
# test for 'give.names=TRUE' on vectors
x = 1:10
nm = c("x_lag_1", "x_lag_2")
ans = list(as.integer(c(NA, 1:9)), as.integer(c(NA, NA, 1:8)))
setattr(ans, 'names', nm)
test(1463.39, shift(x, 1:2, give.names=TRUE), ans)
if (test_nanotime) {
test(1463.40, shift(nanotime(1:4) ), c(nanotime::nanotime(NA), nanotime::nanotime(1:3)));
test(1463.41, shift(nanotime(1:4), fill=0L), c(nanotime::nanotime(0L), nanotime::nanotime(1:3)));
}
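# Illustrative sketch (not a numbered test): shift() with a vector n returns one element per
# lag/lead, and give.names=TRUE appends "_lag_<n>" (or "_lead_<n>") to the input's name.
# Local illus_* names are ours for illustration and removed afterwards.
illus_vec = 1:4
illus_shift = shift(illus_vec, n = 1:2, fill = 0L, give.names = TRUE)
stopifnot(identical(unname(illus_shift), list(INT(0,1,2,3), INT(0,0,1,2))),
          all(grepl("_lag_", names(illus_shift))))
rm(illus_vec, illus_shift)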
# FR #686
DT = data.table(a=rep(c("A", "B", "C", "A", "B"), c(2,2,3,1,2)), foo=1:10)
# Seemingly superfluous 'foo' is needed to test fix for #1942
DT[, b := as.integer(factor(a))][, c := as.numeric(factor(a))]
test(1464.01, rleidv(DT, "a"), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L))
test(1464.02, rleid(DT$a), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L))
test(1464.03, rleidv(DT, "b"), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L))
test(1464.04, rleid(DT$b), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L))
test(1464.05, rleidv(DT, "c"), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L))
test(1464.06, rleid(DT$c), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L))
test(1464.07, rleid(as.raw(c(3L, 1L, 2L))), error="Type 'raw' not supported")
test(1464.08, rleidv(DT, 0), error="specify non existing column*.*0")
test(1464.09, rleidv(DT, 5), error="specify non existing column*.*5")
test(1464.10, rleidv(DT, 1:4), 1:nrow(DT))
set.seed(1)
DT = data.table( sample(1:2,20,replace=TRUE), sample(1:2,20,replace=TRUE), sample(1:2,20, replace=TRUE))
test(1464.11, rleidv(DT, 1:4), error="specify non existing column*.*4")
test(1464.12, rleidv(DT, 1:2), ans<-INT(1,2,3,4,5,6,6,6,7,8,8,9,10,11,12,13,14,15,16,17))
test(1464.13, rleidv(DT, 2:1), ans)
test(1464.14, rleidv(DT, c(3,1)), INT(1,1,2,2,3,4,5,5,6,7,8,9,10,11,12,13,14,15,16,17))
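# Illustrative sketch (not a numbered test): rleid()/rleidv() assign a run-length id that
# increments whenever the value (or any of the selected columns) changes from the previous row.
stopifnot(identical(rleid(c("a","a","b","b","a")), INT(1,1,2,2,3)))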
if (test_xts) {
Sys.unsetenv("_R_CHECK_LENGTH_1_LOGIC2_")
# package xts has an issue with an && clause (https://github.com/joshuaulrich/xts/pull/269). When that is fixed in xts and released to CRAN, we can remove this Sys.unsetenv
# Sys.setenv is called again at the end of this xts branch. The original env variable value was stored at the top of this file and restored at the end.
# data.table-xts conversion #882
# Date index
dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5))
xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index)
dt_xt = as.data.table(xt)
xt_dt = as.xts.data.table(dt)
test(1465.01, all.equal(dt, dt_xt, check.attributes = FALSE))
test(1465.02, xt, xt_dt)
# POSIXct index
dt <- data.table(index = as.POSIXct(as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5))
xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index)
dt_xt = as.data.table(xt)
xt_dt = as.xts.data.table(dt)
test(1465.03, all.equal(dt, dt_xt, check.attributes = FALSE))
test(1465.04, xt, xt_dt)
# index types returned from to.period
dt = data.table(index = as.Date((as.Date("2014-12-12") - 729):as.Date("2014-12-12"), origin = "1970-01-01"), quantity = as.numeric(rep(c(1:5), 73)), value = rep(c(1:73) * 100, 5))
xt = as.xts(matrix(data = c(dt$quantity, dt$value), ncol = 2, dimnames = list(NULL, c("quantity", "value"))), order.by = dt$index)
xt_w = xts::to.weekly(xt)
xt_dt_xt_w = as.xts.data.table(as.data.table(xt_w))
xt_m = xts::to.monthly(xt)
xt_dt_xt_m = as.xts.data.table(as.data.table(xt_m))
xt_q = xts::to.quarterly(xt)
xt_dt_xt_q = as.xts.data.table(as.data.table(xt_q))
xt_y = xts::to.yearly(xt)
xt_dt_xt_y = as.xts.data.table(as.data.table(xt_y))
test(1465.05, all.equal(xt_w, xt_dt_xt_w, check.attributes = FALSE))
test(1465.06, all.equal(xt_m, xt_dt_xt_m, check.attributes = FALSE))
test(1465.07, all.equal(xt_q, xt_dt_xt_q, check.attributes = FALSE))
test(1465.08, all.equal(xt_y, xt_dt_xt_y, check.attributes = FALSE))
test(1465.09, xts::last(1:5), 5L) # was test 1531
# xts issue from Joshua, #1347
x = as.Date(1:5, origin="2015-01-01")
test(1465.10, last(x), tail(x, 1L)) # was test 1559
x = xts(1:100, Sys.Date()+1:100)
test(1465.11, last(x,10), x[91:100,]) # was test 841
# The important thing this tests is that data.table's last() dispatches to xts's method when data.table is loaded above xts.
# But that isn't tested by R CMD check because xts is loaded above data.table, there.
# So to make this test relevant, run it in a fresh R session directly, after: "require(xts);require(data.table)"
# rather than: "require(data.table);require(xts)"
# Which was the main thrust of bug#2312 fixed in v1.8.3
# fix for #1484; was test 1589
x = xts::as.xts(8, order.by = as.Date("2016-01-03"))
test(1465.12, all.equal(as.data.table(x), data.table(index = as.Date("2016-01-03"), V1 = 8), check.attributes=FALSE))
# IDate support in as.xts.data.table #1499; was test 1663
dt <- data.table(date = c(as.IDate("2014-12-31"),
as.IDate("2015-12-31"),
as.IDate("2016-12-31")),
nav = c(100,101,99),
key = "date")
dt.xts <- as.xts.data.table(dt)
test(1465.13, dt.xts[1L], xts::xts(data.table(nav=100), order.by=as.Date("2014-12-31")))
# additional coverage for gaps uncovered in #3117
dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5))
xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index)
test(1465.14, as.data.table(xt, keep.rownames = FALSE), dt[ , !'index'])
names(xt)[1L] = 'index'
test(1465.15, as.data.table(xt), error = 'Input xts object should not')
names(xt)[1L] = 'quantity'
setcolorder(dt, c(3, 1, 2))
if (base::getRversion() < "3.6.0") as.xts = as.xts.data.table # fix for when we cannot register s3method for suggested dependency #3286
test(1465.16, as.xts(dt), error = 'data.table must have a time based')
setcolorder(dt, c(2, 3, 1))
dt[ , char_col := 'a']
test(1465.17, as.xts(dt), xt, warning = 'columns are not numeric')
# 890 -- key argument for as.data.table.xts
x = xts(1:10, as.Date(1:10, origin = "1970-01-01"))
old = options(datatable.verbose=FALSE)
test(1465.18, capture.output(as.data.table(x, key="index")),
c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2",
" 3: 1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5",
" 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8",
" 9: 1970-01-10 9", "10: 1970-01-11 10"))
options(old)
Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE)
}
# as.data.table.default #969
ar <- array(NA, dim=c(10,4),dimnames = list(NULL,paste("col",1:4,sep="")))
test(1466.1, as.data.table(as.data.frame(ar)), as.data.table(ar)) # array type
x <- rep(Sys.time(),3)
test(1466.2, as.data.table(as.data.frame(x)), as.data.table(x)) # posix type
# fix for #1001, #1002 and #759
# When adding a column, even if i results in no rows, the RHS needs to be evaluated so we know the
# column type to create. Always create the column so that the result is consistent and does not depend on the data in i
for (bool in c(FALSE,TRUE)) {
options(datatable.auto.index=bool)
DT = data.table(a=1:2)
test(1467.01 + bool*0.03, copy(DT)[a==3, b:=notExist+1], error="notExist")
test(1467.02 + bool*0.03, copy(DT)[a==3, b:=a+5L], data.table(a=1:2, b=NA_integer_))
test(1467.03 + bool*0.03, copy(DT)[a==3, b:=a+5], data.table(a=1:2, b=NA_real_))
}
test(1467.07, getOption("datatable.auto.index")) # ensure to leave TRUE
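# Illustrative sketch (not a numbered test): even when i matches no rows, the RHS type still
# determines the new column's type, so the column is always created. Local illus_* name is ours
# for illustration and removed afterwards.
illus_norow = data.table(a = 1:2)
illus_norow[a == 3L, b := a + 5L]   # no rows match, but integer column b is still created
stopifnot(identical(illus_norow$b, INT(NA, NA)))
rm(illus_norow)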
# fix for first bug reported in #1006 on 'foverlaps()'
x <- c(-0.1, 0, 0.1)
n <- length(x)
dt.ref <- data.table(start=x[-n], end=x[-1], key=c("start", "end"))
dt.query <- data.table(q1=c(-0.2, -0.05, 0.05, 0.15), q2=c(-0.2, -0.05, 0.05, 0.15), key=c("q1", "q2"))
ans=cbind(dt.ref[, .(start,end)], dt.query[2:3, .(q1,q2)])
setkey(ans, q1,q2)
test(1468.1, foverlaps(dt.query, dt.ref, nomatch=0L), ans)
# fix and additional tests for #1006 following OP's follow-up.
dt1 = data.table(x=c(-6.36917800737546, -2.19964384651646),
y=c(-2.19964384651646, 4.07116428752538))
dt2 = data.table(x= 2.91816502571793, y=2.91816502571793)
setkey(dt1)
setkey(dt2)
test(1468.2, foverlaps(dt2, dt1, which=TRUE), data.table(xid=1L, yid=2L))
dt1 = data.table(x=c(-6,-3), y=c(-3,4))
dt2 = data.table(x=3,y=3)
setkey(dt1)
setkey(dt2)
test(1468.3, foverlaps(dt2, dt1, which=TRUE), data.table(xid=1L, yid=2L))
# Fix for #1010 (discovered while fixing #1007). Don't retain key if i had no key, but irows is sorted, and roll != FALSE... See example in #1010.
DT = data.table(x=c(-5,5), y=1:2, key="x")
test(1469.1, key(DT[J(c(2,0)), roll=TRUE]), NULL)
test(1469.2, key(DT[J(c(2,0)), .(x,y), roll=TRUE]), NULL)
test(1469.3, key(DT[J(c(2,0)), y, roll=TRUE, by=.EACHI]), NULL)
test(1469.4, key(DT[J(c(2,0))]), NULL)
test(1469.5, key(DT[SJ(c(2,0)), roll=TRUE]), "x")
test(1469.6, key(DT[J(c(2,0)), roll="nearest"]), NULL)
# 1007 fix, dealing with Inf and -Inf correctly in rolling joins.
DT = data.table(x=c(-Inf, 3, Inf), y=1:3, key="x")
test(1470.1, DT[J(c(2,-Inf,5,Inf)), roll=Inf], data.table(x=c(2,-Inf,5,Inf), y=c(1L, 1:3)))
test(1470.2, DT[J(c(2,-Inf,5,Inf)), roll=10], data.table(x=c(2,-Inf,5,Inf), y=INT(c(NA, 1, 2, 3))))
test(1470.3, DT[SJ(c(2,-Inf,5,Inf)), roll=Inf], data.table(x=c(-Inf,2,5,Inf), y=c(1L, 1:3), key="x"))
# 1006, second bug with -Inf, now that #1007 is fixed.
x <- c(-Inf, -0.1, 0, 0.1, Inf)
n <- length(x)
dt.ref <- data.table(start=x[-n], end=x[-1], key=c("start", "end"))
dt.query <- data.table(q1=c(-0.2, -0.05, 0.05, 0.15), q2=c(-0.2, -0.05, 0.05, 0.15), key=c("q1", "q2"))
test(1471, foverlaps(dt.query, dt.ref), data.table(dt.ref, dt.query, key=c("q1", "q2")))
# #1014 (segfault) fix
test(1472, shift(1, 1:2, NA, 'lag'), list(NA_real_, NA_real_))
# #528, type=equal simple test
dt1 = data.table(x=1:5, y=6:10)
dt2 = data.table(x=3:7, y=8:12)
setkey(dt1)
setkey(dt2)
test(1473, foverlaps(dt1,dt2, which=TRUE, nomatch=NULL, type="equal"),
data.table(xid=3:5, yid=1:3))
# More tests for `frankv`, #760
DT = data.table(x=c(4, 1, 4, NA, 1, NA, 4), y=c(1, 1, 1, 0, NA, 0, 2))
test(1474.1, frankv(DT, "y", ties.method="dense"), frankv(DT$y, ties.method="dense"))
test(1474.2, frank(DT, y, ties.method="dense"), frank(DT$y, ties.method="dense"))
test(1474.3, frankv(DT, "y", order=-1L, ties.method="dense"), frankv(-DT$y, ties.method="dense"))
test(1474.4, frank(DT, -y, ties.method="dense"), frank(-DT$y, ties.method="dense"))
# uniqueN, #884, part of #756 and part of #1019
DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), C = rep(1:2, 6))
test(1475.01, uniqueN(DT), 10L)
test(1475.02, DT[, .(uN=uniqueN(.SD)), by=A], data.table(A=1:3, uN=c(3L,4L,3L)))
# specialized uniqueN for logical vectors, PR#2648
test(1475.03, uniqueN(c(NA, TRUE, FALSE)), 3L)
test(1475.04, uniqueN(c(NA, TRUE, FALSE), na.rm = TRUE), 2L)
test(1475.05, uniqueN(c(TRUE, FALSE), na.rm = TRUE), 2L)
test(1475.06, uniqueN(c(TRUE, FALSE)), 2L)
test(1475.07, uniqueN(c(TRUE, NA)), 2L)
test(1475.08, uniqueN(c(TRUE, NA), na.rm=TRUE), 1L)
test(1475.09, uniqueN(c(FALSE, NA)), 2L)
test(1475.10, uniqueN(c(FALSE, NA), na.rm=TRUE), 1L)
test(1475.11, uniqueN(c(NA,NA)), 1L)
test(1475.12, uniqueN(c(NA,NA), na.rm=TRUE), 0L)
test(1475.13, uniqueN(NA), 1L)
test(1475.14, uniqueN(NA, na.rm=TRUE), 0L)
test(1475.15, uniqueN(logical()), 0L)
test(1475.16, uniqueN(logical(), na.rm=TRUE), 0L)
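# Illustrative sketch (not a numbered test): uniqueN(x) is equivalent to length(unique(x)),
# with na.rm=TRUE additionally dropping NA before counting. Local illus_* name is ours and removed afterwards.
illus_u = c(2L, 2L, NA, 5L, NA)
stopifnot(uniqueN(illus_u) == length(unique(illus_u)),   # 3 : {2, NA, 5}
          uniqueN(illus_u, na.rm=TRUE) == 2L)            # NA excluded
rm(illus_u)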
# preserve class attribute in GForce mean (and sum)
DT <- data.table(x = rep(1:3, each = 3), y = as.Date(seq(Sys.Date(), (Sys.Date() + 8), by = "day")))
test(1476.1, DT[, .(y=mean(y)), x], setDT(aggregate(y ~ x, DT, mean)))
# test for 'transpose' of a list
ll = lapply(1:12, function(x) {
if (x <= 3) sample(10, sample(5:10, 1L))
else if (x > 3 & x <= 6) as.numeric(sample(101:115, sample(7:12, 1L)))
else if (x > 7 & x <= 9) sample(c(TRUE, FALSE), sample(7:9, 1L), TRUE)
else sample(letters, sample(5:10, 1L))
})
ans1 = setDT(transpose(ll))
ans2 = setDT(lapply(seq_along(ans1), function(x) sapply(ll, `[`, x)))
test(1477.01, ans1, ans2)
ans1 = setDT(transpose(ll[4:6]))
ans2 = setDT(lapply(seq_along(ans1), function(x) sapply(ll[4:6], `[`, x)))
test(1477.02, ans1, ans2)
ans1 = setDT(transpose(ll[8:9]))
ans2 = setDT(lapply(seq_along(ans1), function(x) sapply(ll[8:9], `[`, x)))
test(1477.03, ans1, ans2)
# class is preserved?
dt = data.table(x=1:5, y=6:10)
test(1477.04, transpose(dt), as.data.table(t(as.matrix(dt))))
# factor columns are coerced to character
ll = list(factor(letters[1:5]), factor(letters[6:8]))
test(1477.05, transpose(ll), list(c("a", "f"), c("b", "g"), c("c", "h"), c("d", NA), c("e", NA)))
# for data.frames
test(1477.06, transpose(data.frame(x=1:2, y=3:4)), data.frame(V1=c(1L,3L), V2=c(2L,4L)))
# test for `tstrsplit`
ll = sapply(ll, paste, collapse=",")
test(1477.07, transpose(strsplit(ll, ",", fixed=TRUE)), tstrsplit(ll, ",", fixed=TRUE))
test(1477.08, transpose(1:5), error="l must be a list")
test(1477.09, transpose(list(as.complex(c(1, 1+5i)))), error="Unsupported column type")
test(1477.10, transpose(list(list(1:5))), error="Item 1 of list input is")
test(1477.11, transpose(as.list(1:5), fill=1:2), error="fill must be a length 1 vector")
test(1477.12, transpose(as.list(1:5), ignore.empty=NA), error="ignore.empty should be logical TRUE/FALSE")
test(1477.13, transpose(list()), list())
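# Illustrative sketch (not a numbered test): tstrsplit(x, sep) is transpose(strsplit(x, sep)),
# i.e. the k-th element of the result holds the k-th field of every input string, padded with NA.
# Local illus_* name is ours for illustration and removed afterwards.
illus_split = tstrsplit(c("a-1", "b-2-x"), "-", fixed=TRUE)
stopifnot(identical(illus_split, list(c("a","b"), c("1","2"), c(NA, "x"))))
rm(illus_split)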
# #480 `setDT` and 'lapply'
ll = list(data.frame(a=1), data.frame(x=1, y=2), NULL, list())
ll <- lapply(ll, setDT)
test(1478.1, sapply(ll, truelength), c(1025L, 1026L, 1024L, 1024L))
test(1478.2, sapply(ll, length), INT(1,2,0,0))
# rbindlist stack imbalance issue, #980.
test(1479, rbindlist(replicate(4,rbindlist(replicate(47, NULL),
use.names=TRUE, fill=TRUE)), use.names=TRUE, fill=TRUE), null.data.table())
# #936, plonking list column over a factor column by reference
DT = data.table(x = factor(c("a", "b c", "d e f")))
test(1480.1, DT[, x := strsplit(as.character(x), " ")], ans<-data.table(x=list("a", letters[2:3], letters[4:6])))
DT = data.table(x = factor(c("a", "b c", "d e f")))
test(1480.2, DT[, x := .(strsplit(as.character(x), " "))], ans)
DT = data.table(x = factor(c("a", "b c", "d e f")))
test(1480.3, DT[, x := list(strsplit(as.character(x), " "))], ans)
# #970, over-allocation issue
a=data.frame(matrix(1,ncol=101L))
options(datatable.alloccol=100L)
ans1 = data.table(a)
options(datatable.alloccol=101L)
ans2 = data.table(a)
test(1481.1, ans2, ans1)
options(datatable.alloccol=0L)
ans3 = data.table(a)
test(1481.2, ans3, ans1)
options(datatable.alloccol=1L)
ans4 = data.table(a)
test(1481.3, ans4, ans1)
options(datatable.alloccol=1024L)
# #479, check := assignment in environment (actual case is when loaded from disk, but we'll just simulate a scenario here).
ee = new.env()
ee$DT = data.frame(x=1L, y=1:3)
setattr(ee$DT, 'class', c("data.table", "data.frame"))
test(1482.1, truelength(ee$DT), 0L) # make sure that the simulated environment is right.
test(1482.2, ee$DT[, z := 3:1], data.table(x=1L, y=1:3, z=3:1), warning="Invalid .internal.selfref detected and")
test(1482.3, truelength(ee$DT), 1027L)
test(1482.4, ee$DT[, za := 4:6], data.table(x=1L, y=1:3, z=3:1, za=4:6))
test(1482.5, truelength(ee$DT), 1027L) # should have used spare slot i.e. no increase in tl
# Fix for #499 and #945
x <- data.table(k=as.factor(c(NA,1,2)),v=c(0,1,2), key="k")
y <- data.table(k=as.factor(c(NA,1,3)),v=c(0,1,3), key="k")
test(1483.1, x[y], data.table(k=factor(c(NA,1,3)), v=c(0,1,NA), i.v=c(0,1,3), key="k"))
test(1483.2, merge(x,y,all=TRUE), data.table(k=factor(c(NA,1,2,3)), v.x=c(0,1,2,NA), v.y=c(0,1,NA,3), key="k"))
x <- data.table(country="US")
y <- data.table(country=factor("USA"))
test(1483.3, merge(x,y,by="country",all=TRUE), data.table(country=factor(c("US", "USA")), key="country"))
setkey(y)
test(1483.4, y[x], data.table(country="US", key="country"))
# NULL items should be removed when making data.table from list, #842
# Original fix for #842 added a branch in as.data.table.list() using point()
# Then PR#3471 moved logic from data.table() into as.data.table.list() and now removes NULL items up front, so there is no longer any need for the branch
# Since the logic was changed, this test was strengthened to explicitly test the result rather than compare two calls to SomeFunction()
SomeFunction <- function(x, setnull=1L) {
ans <- replicate(length(x), list("bla1", "bla2"), simplify=FALSE)
ans[setnull] <- list(NULL)
return(ans)
}
DT <- data.table(ID=1:3, key="ID")
test(1484.1, DT[, SomeFunction(ID, setnull=1L)], ans<-data.table(V1=list("bla1","bla2"), V2=list("bla1","bla2")))
test(1484.2, DT[, SomeFunction(ID, setnull=2L)], ans)
# Fix for #868
vals = c("setosa", "versicolor", "virginica")
test(1485, as.data.table(combn(unique(iris$Species),2)), data.table(vals[1:2], vals[c(1,3)], vals[2:3]))
# depends on bug fix to combn() in R 3.1.0, which is now a stated dependency
# Fix for #955
DT <- data.table(Time=.POSIXct(0, tz="UTC")+0:1, Value=1:2)
options(datatable.auto.index=FALSE) # Have to turn off to avoid error.
ans1.1 = DT[Time==Time[1]]
ans2.1 = DT[Time==.POSIXct(0, tz="UTC")]
options(datatable.auto.index=TRUE)
ans1.2 = DT[Time==Time[1]]
ans2.2 = DT[Time==.POSIXct(0, tz="UTC")]
test(1486.1, as.data.frame(ans1.1), as.data.frame(ans1.2))
test(1486.2, as.data.frame(ans2.1), as.data.frame(ans2.2))
# Fix for #832
x <- matrix(1:9, ncol=3)
setattr(x, "names", paste("V", seq_len(length(x)), sep = ""))
test(1487.1, setattr(x, "class", c("data.table", "data.frame")), error="Internal structure doesn't seem to be a list")
x <- matrix(1:9, ncol=3)
class(x) = c("data.table", "data.frame")
# not sure how to test this one, so using `tryCatch`
test(1487.2, tryCatch(print(x), error=function(k) "bla"), "bla")
# Fix for #1043
DT = data.table(grp=LETTERS[1:2], categ=rep(c("X","Y"), each=2L), condition=rep(c("P","Q"), each=4L), value=sample(8))
tbl = with(DT, table(grp, categ, condition))
ans1 = setnames(setDF(data.table(tbl)), "N", "Freq")
ans2 = data.frame(tbl)
ans2[1:3] = lapply(ans2[1:3], as.character)
test(1488, ans1, ans2)
# joins where x is integer type and i is logical type
DT = data.table(x=1:5, y=6:10, key="x")
test(1489.1, DT[.(TRUE)], error="Incompatible join types: x.x (integer) and i.V1 (logical)")
test(1489.2, DT[.(1L)], DT[1L])
test(1489.3, DT[.(1)], DT[1L])
# Fix for #932
DT <- data.table(v1 = c(1:3, NA), v2 = c(1,NA,2.5,NaN), v3=c(NA, FALSE, NA, TRUE), v4=c("a", NA, "b", "c"))
options(datatable.auto.index = TRUE) # just to be sure
setindex(DT, v1)
test(1490.01, DT[v1==3], subset(DT, v1==3))
test(1490.02, DT[!v1==3], subset(DT, !v1==3))
test(1490.03, DT[v1==NA], subset(DT, v1==NA))
test(1490.04, DT[!v1==NA], subset(DT, !v1==NA))
setindex(DT, v2)
test(1490.05, DT[v2==2.5], subset(DT, v2==2.5))
test(1490.06, DT[!v2==2.5], subset(DT, !v2==2.5))
test(1490.07, DT[v2==NA], subset(DT, v2==NA))
test(1490.08, DT[!v2==NA], subset(DT, !v2==NA))
test(1490.09, DT[v2==NaN], subset(DT, v2==NaN))
test(1490.10, DT[!v2==NaN], subset(DT, !v2==NaN))
setindex(DT, v3)
test(1490.11, DT[v3==FALSE], subset(DT, v3==FALSE))
test(1490.12, DT[!v3==FALSE], subset(DT, !v3==FALSE))
test(1490.13, DT[v3==TRUE], subset(DT, v3==TRUE))
test(1490.14, DT[!v3==TRUE], subset(DT, !v3==TRUE))
test(1490.15, DT[v3==NA], subset(DT, v3==NA))
test(1490.16, DT[!v3==NA], subset(DT, !v3==NA))
test(1490.17, DT[(v3)], subset(DT, v3==TRUE))
test(1490.18, DT[!(v3)], subset(DT, !v3==TRUE))
setindex(DT, v4)
test(1490.19, DT[v4=="b"], subset(DT, v4=="b"))
test(1490.20, DT[!v4=="b"], subset(DT, !v4=="b"))
test(1490.21, DT[v4==NA], subset(DT, v4==NA))
test(1490.22, DT[!v4==NA], subset(DT, !v4==NA))
# test for #957
DT <- as.data.table(BOD)
options(datatable.auto.index=FALSE)
ans1 = DT[Time %in% c("1", "2")]
options(datatable.auto.index=TRUE)
ans2 = DT[Time %in% c("1", "2")]
test(1490.23, ans1, ans2)
# test for #961
DT <- as.data.table(cars)
options(datatable.auto.index=FALSE)
ans1 = DT[speed %in% list(1, 4)]
options(datatable.auto.index=TRUE)
ans2 = DT[speed %in% list(1, 4)]
test(1490.24, ans1, ans2)
# replace "." with "list" in 'j'
ee1 = quote(.(val = lm(x ~ .)))
ee2 = quote(.(v1=.(.SD), v2=.(min(y)), v3=.(.(x)), v4=.(x)))
ee3 = quote(.(v1=.(.SD), v2=.(lm(. ~ xx)), v3=.(.(x)), v4=.(x^2)))
ee4 = quote(c("a", "b") := .(.SD))
ee5 = quote(c("a", "b") := .(v1=x^2, v2 = .(.SD[[1L]])))
ee6 = quote(.(v1=.(.SD), v2=.(lm(. ~ xx)), v3=list(.(x)), v4=.(x^2)))
test(1491.1, replace_dot_alias(ee1), quote(list(val = lm(x ~ .))))
test(1491.2, replace_dot_alias(ee2), quote(list(v1=list(.SD), v2=list(min(y)), v3=list(list(x)), v4=list(x))))
test(1491.3, replace_dot_alias(ee3), quote(list(v1=list(.SD), v2=list(lm(. ~ xx)), v3=list(list(x)), v4=list(x^2))))
test(1491.4, replace_dot_alias(ee4), quote(c("a", "b") := list(.SD)))
test(1491.5, replace_dot_alias(ee5), quote(c("a", "b") := list(v1=x^2, v2 = list(.SD[[1L]]))))
test(1491.6, replace_dot_alias(ee6), quote(list(v1=list(.SD), v2=list(lm(. ~ xx)), v3=list(list(x)), v4=list(x^2))))
# Fix for #1050
dt = data.table(x=1:5, y=6:10)
options(datatable.auto.index=FALSE)
ans1 <- dt[x == 2.5]
options(datatable.auto.index=TRUE)
ans2 <- dt[x == 2.5]
test(1492, ans1, ans2)
# Fix for #497
dt = data.table(x=1:10, y=11:20)
test(1493, dt[, .(x=sum(x)),by= x %% 2, verbose=TRUE], data.table(`x%%2`=c(1,0), x=c(25L,30L)), output="by-expression 'x%%2' is not named")
# Fix for #705
DT1 = data.table(date=as.POSIXct("2014-06-22", format="%Y-%m-%d", tz="GMT"))
DT2 = data.table(date=as.Date("2014-06-23"))
test(1494.1, rbind(DT1, DT2), error="Class attribute on column")
test(1494.2, rbind(DT2, DT1), error="Class attribute on column")
# test 1495 has been added to melt's test section (fix for #1055)
# Fix for #1056
DT = data.table(year=2010:2014, v1=runif(5), v2=1:5, v3=letters[1:5])
test(1496, DT[, shift(v1, 1:2, NA, "lead", TRUE)], DT[, shift(.SD, 1:2, NA, "lead", TRUE), .SDcols=2L])
# Fix for #1066
DT = data.table(x=1, y=2, z=3, a=4, b=5, c=6)
test(1497, DT[, .SD, .SDcols = !c("a", "c")], DT[, !c("a", "c"), with=FALSE])
# Fix for #1060
DT = data.table(x=1, y=2, z=3, a=4, b=5, c=6)
test(1498.1, DT[, .SD, .SDcols=c(TRUE,FALSE)], DT[, c("x", "z", "b"), with=FALSE])
test(1498.2, DT[, .SD, .SDcols=!c(TRUE,FALSE)], DT[, !c("x", "z", "b"), with=FALSE])
# Fix for #1072
dt <- data.table(group1 = "a", group2 = "z", value = 1)
options(datatable.auto.index=FALSE)
ans1 = dt[group1 %in% c("a", "b"), sum(value), group2]
options(datatable.auto.index=TRUE)
ans2 = dt[group1 %in% c("a", "b"), sum(value), group2]
test(1499, ans1, ans2)
# Fix for #488
if (test_bit64) {
test(1500.1, fread("x,y\n3,\n", colClasses = list(integer64 = "y")),
data.table(x=3L, y=as.integer64(NA)))
# more tests after new fix
test(1500.2, fread("x,y\n0,12345678901234\n0,\n0,\n0,\n0,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n12345678901234,\n0,\n0,\n0,\n0,\n0,\n"),
data.table(x=as.integer64(c(rep(0L, 5L), rep(NA, 11), 12345678901234, rep(0L,5L))),
y=as.integer64(c(12345678901234, rep(NA,21)))))
x = c("12345678901234", rep("NA", 178), "a")
y = sample(letters, length(x), TRUE)
ll = paste(x,y, sep=",", collapse="\n")
test(1500.3, fread(ll, na.strings=NULL),
data.table(V1=x, V2=y))
x = c("12345678901234", rep("NA", 178), "0.5")
y = sample(letters, length(x), TRUE)
ll = paste(x,y, sep=",", collapse="\n")
test(1500.4, fread(ll), data.table(V1=suppressWarnings(as.numeric(x)), V2=y))
}
# fix for #1082
dt1 = data.table(x=rep(c("a","b","c"),each=3), y=c(1,3,6), v=1:9, key=c("x", "y"))
dt2 = copy(dt1)
test(1502.1, dt1["a", z := NULL], error="When deleting columns, i should not be provided")
# this shouldn't segfault on 'dt1[...]'
test(1502.2, dt1["a", z := 42L], dt2["a", z := 42L])
# fix for #1080
dt = data.table(col1 = c(1,2,3,2,5,3,2), col2 = c(0,9,8,9,6,5,4), key=c("col1"))
test(1503.1, uniqueN(dt, by=key(dt)), 4L) # default on key columns
test(1503.2, uniqueN(dt), 6L) # on all columns
test(1503.3, uniqueN(dt$col1), 4L) # on just that column
# .SDcols and with=FALSE understand colstart:colend syntax
dt = setDT(lapply(1:10, function(x) sample(3, 10, TRUE)))
# .SDcols
test(1504.01, dt[, lapply(.SD, sum), by=V1, .SDcols=V8:V10],
dt[, lapply(.SD, sum), by=V1, .SDcols=8:10])
test(1504.02, dt[, lapply(.SD, sum), by=V1, .SDcols=V10:V8],
dt[, lapply(.SD, sum), by=V1, .SDcols=10:8])
test(1504.03, dt[, lapply(.SD, sum), by=V1, .SDcols=-(V8:V10)],
dt[, lapply(.SD, sum), by=V1, .SDcols=-(8:10)])
test(1504.04, dt[, lapply(.SD, sum), by=V1, .SDcols=!(V8:V10)],
dt[, lapply(.SD, sum), by=V1, .SDcols=!(8:10)])
# with=FALSE and auto with=FALSE tests as from v1.9.8
test(1504.05, dt[, V8:V10, with=FALSE], dt[, 8:10, with=FALSE])
test(1504.06, dt[, V8:V10], dt[, 8:10, with=FALSE])
test(1504.07, dt[, V10:V8, with=FALSE], dt[, 10:8, with=FALSE])
test(1504.08, dt[, V10:V8], dt[, 10:8, with=FALSE])
test(1504.09, dt[, -(V8:V10), with=FALSE], dt[, -(8:10), with=FALSE])
test(1504.10, dt[, -(V8:V10)], dt[, -(8:10), with=FALSE])
test(1504.11, dt[, !(V8:V10), with=FALSE], dt[, !(8:10), with=FALSE])
test(1504.12, dt[, !(V8:V10)], dt[, !(8:10), with=FALSE])
# Fix for #1083
dt = data.table(x=1:4, y=c(TRUE,FALSE))
test(1505.1, as.matrix(dt), as.matrix(as.data.frame(dt)))
# setcolorder works with data.frames, #1018
dt = data.table(x=1, y=2)
test(1506, setcolorder(dt, c("y", "x")), data.table(y=2, x=1))
# tstrsplit, #1094
# factor to character
x = factor(paste(letters[1:5], letters[6:10], sep="-"))
test(1507.1, tstrsplit(x, "-"), list(letters[1:5], letters[6:10]))
# type.convert
x = paste(letters[1:5], 1:5, sep="-")
test(1507.2, tstrsplit(x, "-"), list(letters[1:5], as.character(1:5)))
test(1507.3, tstrsplit(x, "-", type.convert=TRUE), list(letters[1:5], 1:5))
# implementing #575, keep.rownames can take a name
x = matrix(1:6, ncol=2)
rownames(x) = letters[3:1]
test(1508.1, as.data.table(x, keep.rownames="bla"), data.table(bla=letters[3:1], x))
x = as.data.frame(x)
test(1508.2, as.data.table(x, keep.rownames="bla"), data.table(bla=letters[3:1], x))
x = sample(10); setattr(x, 'names', letters[1:10])
test(1508.3, as.data.table(x, keep.rownames="bla"), data.table(bla=letters[1:10], x=unname(x)))
# also for setDT
df = data.frame(x=1:5, y=6:10, row.names=letters[5:1])
ans = data.table(foo=letters[5:1], df)
test(1508.4, setDT(df, keep.rownames="foo"), ans)
# #1509 test added for melt above.
# #1510 transpose converts NULL to NAs
ll = list(1:2, NULL, 3:4)
test(1510.1, transpose(ll), list(c(1L, NA, 3L), c(2L, NA, 4L)))
test(1510.2, transpose(ll, ignore.empty=TRUE), list(c(1L, 3L), c(2L, 4L)))
# setorder can reorder data.frames too, #1018
DF = data.frame(x=sample(3,10,TRUE), y=sample(letters[1:2], 10, TRUE))
rownames(DF) = sample(letters, 10)
ans = DF[order(-xtfrm(DF$y), DF$x), ]
test(1511, ans, setorder(DF, -y, x))
# fix for #1108
if (test_bit64) {
dt <- data.table(id = as.integer64(1:3), a = c("a", "b", "c"), key = "id")
test(1512.1, dt[.(2)], data.table(id=2, a="b", key="id"))
test(1512.2, dt[.(2L)], data.table(id=2L, a="b", key="id"))
dt <- data.table(id = as.numeric(1:3), a = c("a", "b", "c"), key = "id")
test(1512.3, dt[.(2L)], data.table(id=2L, a="b", key="id"))
test(1512.4, dt[.(as.integer64(2))], data.table(id=as.integer64(2), a="b", key="id"))
dt <- data.table(id = 1:3, a = c("a", "b", "c"), key = "id")
test(1512.5, dt[.(2)], data.table(id=2L, a="b", key="id"))
test(1512.6, dt[.(as.integer64(2))], data.table(id=as.integer64(2), a="b", key="id"))
}
# setDT gains key argument, #1121
X = list(a = 4:1, b=runif(4))
test(1513, setkey(as.data.table(X), a), setDT(X, key="a"))
# Adding tests for `isReallyReal`
x = as.numeric(sample(10))
test(1514.1, isReallyReal(x), 0L)
x = as.numeric(sample(c(1:5, NA)))
test(1514.2, isReallyReal(x), 0L) # NAs in numeric can be coerced to integer NA without loss
x = c(1:2, NaN, NA)
test(1514.3, isReallyReal(x), 3L)
x = c(1:2, Inf, NA)
test(1514.4, isReallyReal(x), 3L)
x = c(1:2, -Inf, NA)
test(1514.5, isReallyReal(x), 3L)
x = runif(2)
test(1514.6, isReallyReal(x), 1L)
x = numeric()
test(1514.7, isReallyReal(x), 0L)
test(1514.8, isReallyReal(9L), 0L)
# #1091
options(datatable.prettyprint.char = 5L)
DT = data.table(x=1:2, y=c("abcdefghijk", "lmnopqrstuvwxyz"))
test(1515.1, grep("abcde...", capture.output(print(DT))), 2L)
options(datatable.prettyprint.char = NULL)
# test 1516: chain setnames() - used while mapping source to target columns
SRC = data.table(x=1:2, y=c("abcdefghij", "klmnopqrstuv"), z=rnorm(2))
src_cols <- c("y","z")
tgt_cols <- c("name","value")
DT <- SRC[, src_cols, with=FALSE][, setnames(.SD, tgt_cols)]
test(1516.1, names(SRC), c("x","y","z")) # src not altered by ref
test(1516.2, names(DT), tgt_cols) # target expected
test(1516.3, unname(unclass(DT[, tgt_cols, with=FALSE])), unname(unclass(SRC[,src_cols, with=FALSE]))) # content match
# Fix for #1078 and #1128
x = data.frame(x=1L, y=2L)
setattr(x, 'class', c("foo", "data.frame"))
test(1517.1, class(as.data.table(x)), c("data.table", "data.frame"))
test(1517.2, class(setDT(x)), c("data.table", "data.frame"))
x = data.table(x="a", y=2L)
setattr(x, 'class', c("foo", "data.table", "data.frame"))
test(1517.3, class(as.data.table(x)), c("data.table", "data.frame"))
test(1517.4, class(setDT(x)), c("data.table", "data.frame"))
# Fix for setattr, #1142
x = factor(rep(1:4, each=2L))
ax = address(x)
setattr(x, 'levels', c("a", "a", "b", "b"))
test(1518.1, levels(x), c("a", "b"))
test(1518.2, address(x), ax)
# Fix for #1074 and #1092
x = data.table(x=c(1,1,1,2), y=1:4, key="x")
test(1519.1, x[.(2:3), .N, nomatch=0L], 1L)
x = data.table(k = INT(0,2,3,7), o = "b", key = "k")
y = data.table(k = 1:5, n = paste("n", 1:5, sep=""), key = "k")
test(1519.2, x[y, o := n], data.table(k = INT(0,2,3,7), o = c("b","n2","n3","b"), key = "k"))
# Fix for #1141 (thanks to @yvanrichard)
x <- data.table(zxc = 1:3, vbn = 4:6)
test(1520, x[, c('zxc', 'qwe', 'rty', 'vbn'), with = FALSE], error = "column(s) not found")
# Fix for #1154 (unnecessary lock on .SD)
x = data.table(a=c(1,1,2))[, unique(.SD)]
test(1521, x[, b := 5], data.table(a=c(1,2), b=5))
# Fix for #1160, fastmean retaining attributes
x = data.table(a = c(2,2,1,1,2), b=setattr(1:5, 'class', c('bla', 'integer')))
test(1522, class(x[, .(mean(b), all(b)), by=a]$V1), c('bla', 'integer'))
# Fix for #1145, .N lock handled properly
x = data.table(a=1:5)
test(1523, x[, head(.SD, n=2)[1:.N]], data.table(a=1:2))
# #637 add by.x and by.y to merge.data.table
d1 <- data.table(x1=c(1,3,8), y1=rnorm(3), key="x1")
d2 <- data.table(x2=c(3,8,10), y2=rnorm(3), key="x2")
ans1 = merge(d1, d2, by.x = "x1", by.y = "x2")
ans2 = setkey(setDT(merge.data.frame(d1, d2, by.x = key(d1), by.y = key(d2))), x1)
test(1524, ans1, ans2)
# 'unique =' argument for CJ, #1148
x = c(1, 2, 1)
y = c(5, 8, 8, 4)
w = c(10, 12, 12, 13) # already sorted but has dups; more efficient case to cover
options(datatable.CJ.names=FALSE)
test(1525.1, CJ(x, y, unique=TRUE), CJ(V1=c(1,2), V2=c(4,5,8)))
test(1525.2, CJ(x, z=y, unique=TRUE), ans<-data.table(V1=rep(c(1,2), each=3), z=c(4,5,8), key="V1,z")) # naming of one but not both, too
options(datatable.CJ.names=TRUE)
test(1525.3, CJ(x, y, unique=TRUE), CJ( x=c(1,2), y=c(4,5,8)))
test(1525.4, CJ(x, z=y, unique=TRUE), setnames(ans,c("x","z")))
test(1525.5, CJ(x, w, unique=TRUE), data.table(x=(rep(c(1,2), each=3)), w=c(10,12,13), key="x,w"))
# `key` argument fix for `setDT` when input is already a `data.table`, #1169
DT <- data.table(A = 1:4, B = 5:8)
setDT(DT, key = "A")
test(1526.1, key(DT), "A")
test(1526.2, key(setDT(DT, key = NULL)), NULL)
# #501, fread stringsAsFactors=FALSE
dt = data.table(x=1:5, y = letters[1:5])
text = "x,y\n1,a\n2,b\n3,c\n4,d\n5,e\n"
test(1527.1, dt[, y := factor(y)], fread(text, stringsAsFactors=TRUE))
set.seed(1L)
dt = data.table(x=1:5, y = sample(letters[1:5]))
text = "x,y\n1,b\n2,e\n3,d\n4,c\n5,a\n"
test(1527.2, dt[, y := factor(y)], fread(text, stringsAsFactors=TRUE))
set.seed(1L)
dt = data.table(x=1:5, y = sample(letters[1:2], 5, TRUE))
text = "x,y\n1,a\n2,a\n3,b\n4,b\n5,a\n"
test(1527.3, dt[, y := factor(y)], fread(text, stringsAsFactors=TRUE))
# #1027, check.names argument to fread
nm1 = names(fread("a,a\n1,2\n3,4", check.names=FALSE))
nm2 = names(fread("a,a\n1,2\n3,4", check.names=TRUE))
nm3 = names(fread("a b,a b\n1,2\n3,4", check.names=TRUE))
test(1528.1, c("a", "a"), nm1)
test(1528.2, c("a", "a.1"), nm2)
test(1528.3, c("a.b", "a.b.1"), nm3)
# add tests for between
x = sample(10, 20, TRUE)
test(1529.01, between(x, 1L, 5L, TRUE), x >= 1L & x <= 5L)
test(1529.02, x %between% c(1L, 5L), x >= 1L & x <= 5L)
test(1529.03, between(x, 1L, 5L, FALSE), x > 1L & x < 5L)
x = sample(c(1:10, NA), 20, TRUE)
test(1529.04, between(x, 1L, 5L, TRUE), x >= 1L & x <= 5L)
test(1529.05, x %between% c(1L, 5L), x >= 1L & x <= 5L)
test(1529.06, between(x, 1L, 5L, FALSE), x > 1L & x < 5L)
x = runif(15)
test(1529.07, between(x, 0.25, 0.75, incbounds=TRUE), x >= 0.25 & x <= 0.75)
test(1529.08, x %between% c(0.25, 0.75), x >= 0.25 & x <= 0.75)
test(1529.09, between(x, 0.25, 0.75, incbounds=FALSE), x > 0.25 & x < 0.75)
test(1529.10, between(x, 0.25, NA, NAbounds=NA), ifelse(x<=0.25, FALSE, NA))
test(1529.11, between(x, NA, 0.75, NAbounds=NA), ifelse(x>=0.75, FALSE, NA))
test(1529.12, between(x, NA, NA, NAbounds=NA), rep(NA, length(x)))
test(1529.13, between(x, NA, NA, NAbounds=TRUE), rep(TRUE, length(x)))
test(1529.14, between(x, x[3], NA, incbounds=FALSE, NAbounds=NA), ifelse(x<=x[3], FALSE, NA))
test(1529.15, between(x, x[3], NA, incbounds=TRUE, NAbounds=NA), ifelse(x<x[3], FALSE, NA))
test(1529.16, between(x, NA, x[9], incbounds=FALSE, NAbounds=NA), ifelse(x>=x[9], FALSE, NA))
test(1529.17, between(x, NA, x[9], incbounds=TRUE, NAbounds=NA), ifelse(x>x[9], FALSE, NA))
x = c(NA, runif(15), NA)
test(1529.18, between(x, 0.25, 0.75, incbounds=TRUE), x >= 0.25 & x <= 0.75)
test(1529.19, x %between% c(0.25, 0.75), x >= 0.25 & x <= 0.75)
test(1529.20, between(x, 0.25, 0.75, incbounds=FALSE), x > 0.25 & x < 0.75)
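# Illustrative sketch (not a numbered test): between(x, lower, upper) is a vectorised
# lower <= x & x <= upper; incbounds=FALSE makes both bounds strict and %between% is the
# inclusive operator form. Local illus_* name is ours for illustration and removed afterwards.
illus_x = c(0, 1, 2, 3)
stopifnot(identical(between(illus_x, 1, 3), c(FALSE, TRUE, TRUE, TRUE)),
          identical(between(illus_x, 1, 3, incbounds=FALSE), c(FALSE, FALSE, TRUE, FALSE)),
          identical(illus_x %between% c(1, 3), c(FALSE, TRUE, TRUE, TRUE)))
rm(illus_x)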
# add tests for which.first and which.last
# which.first
test(1530.1, which.first(sample(5, 20, TRUE)), error = "x not boolean")
x <- sample(c(TRUE, FALSE), 20, TRUE)
test(1530.2, which.first(x), which(x)[1L])
# which.last
test(1530.3, which.last(1:5), error = "x not boolean")
test(1530.4, which.last(x), tail(which(x), 1L))
# test for like, %like%, %ilike%, %flike%
set.seed(2L)
x = apply(matrix(sample(letters, 12), nrow=2), 1, paste, collapse="")
y = factor(sample(c(letters[1:5], x), 20, TRUE))
xsub = substring(x, 1L, 1L)
test(1532.1, y %like% xsub[1L], grepl(xsub[1L], y))
test(1532.2, y %like% xsub[2L], grepl(xsub[2L], y))
test(1532.3, like(y, xsub[1L]), grepl(xsub[1L], y))
test(1532.4, like(y, xsub[2L]), grepl(xsub[2L], y))
## %ilike% and %flike% for #3333
x = c('HEY', 'hey', '()')
test(1532.5, like(x, 'hey', ignore.case = TRUE), c(TRUE, TRUE, FALSE))
test(1532.6, like(x, '()'), c(TRUE, TRUE, TRUE))
test(1532.7, like(x, '()', fixed = TRUE), c(FALSE, FALSE, TRUE))
test(1532.8, x %ilike% 'hey', c(TRUE, TRUE, FALSE))
test(1532.9, x %flike% '()', c(FALSE, FALSE, TRUE))
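# Illustrative sketch (not a numbered test): like()/%like% wrap grepl(); %ilike% is the
# ignore.case=TRUE form and %flike% is the fixed=TRUE form (no regex metacharacters).
# Local illus_* name is ours for illustration and removed afterwards.
illus_s = c("Alpha", "alphabet", "a+b")
stopifnot(identical(illus_s %like%  "^alpha", c(FALSE, TRUE, FALSE)),
          identical(illus_s %ilike% "^alpha", c(TRUE,  TRUE, FALSE)),
          identical(illus_s %flike% "a+b",    c(FALSE, FALSE, TRUE)))
rm(illus_s)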
# coverage for setkey() to 100%
dt1 = data.table(x=sample(5), y=1:5, key="y")
dt2 = as.data.table(dt1); setattr(dt2, 'sorted', NULL)
test(1533.1, setkeyv(dt1, character(0)), dt2, warning = "cols is a character vector")
test(1533.2, setkeyv(dt1, "x", verbose=TRUE), setkey(dt2, x), output = "forder took")
# test 1534 removed as custom %+% has been removed
# test 1535 removed as internal trim() not used
# remaining test for covering duplicated.data.table
dt = data.table(x=1:5, y=6:10)
test(1536, duplicated(dt, incomparables=TRUE), error = "argument 'incomparables != FALSE'")
# test for covering melt 100%
test(1537, names(melt(dt, id.vars=1L, variable.name = "x", value.name="x")), c("x", "x.1", "x.2"), output = "Duplicate column names")
# test for tables()
test(1538, tables(), output = "Total:")
# uniqueN does not support list-of-list: reverted #1224
d1 <- data.table(a = 1:4, l = list(list(letters[1:2]),list(Sys.time()),list(1:10),list(letters[1:2])))
test(1539, d1[,uniqueN(l)], error = "x must be an atomic vector or data.frames/data.tables")
# feature #1130 - joins without setting keys
# can't test which=TRUE with DT1.copy's results
set.seed(45L)
DT1 = data.table(x=sample(letters[1:3], 15, TRUE), y=sample(6:10, 15, TRUE),
a=sample(100, 15), b=runif(15))
DT2 = CJ(x=letters[1:3], y=6:10)[, mul := sample(20, 15)][sample(15L, 5L)]
DT3 = rbindlist(list(DT2, list(x="d", y=7L, mul=100L)))
DT3 = DT3[sample(nrow(DT3))]
# key on char column
DT1.copy = copy(DT1)
setkey(DT1.copy, x)
test(1540.01, DT1[DT2, on=c(x="x")], DT1.copy[DT2])
test(1540.02, DT1[DT2, on=c("x")], DT1.copy[DT2])
test(1540.03, DT1[DT2, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(x="x"), .SDcols=c("a", "b")],
DT1.copy[DT2, lapply(.SD, function(x) x * mul),
by=.EACHI, .SDcols=c("a", "b")])
test(1540.04, DT1[DT3, on=c(x="x")], DT1.copy[DT3])
test(1540.05, DT1[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(x="x"), .SDcols=c("a", "b")],
DT1.copy[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, .SDcols=c("a", "b")])
test(1540.06, DT1[DT3, on=c(x="x"), nomatch=0L], DT1.copy[DT3, nomatch=0L])
test(1540.07, DT1[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(x="x"), .SDcols=c("a", "b"), nomatch=0L],
DT1.copy[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, .SDcols=c("a", "b"), nomatch=0L])
test(1540.08, DT1[DT3, on=c(x="x"), roll=TRUE], DT1.copy[DT3, roll=TRUE])
test(1540.09, DT1[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(x="x"), .SDcols=c("a", "b"), roll=TRUE],
DT1.copy[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, .SDcols=c("a", "b"), roll=TRUE])
# key on integer col
DT1.copy = copy(DT1)
setkey(DT1.copy, y)
test(1540.10, DT1[DT2, on=c(y="y")], DT1.copy[DT2[, c(2,1,3), with=FALSE]])
test(1540.11, DT1[DT2, on=c("y")], DT1.copy[DT2[, c(2,1,3), with=FALSE]])
test(1540.12, DT1[DT2, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(y="y"), .SDcols=c("a", "b")],
DT1.copy[DT2[, c(2,1,3), with=FALSE], lapply(.SD, function(x) x * mul),
by=.EACHI, .SDcols=c("a", "b")])
test(1540.13, DT1[DT3, on=c(y="y")], DT1.copy[DT3[, c(2,1,3), with=FALSE]])
test(1540.14, DT1[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(y="y"), .SDcols=c("a", "b")],
DT1.copy[DT3[, c(2,1,3), with=FALSE], lapply(.SD, function(x) x * mul),
by=.EACHI, .SDcols=c("a", "b")])
test(1540.15, DT1[DT3, on=c(y="y"), nomatch=0L], DT1.copy[DT3[, c(2,1,3), with=FALSE], nomatch=0L])
test(1540.16, DT1[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(y="y"), .SDcols=c("a", "b"), nomatch=0L],
DT1.copy[DT3[, c(2,1,3), with=FALSE], lapply(.SD, function(x) x * mul),
by=.EACHI, .SDcols=c("a", "b"), nomatch=0L])
test(1540.17, DT1[DT3, on=c(y="y"), roll=TRUE], DT1.copy[DT3[, c(2,1,3), with=FALSE], roll=TRUE])
test(1540.18, DT1[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(y="y"), .SDcols=c("a", "b"), roll=TRUE],
DT1.copy[DT3[, c(2,1,3), with=FALSE], lapply(.SD, function(x) x * mul),
by=.EACHI, .SDcols=c("a", "b"), roll=TRUE])
# multiple keys
DT1.copy = copy(DT1)
setkey(DT1.copy, x, y)
test(1540.19, DT1[DT2, on=c(x="x", y="y")], DT1.copy[DT2])
test(1540.20, DT1[DT2, on=c("x", "y")], DT1.copy[DT2])
test(1540.21, DT1[DT2, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(x="x", y="y")],
DT1.copy[DT2, lapply(.SD, function(x) x * mul), by=.EACHI])
test(1540.22, DT1[DT3, on=c(x="x", y="y")], DT1.copy[DT3])
test(1540.23, DT1[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(x="x", y="y")],
DT1.copy[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI])
test(1540.24, DT1[DT3, on=c(x="x", y="y"), nomatch=0L], DT1.copy[DT3, nomatch=0L])
test(1540.25, DT1[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(x="x", y="y"), nomatch=0L],
DT1.copy[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, nomatch=0L])
test(1540.26, DT1[DT3, on=c(x="x", y="y"), roll=TRUE], DT1.copy[DT3, roll=TRUE])
test(1540.27, DT1[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(x="x", y="y"), roll=TRUE],
DT1.copy[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, roll=TRUE])
# multiple keys, non-identical names
DT1.copy = copy(DT1)
setkey(DT1.copy, x, y)
setnames(DT2, c("q", "r", "mul"))
setnames(DT3, names(DT2))
test(1540.28, DT1[DT2, on=c(x="q", y="r")], DT1.copy[DT2])
test(1540.29, DT1[DT2, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(x="q", y="r")],
DT1.copy[DT2, lapply(.SD, function(x) x * mul), by=.EACHI])
test(1540.30, DT1[DT3, on=c(x="q", y="r")], DT1.copy[DT3])
test(1540.31, DT1[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(x="q", y="r")],
DT1.copy[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI])
test(1540.32, DT1[DT3, on=c(x="q", y="r"), nomatch=0L], DT1.copy[DT3, nomatch=0L])
test(1540.33, DT1[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(x="q", y="r"), nomatch=0L],
DT1.copy[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, nomatch=0L])
test(1540.34, DT1[DT3, on=c(x="q", y="r"), roll=TRUE], DT1.copy[DT3, roll=TRUE])
test(1540.35, DT1[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, on=c(x="q", y="r"), roll=TRUE],
DT1.copy[DT3, lapply(.SD, function(x) x * mul),
by=.EACHI, roll=TRUE])
## more coverage tests for by = .EACHI, on = c(LHS = 'RHS') for numeric type
set.seed(45L)
DT1 = data.table(x=sample(letters[1:3], 15, TRUE), y=sample(6:10, 15, TRUE),
a=sample(100, 15), b=runif(15))
DT2 = CJ(x=letters[1:3], y=6:10)[, mul := sample(20, 15)][sample(15L, 5L)]
DT3 = rbindlist(list(DT2, list(x="d", y=7L, mul=100L)))
DT3 = DT3[sample(nrow(DT3))]
DT1[ , x_num := match(x, letters) + .1]
DT3[ , x_num := match(x, letters) + .1]
test(1540.36, DT1[DT3[1:3], .(y = x_num), by=.EACHI, on=c(x_num="x_num")],
data.table(x_num = c(3.1, 4.1, 3.1), y = c(2.1, NA, NA)))
# to do: add tests for :=
# fix for #477, key not being retained on joins on factor columns
set.seed(1)
dtp <- data.table(pid = gl(3, 3, labels = c("du", "i", "nouana")),
year = gl(3, 1, 9, labels = c("2007", "2010", "2012")),
val = rnorm(9), key = c("pid", "year"))
dtab <- data.table(pid = factor(c("i", "nouana")),
year = factor(c("2010", "2000")),
abn = sample(1:5, 2, replace = TRUE), key =
c("pid", "year"))
test(1541, key(dtp[dtab]), c("pid", "year"))
# fix DT[TRUE, :=] using too much working memory for i, #1249
if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not compiled with memory profiling enabled
f = tempfile()
N = 1000000 # or any large number of rows
DT = data.table(A=1:N, B=rnorm(N))
DT[TRUE, B := B * 2] # stabilize with initial dummy update
Rprofmem(f)
DT[TRUE, B := B * 2] # or some in-place update
Rprofmem(NULL)
test(1542, length(grep("000",readLines(f, warn=FALSE))), 1L) # one allocation for the RHS only
unlink(f)
}
# DT[TRUE] should shallow copy as v1.11.8 and earlier did (#3214); in future more will shallow copy too
DT = data.table(id = 1:5, key="id")
DT1 = DT[TRUE]
test(1542.01, address(DT1)!=address(DT))
test(1542.02, address(DT1$id)==address(DT$id))
test(1542.03, key(DT), "id")
test(1542.04, key(DT1), "id")
DT1[,newCol:=6:10]
test(1542.05, DT, data.table(id=1:5, key="id"))
test(1542.06, DT1, data.table(id=1:5, newCol=6:10, key="id"))
DT1[3, id:=6L]
test(1542.07, DT1, data.table(id=INT(1,2,6,4,5), newCol=6:10))
# current wrong behaviour (invalid key, #3215); root cause is the shallow copy exposed uniquely via DT[TRUE]
test(1542.08, DT$id, INT(1,2,6,4,5))
test(1542.09, key(DT), "id")
# future correct behaviour :
# test(1542.10, DT$id, 1:5)
# test(1542.11, key(DT), "id")
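# Illustrative sketch, not executed (wrapped in if (FALSE) so it cannot affect the suite): until
# DT[TRUE] deep copies as the commented-out tests above anticipate, callers can avoid the column
# sharing shown in 1542.02-1542.09 with an explicit copy(). Object names here are hypothetical and
# used only for illustration.
if (FALSE) {
  origDT  = data.table(id=1:5, key="id")
  aliasDT = origDT[TRUE]        # shallow copy: the id column is shared with origDT
  deepDT  = copy(origDT)        # deep copy: nothing is shared
  address(aliasDT$id) == address(origDT$id)   # TRUE, as in test 1542.02
  address(deepDT$id)  == address(origDT$id)   # FALSE; updating deepDT by reference leaves origDT intact
}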
# rest of #1130 - merge doesn't copy, instead uses joins without keys.
set.seed(1L)
d1 <- data.table(A = sample(letters[1:10]), X = 1:10, total = TRUE)
d2 <- data.table(A = sample(letters[5:14]), Y = 1:10, total = FALSE)
ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), by="A"))
ans2 <- setDF(merge(setDT(d1), setDT(d2), by="A"))
test(1543.1, ans1, ans2)
ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), all=TRUE, by="A"))
ans2 <- setDF(merge(setDT(d1), setDT(d2), all=TRUE, by="A"))
test(1543.2, ans1, ans2)
# test duplicate name cases
setnames(d2, c("A", "Y"), c("B", "A"))
ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), by.x="B", by.y="A"))
ans2 <- setDF(merge(setDT(d2), setDT(d1), by.x="B", by.y="A"))
test(1543.3, ans1, ans2)
ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), all=TRUE, by.x="B", by.y="A"))
ans2 <- setDF(merge(setDT(d2), setDT(d1), all=TRUE, by.x="B", by.y="A"))
test(1543.4, ans1, ans2)
# test for sort=FALSE argument, #1282
set.seed(1L)
d1 <- data.table(A = sample(letters[1:10]), X = 1:10, total = TRUE)
d2 <- data.table(A = sample(letters[5:14]), Y = 1:10, total = FALSE)
test(1543.5, merge(setDT(d1), setDT(d2), by="A", sort=FALSE),
setDT(merge(setDF(d1), setDF(d2), by="A", sort=FALSE)))
# thinko in merge dupnames handling
dt1 = data.table(x=1:5, y1=2L, y2=3L)
dt2 = data.table(a=4:6, y2=TRUE, y1 = FALSE)
test(1543.6, setDF(merge(dt1, dt2, by.x="x", by.y="a")),
merge(as.data.frame(dt1), as.data.frame(dt2), by.x="x", by.y="a"))
# fix #1290, restore colorder before setting names
set.seed(1)
dt1 <- data.table(sex = rep(1:2, 5), group = rep(letters[1:5], 2),V1 = sample(1:10))
set.seed(2)
dt2 <- data.table(group = rep(letters[1:5], 2),sex = rep(1:2, 5),V2 = sample(1:10))
test(1543.7, setDF(merge(dt1, dt2, by = c("sex", "group"))),
merge(as.data.frame(dt1), as.data.frame(dt2), by=c("sex", "group")))
by.x = c("sex.1", "group.1")
by.y = c("sex.2", "group.2")
setnames(dt1, 1:2, by.x)
setnames(dt2, 1:2, rev(by.y))
test(1543.8, setDF(merge(dt1, dt2, by.x=by.x, by.y=by.y)),
merge(as.data.frame(dt1), as.data.frame(dt2), by.x=by.x, by.y=by.y))
# fix for #1258 (bug on .shallow - retains keys when it shouldn't)
# nice catch and excellent report from @and3k
x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L))
y <- data.table(a2 = 1:3)
setkey(y, a2)
setkey(x1, a1, a2)
test(1544.1, setDF(merge(x1, y)), merge(as.data.frame(x1), as.data.frame(y)))
test(1544.2, setDF(merge(x1, y, by="a2")), merge(as.data.frame(x1), as.data.frame(y), by="a2"))
# also test shallow here so as to catch future regressions
x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), a3 = c(TRUE, FALSE, TRUE), key="a1,a2")
test(1545.01, key(.shallow(x1, cols="a2")), NULL)
test(1545.02, key(.shallow(x1, retain.key=FALSE)), NULL)
test(1545.03, key(.shallow(x1, cols = "a1", retain.key=FALSE)), NULL)
test(1545.04, key(.shallow(x1, retain.key=TRUE)), key(x1))
test(1545.05, key(.shallow(x1, cols="a1", retain.key=TRUE)), "a1")
# tests for #2336. .shallow drops keys unnecessarily
test(1545.06, key(.shallow(x1, cols=c("a1", "a3"), retain.key=TRUE)), "a1")
test(1545.07, .shallow(x1, cols=c("a3", "a1"), retain.key=TRUE), .shallow(x1, cols=c("a3", "a1"), retain.key=TRUE))
test(1545.08, key(.shallow(x1, cols=c("a1", "a2", "a3"), retain.key=TRUE)), c("a1", "a2"))
test(1545.09, key(.shallow(x1, cols=c("a2", "a3"), retain.key=TRUE)), NULL)
test(1545.10, key(.shallow(x1, cols=c("a2"), retain.key=TRUE)), NULL)
test(1545.11, key(.shallow(x1, cols=c("a3"), retain.key=TRUE)), NULL)
setkey(x1, NULL)
test(1545.12, key(.shallow(x1, retain.key=TRUE)), NULL)
test(1545.13, key(.shallow(x1, retain.key=FALSE)), NULL)
test(1545.14, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), NULL)
test(1545.15, key(.shallow(x1, cols=c("a1", "a2"), retain.key=FALSE)), NULL)
x1 <- x1[0]
test(1545.16, key(.shallow(x1, retain.key=TRUE)), NULL)
test(1545.17, key(.shallow(x1, retain.key=FALSE)), NULL)
test(1545.18, key(.shallow(x1, cols = c("a1"), retain.key=FALSE)), NULL)
test(1545.19, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), NULL)
setkey(x1, a1)
test(1545.20, key(.shallow(x1, retain.key=FALSE)), NULL)
test(1545.21, key(.shallow(x1, cols = "a2", retain.key=FALSE)), NULL)
test(1545.22, key(.shallow(x1, retain.key=TRUE)), "a1")
test(1545.23, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), "a1")
test(1545.24, key(.shallow(x1, cols=c("a3"), retain.key=TRUE)), NULL)
# tests for #2336. .shallow now retains indices as well
x1 <- data.table(a1 = c('a', 'a', 'a', 'a', 'b', 'c'),
a2 = c(1L, 1L, 1L, 2L, 2L, 2L),
a3 = c(FALSE, TRUE, FALSE, FALSE, FALSE, TRUE))
setindex(x1, a1, a2, a3)
setindex(x1, a1, a3)
setindex(x1, a1, a2) ## index with length 0
test(1545.25, indices(.shallow(x1, retain.key=FALSE)), NULL)
test(1545.26, indices(.shallow(x1, cols = "a2", retain.key=FALSE)), NULL)
test(1545.27, indices(.shallow(x1, retain.key=TRUE)), indices(x1))
test(1545.28, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1__a2__a3")], c("a1", "a2", "a3")), integer(0))
test(1545.29, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0))
test(1545.30, forderv(.shallow(x1, retain.key=TRUE), c("a1", "a2")), integer(0))
test(1545.31, indices(.shallow(x1, cols = "a1", retain.key=TRUE)), c("a1"))
test(1545.32, forderv(.shallow(x1, cols = "a1", retain.key=TRUE), c("a1")), integer(0))
test(1545.33, attributes(attr(.shallow(x1, cols = c("a1", "a2"), retain.key = TRUE), "index", exact = TRUE)), attributes(attr(.shallow(x1, cols = c("a2", "a1"), retain.key = TRUE), "index", exact = TRUE)))
test(1545.34, indices(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)), c("a1__a2"))
test(1545.35, forderv(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE), c("a1", "a2")), integer(0))
test(1545.36, indices(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)), c("a1__a3", "a1"))
test(1545.37, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), c("a1")), integer(0))
test(1545.38, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0))
test(1545.39, indices(.shallow(x1, cols = c("a2", "a3"), retain.key=TRUE)), NULL)
test(1545.40, indices(.shallow(x1, cols = c("a3"), retain.key=TRUE)), NULL)
test(1545.41, .shallow(x1, cols = c("a1", "a2", "a3"), retain.key=TRUE), .shallow(x1, retain.key=TRUE))
# test for #1234
df1 = df2 = data.frame(cats = rep(c('', ' ', 'meow'), 5), stringsAsFactors = TRUE)
df2[grep("^[ ]*$", df2$cats), "cats"] = NA_integer_
test(1546, set(df1, grep("^[ ]*$", df1$cats), 1L, NA_integer_), df2)
# Add test for getdots() function (although it doesn't seem to be used anywhere)
foo <- function(x, y, ...) { getdots() }
test(1547, foo(1L, 5L, a=2L, "c"), c("2", "c"))
# Fix for encoding issues in windows, #563
f = testDir("issue_563_fread.txt")
ans1 <- fread(f, sep=",", header=TRUE)
ans2 <- fread(f, sep=",", header=TRUE, encoding="UTF-8")
test(1548.1, unique(unlist(lapply(ans1, Encoding))), "unknown")
test(1548.2, unique(unlist(lapply(ans2, Encoding))), "UTF-8")
# #1167 print.data.table row id in non-scientific notation
DT <- data.table(a = rep(1:5,3*1e5), b = rep(letters[1:3],5*1e5))
test(1549, capture.output(print(DT)), c(" a b", " 1: 1 a", " 2: 2 b", " 3: 3 c", " 4: 4 a", " 5: 5 b", " --- ", "1499996: 1 b", "1499997: 2 c", "1499998: 3 a", "1499999: 4 b", "1500000: 5 c"))
rm(DT)
# PR by @dselivanov
# fixes #504 - handle nastring while reading (without coercion to character)
# Note: this doesn't address cases like na.strings="-999" yet. See https://github.com/Rdatatable/data.table/pull/1236 for those examples.
K = 10L
nastrings = c('null', 'NULL', 'na', '_NA', 'NA', 'nan', 'Nan', 'NAN', 'NaN')
DT = data.table(int = 1:K,
char = sample(letters, size = K, replace = TRUE),
float = 1:K + 0.1,
bool = sample( c(TRUE, FALSE), K, replace = TRUE))
DT_NA = DT
for (j in seq_len( ncol(DT) )) {
set(x = DT_NA, i = j, j = j, value = NA)
}
for(k in seq_along(nastrings)) {
dt0 = copy(DT)
for (j in seq_len( ncol(DT) )) {
set(x = dt0, i = NULL, j = j, value = as.character(dt0[[j]]))
set(x = dt0, i = j, j = j, value = nastrings[[k]])
}
str = do.call(paste, c(dt0, collapse="\n", sep=","))
str = paste(paste(names(dt0), collapse=","), str, sep="\n")
DT_fread = fread(str, na.strings = nastrings, verbose = FALSE)
test(1550 + k * 0.1, DT_fread, DT_NA)
}
# FR #568
str = "a,b\n1.5,\"at the 5\" end of the gene.\""
test(1551.1, fread(str), data.table(a = 1.5, b = "at the 5\" end of the gene."), warning=w<-"resolved improper quoting")
#1256
str = "x,y\nx1,\"oops\" y1\n"
test(1551.2, fread(str), data.table(x = "x1", y = "\"oops\" y1"), warning=w)
str = "x,y\nx1,\"oops\" y1"
test(1551.3, fread(str), data.table(x = "x1", y = "\"oops\" y1"), warning=w)
#1077
str = '2,3\n""foo,bar'
test(1551.4, fread(str), data.table(V1=c("2","\"\"foo"), V2=c("3","bar")), warning=w)
#1079
str = 'L1\tsome\tunquoted\tstuff\nL2\tsome\t"half" quoted\tstuff\nL3\tthis\t"should work"\tok though'
test(1551.5, fread(str),
data.table(L1 = c("L2", "L3"), some = c("some", "this"), unquoted = c("\"half\" quoted", "should work"), stuff = c("stuff", "ok though")),
warning = w)
#1095
rhs = read.table(testDir("issue_1095_fread.txt.bz2"), sep=",", comment.char="", stringsAsFactors=FALSE, quote="", strip.white=TRUE)
if (test_R.utils) test(1551.6, fread(testDir("issue_1095_fread.txt.bz2"), logical01=FALSE), setDT(rhs), warning=w)
# FR #1314 rest of na.strings issue
str = "a,b,c,d\n#N/A,+1,5.5,FALSE\n#N/A,5,6.6,TRUE\n#N/A,+1,#N/A,-999\n#N/A,#N/A,-999,FALSE\n#N/A,1,NA,TRUE"
read_table = function(str, ...) {
setDT(read.table(text=str, stringsAsFactors=FALSE, comment.char="", sep=",", header=TRUE, ...))[]
}
test(1552.1, fread(str, na.strings="#N/A"), read_table(str, na.strings="#N/A"))
test(1552.2, fread(str, na.strings=c("#N/A", "-999")), read_table(str, na.strings=c("#N/A", "-999")))
test(1552.3, fread(str, na.strings=c("#N/A", "-999", "+1")), read_table(str, na.strings=c("#N/A", "-999", "+1")))
test(1552.4, fread(str, na.strings=c("#N/A", "-999", "+1", "1")),
error="NAstring <<1>> is recognized as type boolean.*not permitted")
test(1552.5, fread(str, na.strings=c("#N/A", "-999", "FALSE")), error="NAstring <<FALSE>>.*boolean.*not permitted")
test(1552.6, fread("A\n1.0\n2\n-", na.strings=c("-")), data.table(A=c(1.0, 2.0, NA)))
# FR #1177: 'quote' option of 'print.data.table'
DT1 <- data.table(s1=paste(" ",LETTERS[1:5],sep=""),s2=LETTERS[1:5])
ans1 <- c(" \"s1\" \"s2\"", "1: \" A\" \"A\"",
"2: \" B\" \"B\"", "3: \" C\" \"C\"",
"4: \" D\" \"D\"", "5: \" E\" \"E\"")
ans2 <- c(" s1 s2","1: A A","2: B B",
"3: C C","4: D D","5: E E")
test(1553.1, capture.output(print(DT1, quote = TRUE)), ans1)
test(1553.2, capture.output(print(DT1)), ans2)
# #826 - subset DT on single integer vector stored as matrix the same way as data.frame
dt <- data.table(a=letters[1:10])
idx <- c(2:4,7L,9:10)
dim(idx) <- c(6L, 1L)
dimnames(idx) <- list(NULL, "Resample1") # as in caret::createDataPartition
test(1554.1, dt[idx], data.table(a=letters[idx]))
test(1554.2, dt[-idx], data.table(a=letters[(1:10)[-idx]]))
test(1554.3, dt[!idx], data.table(a=letters[(1:10)[-idx]]))
test(1554.4, idx, structure(c(2L, 3L, 4L, 7L, 9L, 10L), .Dim = c(6L, 1L), .Dimnames = list(NULL, "Resample1")))
if (test_R.utils) {
# strip.white and other enhancements to 'fread()'
# bug #1113
ans1 <- fread(testDir("issue_1113_fread.txt.bz2"))
# some inconsistency by R version on whether the last column
# (MCMCOBJ) gets read as numeric (which it is) or as factor,
# see discussion on issue #2484; not clear exactly what changed
# in R to fix this (or when), so just test is.character
# and force numeric instead of testing the R version
ans2 <- read.table(testDir("issue_1113_fread.txt.bz2"), header=TRUE, stringsAsFactors = FALSE)
if (is.character(ans2$MCMCOBJ)) {
ans2$MCMCOBJ = as.numeric(ans2$MCMCOBJ)
}
setDT(ans2)
setnames(ans2, names(ans1))
test(1555.01, ans1, ans2)
}
# bug #1035, take care of spaces automatically. Note that the columns are also read as the proper types, and quoted fields are handled even when sep is not a space.
str1=" ITERATION THETA1 THETA2
2 3.95527E+01 2.10651E+01"
str2=" ITERATION, THETA1, THETA2
2, 3.95527E+01, 2.10651E+01"
str3=" ITERATION , THETA1 , THETA2
2 , 3.95527E+01 , 2.10651E+01"
str4=" ITERATION , THETA1 , \"THETA2\"
2 , 3.95527E+01 , 2.10651E+01"
str5=" ITERATION , THETA1 , THETA2
bla , 3.95527E+01 , 2.10651E+01"
test(1555.02, fread(str1), data.table(ITERATION=2L, THETA1=39.5527, THETA2=21.0651))
test(1555.03, fread(str2), data.table(ITERATION=2L, THETA1=39.5527, THETA2=21.0651))
test(1555.04, fread(str3), data.table(ITERATION=2L, THETA1=39.5527, THETA2=21.0651))
test(1555.05, fread(str4), data.table(ITERATION=2L, THETA1=39.5527, THETA2=21.0651))
test(1555.06, fread(str5), data.table(ITERATION="bla", THETA1=39.5527, THETA2=21.0651))
# without strip.white
# when sep==' ' as in str1, header col spaces should still be stripped even when strip.white=FALSE
test(1555.07, fread(str1, strip.white=FALSE), data.table(ITERATION=2L, THETA1=39.5527, THETA2=21.0651))
test(1555.08, names(fread(str2, strip.white=FALSE)), c(" ITERATION"," THETA1"," THETA2"))
test(1555.09, names(fread(str3, strip.white=FALSE)), c(" ITERATION "," THETA1 "," THETA2"))
test(1555.10, names(fread(str4, strip.white=FALSE)), c(" ITERATION "," THETA1 "," \"THETA2\""))
# bug #1035, reply to the post from another user
str1=" 22 4 6 4\n 34 22 34 5\n 6 2 1 4\n"
str2="22 4 6 4\n34 22 34 5\n6 2 1 4\n"
test(1555.11, fread(str1), fread(str2))
if (test_R.utils) {
# bug #785
rhs <- setDT(read.table(testDir("issue_785_fread.txt.gz"), header=TRUE, stringsAsFactors=FALSE, sep="\t", strip.white=TRUE))
test(1555.12, fread(testDir("issue_785_fread.txt.gz"), logical01=FALSE), rhs)
}
# bug #529, http://stackoverflow.com/questions/22229109/r-data-table-fread-command-how-to-read-large-files-with-irregular-separators
str1=" YYYY MM DD HH mm 19490 40790
1991 10 1 1 0 1.046465E+00 1.568405E+00"
str2="YYYY MM DD HH mm 19490 40790
1991 10 1 1 0 1.046465E+00 1.568405E+00"
test(1555.13, fread(str1), fread(str2))
# fix for #1330
test(1556.1, fread(testDir("issue_1330_fread.txt"), nrows=2L), ans<-data.table(a=1:2, b=1:2))
test(1556.2, fread(testDir("issue_1330_fread.txt"), nrows=3L), ans, warning=w<-"Stopped early on line 4. Expected 2.*found 0.*First discarded non-empty line: <<3.*3>>")
test(1556.3, fread(testDir("issue_1330_fread.txt"), nrows=4L), ans, warning=w)
# FR #768
str="1,2\n3,4\n"
test(1557.1, names(fread(str)), c("V1", "V2")) # autonamed
test(1557.2, names(fread(str, col.names=letters[1:2])), letters[1:2])
test(1557.3, names(fread(str, col.names=letters[1])), error="Can't assign 1 names to")
test(1557.4, names(fread(str, col.names=letters[1:3])), error="Can't assign 3 names to")
test(1557.5, names(fread(str, col.names=1:2)), error="Passed a vector of type")
# Fix for #773
f = testDir("issue_773_fread.txt")
ans = data.table(AAA=INT(c(4,7,rep(1,17),31,21)),
BBB=INT(c(5,8,rep(2,17),32,22)),
CCC=INT(c(6,9,rep(3,17),33,23)))
test(1558.1, fread(f), ans, warning=w<-"Stopped early on line 23. Expected 3 fields but found 2[.].*First discarded non-empty line: <<ZZZ.*YYY>>")
test(1558.2, fread(f, nrows=21L), ans)
test(1558.3, fread(f, nrows=21L, fill=TRUE), ans)
test(1558.4, fread(f, nrows=22L), ans, warning=w)
test(1558.5, fread(f, nrows=22L, fill=TRUE), rbind(ans, list("ZZZ","YYY",NA)))
# FR # 1338 -- check.names argument of setDT
ans=data.table(X=1:3,"X.1"=1:3)
dt1<-data.table(X=1:3,X=1:3)
df1<-data.frame(X=1:3,X=1:3,check.names=FALSE)
ls1<-list("X"=1:3,"X"=1:3)
test(1559.1, setDT(dt1, check.names=TRUE), ans)
test(1559.2, setDT(df1, check.names=TRUE), ans)
test(1559.3, setDT(ls1, check.names=TRUE), ans)
# Fix #1140
test(1560.1, data.table(x=letters[1:5])[, 0, with=FALSE], null.data.table())
test(1560.2, data.table(x=letters[1:5])[, c(0,FALSE), with=FALSE], null.data.table())
# Fix for #1298
d = data.table(a = 1)
q = quote(.(a))
test(1561, d[, 1, by = eval(q)], d[, 1, by = .(a)])
# Fix for #1315
d = as.IDate(seq(as.Date("2015-01-01"), as.Date("2015-01-15"), by='1 day'))
test(1562.1, as.list(d), lapply(as.list(as.Date(d)), as.IDate))
test(1562.2, sapply(d, identity), as.integer(sapply(as.Date(d), identity)))
# old test 1563 was moved to melt section and is now 1035.22
# Fix for #1216, .SDcols and with=FALSE should evaluate within frame of 'x' only when it's of the form a:b; was test 1557
dt = data.table(index1=1:10, index2=10:1, index3=1, s=4, i=24)
i = 2L
test(1563.101, dt[, paste0("index", 1:i), with=FALSE], dt[, index1:index2, with=FALSE])
test(1563.102, dt[, paste0("index", 1:i), with=FALSE], dt[, 1:2, with=FALSE])
test(1563.103, dt[, 5:4, with=FALSE], dt[, i:s, with=FALSE])
test(1563.104, dt[, .SD, .SDcols=paste0("index", 1:i)], dt[, .SD, .SDcols=index1:index2])
# fix for #1354; was test 1558
test(1563.201, as.ITime(NA), setattr(NA_integer_, 'class', 'ITime'))
# fix for #1352; was test 1560
dt1 = data.table(a=1:5, b=6:10, c=11:15)
dt2 = data.table(a=3:6, b=8:11, d=1L)
by_cols = c(x="a", y="b")
test(1563.301, merge(dt1,dt2, by=by_cols, sort=FALSE), dt1[dt2, nomatch=0L, on=unname(by_cols)])
# FR #1353; was test 1561
DT = data.table(x=c(20,10,10,30,30,20), y=c("a", "a", "a", "b", "b", "b"), z=1:6)
test(1563.401, rowid(DT$x), as.integer(c(1,1,2,1,2,2)))
test(1563.402, rowidv(DT, cols="x"), as.integer(c(1,1,2,1,2,2)))
test(1563.403, rowid(DT$x, prefix="group"), paste("group", as.integer(c(1,1,2,1,2,2)), sep=""))
test(1563.404, rowid(DT$x, DT$y), as.integer(c(1,1,2,1,2,1)))
test(1563.405, rowidv(DT, cols=c("x","y")), as.integer(c(1,1,2,1,2,1)))
# convenient usage with dcast
test(1563.406, dcast(DT, x ~ rowid(x, prefix="group"), value.var="z"), data.table(x=c(10,20,30), group1=c(2L,1L,4L), group2=c(3L,6L,5L), key="x"))
# Fix for #1346; was test 1562
DT = data.table(id=1:3, g1=4:6, g2=7:9)
test(1563.501, melt(DT, measure.vars=patterns("^g[12]"), variable.factor=FALSE), data.table(id=1:3, variable=rep(c("g1","g2"),each=3L), value=4:9))
# fix for #1341
dt <- data.table(a = 1:10)
test(1564.1, truelength(dt[, .SD]), 1025L)
test(1564.2, truelength(dt[a==5, .SD]), 1025L)
test(1564.3, dt[a==5, .SD][, b := 1L], data.table(a=5L, b=1L))
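# Illustrative note, not executed: 1025L above is ncol(dt) plus the default column over-allocation,
# assuming getOption("datatable.alloccol") is still its default of 1024L; that spare space is what
# lets test 1564.3 add column b by reference without copying.
if (FALSE) {
  getOption("datatable.alloccol")   # 1024L by default: 1024 spare column slots + 1 column = 1025L
}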
# Fix for #1251, DT[, .N, by=a] and DT[, .(.N), by=a] uses GForce now
dt = data.table(a=sample(3,20,TRUE), b=1:10)
options(datatable.optimize = 0L)
test(1565.1, ans <- dt[, .N, by=a, verbose=TRUE], output="All optimizations are turned off")
options(datatable.optimize = 1L)
test(1565.2, dt[ , .N, by=a, verbose=TRUE], ans, output="lapply optimization is on, j unchanged")
options(datatable.optimize = Inf)
test(1565.3, dt[ , .N, by=a, verbose=TRUE], ans, output = "GForce optimized j to")
# Fix for #1212
set.seed(123)
dt <- data.table(a=c("abc", "def", "ghi"), b=runif(3))[, c:=list(list(data.table(d=runif(1), e=runif(1))))]
test(1566.1, dt[, c], dt[, get("c")])
test(1566.2, dt[, .(c=c)], dt[, .(c=get("c"))])
test(1566.3, address(dt$c) == address(dt[, get("c")]), FALSE)
# Fix for #1207
d1 <- data.table(a = character(), b = list())
test(1567.1, d1[, b, by=a], d1)
test(1567.2, d1[, b, keyby=a], data.table(d1, key="a"))
# Fix for #1334
dt = data.table(x=ordered(rep(1:3,each=5)),y=ordered(rep(c("B","A","C"),5),levels=c("B","A","C")),z=1:15)
test(1568, dt[, sum(z), keyby=.(I(x), I(y))], data.table(I=I(ordered(rep(1:3,each=3))), I.1=I(ordered(rep(c("B","A","C"),3),levels=c("B","A","C"))),V1=c(5L, 7L, 3L, 17L, 8L, 15L, 13L, 25L, 27L), key=c("I", "I.1")))
# Old tests 1569-71 were moved to melt section and are now 1035-37
# fix for #1378, merge retains class of first argument
X = data.table(a=1:3, b=4:6)
Y = data.table(a=1L, c=5L)
setattr(Y, 'class', c("custom","data.table","data.frame"))
test(1570.1, class(merge(X, Y, all=TRUE, by="a")), class(X))
test(1570.2, class(merge(Y, X, all=TRUE, by="a")), class(Y))
A = data.table(x = c(1, 2, 3), y = c(4, 5, 6))
B = data.table(x = c(1), w = c(5))
class(A) = c("custom", "data.table", "data.frame")
test(1570.3, class(merge(A, B, by="x")), class(A))
# #1379, tstrsplit gains names argument
X = data.table(a=c("ABC", "DEFG"))
test(1571.1, names(tstrsplit(X$a, "", fixed=TRUE, names=TRUE)), paste("V", 1:4, sep=""))
test(1571.2, names(tstrsplit(X$a, "", fixed=TRUE, names=letters[1:3])), error="is not equal to ")
test(1571.3, names(tstrsplit(X$a, "", fixed=TRUE, names=letters[1:4])), letters[1:4])
# tstrsplit also gains 'keep' argument
test(1571.4, tstrsplit(X$a, "", fixed=TRUE, keep=c(2,4)), list(c("B", "E"), c(NA, "G")))
test(1571.5, tstrsplit(X$a, "", fixed=TRUE, keep=c(2,7)), error="should contain integer")
test(1571.6, tstrsplit(X$a, "", fixed=TRUE, keep=c(2,4), names=letters[1:5]), error="is not equal to")
test(1571.7, tstrsplit(X$a, "", fixed=TRUE, names=1), error="'names' must be TRUE/FALSE or a character vector")
# fix for #1367, the quote="" argument. With the default quoting, the embedded quotes in the example below cause the
# first two columns to be read as one. I couldn't find a way to avoid introducing the quote argument.
test(1572, fread('"abcd efgh." ijkl.\tmnop "qrst uvwx."\t45\n', quote=""),
setDT(read.table(text='"abcd efgh." ijkl.\tmnop "qrst uvwx."\t45\n', sep="\t", stringsAsFactors=FALSE, quote="")))
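# Illustrative sketch, not executed: per the comment above, with the default quote rules the embedded
# quotes make the first two tab-separated fields collapse into one, while quote="" disables quote
# processing entirely (as tested in 1572). The txt object is hypothetical.
if (FALSE) {
  txt = '"abcd efgh." ijkl.\tmnop "qrst uvwx."\t45\n'
  fread(txt)             # default quoting: embedded quotes interfere with field splitting
  fread(txt, quote="")   # quoting off: three fields, matching read.table(quote="") above
}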
# Old tests 1573-74 moved to melt section and are now 1037.*
# Fix for #1384, fread with empty new line, initial checks failed due to extra spaces.
test(1573, fread('a,b
4,2
'), data.table(a=4L, b=2L))
# Fix for #1375
X = data.table(a=1:3,b=4:6,c=c("foo","bar","baz"))
test(1574.1, X[.(5), on="b"], X[2])
X = data.table(A=1:3,b=4:6,c=c("foo","bar","baz"))
Y = data.table(A=2:4, B=5:7)
test(1574.2, X[Y, on=c("A",b="B")], X[Y, on=c(A="A", b="B")])
test(1574.3, X[Y, on=c(b="B", "A")], X[Y, on=c(b="B", A="A")])
test(1574.4, X["bar", on="c"], X[2L]) # missed previously
# fix for #1376
X = data.table(a=1:3,b=4:6,c=c("foo","bar","baz"))
Y = data.table(A=2:4, B=5:7)
test(1575.1, X[Y, on=c(A="a")], error="specify non existing column*.*A") # does not report 'x' or 'i' anymore after switch to colnamesInt
test(1575.2, X[Y, on=c(a="a")], error="specify non existing column*.*a")
# workaround for an issue introduced in v1.9.4, #1396
X = data.table(x=5:1, y=6:10)
setattr(X, 'index', integer(0))
setattr(attr(X, 'index'), 'x', 5:1) # auto indexed attribute as created from v1.9.4
test(1576, X[, z := 1:5, verbose=TRUE],
output = "Dropping index 'x' as.*beginning of its name.*very likely created by v1.9.4 of data.table")
# fix for #1408
X = fread("a|b|c|d
this|is|row|1
this|is|row|2
this|NA|NA|3
this|is|row|4", stringsAsFactors = TRUE)
test(1577.1, is.na(X[3, b]), TRUE)
test(1577.2, levels(X$b), "is")
X = fread("a|b|c|d
this|NA|row|1
this|NA|row|2
this|NA|NA|3
this|NA|row|4", colClasses="character", stringsAsFactors = TRUE)
test(1577.3, levels(X$b), character(0))
# FR #530, skip blank lines
input = "Header not 2 columns\n\n1,3\n2,4"
test(1578.1, fread(input), data.table(V1=1:2, V2=3:4))
input = "a,b\n\n1,3\n2,4"
test(1578.2, fread(input), data.table(V1=1:2, V2=3:4)) # the block of 2x2 dominates the one line with sep in auto-removed header section
test(1578.3, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4))
input = "a,b\n\n\n1,3\n2,4"
test(1578.4, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4))
input = "a,b\n\n\n1,3\n\n2,4\n\n"
test(1578.5, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4))
f = testDir("530_fread.txt")
test(1578.6, fread(f, skip=47L, verbose=TRUE), data.table(V1=1:2, V2=3:4), output="Positioned on line 48 starting: <<a,b>>")
test(1578.7, fread(f, skip=49L), data.table(V1=1:2, V2=3:4))
test(1578.8, fread(f, skip=47L, blank.lines.skip=TRUE), data.table(a=1:2, b=3:4))
test(1578.9, fread(f, skip=48L), data.table(V1=1:2, V2=3:4)) # start on blank line 49 and skip="auto" to first data row on line 50
# gforce optimisations
dt = data.table(x = sample(letters, 300, TRUE),
i1 = sample(-10:10, 300, TRUE),
i2 = sample(c(-10:10, NA), 300, TRUE),
d1 = as.numeric(sample(-10:10, 300, TRUE)),
d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE)))
if (test_bit64) {
dt[, `:=`(d3 = as.integer64(sample(-10:10, 300, TRUE)))]
dt[, `:=`(d4 = as.integer64(sample(c(-10:10,NA), 300, TRUE)))]
}
# make sure gforce is on
options(datatable.optimize=2L)
# testing gforce::gmedian
test(1579.01, dt[, lapply(.SD, median), by=x],
dt[, lapply(.SD, function(x) median(as.numeric(x))), by=x])
test(1579.02, dt[, lapply(.SD, median, na.rm=TRUE), by=x],
dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), by=x])
test(1579.03, dt[, lapply(.SD, median), keyby=x],
dt[, lapply(.SD, function(x) median(as.numeric(x))), keyby=x])
test(1579.04, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x],
dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), keyby=x])
ans = capture.output(dt[, lapply(.SD, median), by=x, verbose=TRUE])
test(1579.05, any(grepl("GForce optimized", ans)), TRUE)
# testing gforce::ghead and gforce::gtail
# head(.SD, 1) and tail(.SD, 1) optimisation
test(1579.06, dt[, head(.SD,1), by=x], dt[, utils::head(.SD,1), by=x])
test(1579.07, dt[, head(.SD,1), by=x], dt[, utils::head(.SD,1), by=x])
test(1579.08, dt[, head(.SD,1), keyby=x], dt[, utils::head(.SD,1), keyby=x])
test(1579.09, dt[, head(.SD,1), keyby=x], dt[, utils::head(.SD,1), keyby=x])
test(1579.10, dt[, head(.SD,1L), by=x], dt[, utils::head(.SD,1L), by=x])
test(1579.11, dt[, head(.SD,1L), by=x], dt[, utils::head(.SD,1L), by=x])
test(1579.12, dt[, head(.SD,1L), keyby=x], dt[, utils::head(.SD,1L), keyby=x])
test(1579.13, dt[, head(.SD,1L), keyby=x], dt[, utils::head(.SD,1L), keyby=x])
test(1579.14, dt[, tail(.SD,1), by=x], dt[, utils::tail(.SD,1), by=x])
test(1579.15, dt[, tail(.SD,1), by=x], dt[, utils::tail(.SD,1), by=x])
test(1579.16, dt[, tail(.SD,1), keyby=x], dt[, utils::tail(.SD,1), keyby=x])
test(1579.17, dt[, tail(.SD,1), keyby=x], dt[, utils::tail(.SD,1), keyby=x])
test(1579.18, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x])
test(1579.19, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x])
test(1579.20, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x])
test(1579.21, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x])
# GForce _doesn't_ work when n > 1
test(1579.22, dt[ , tail(.SD, 2), by = x, verbose = TRUE], output = 'GForce FALSE')
mysub <- function(x, n) x[n]
test(1579.23, dt[, .SD[2], by=x], dt[, mysub(.SD,2), by=x])
test(1579.24, dt[, .SD[2], by=x], dt[, mysub(.SD,2), by=x])
test(1579.25, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x])
test(1579.26, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x])
test(1579.27, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x])
test(1579.28, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x])
test(1579.29, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x])
test(1579.30, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x])
ans = capture.output(dt[, .SD[2], by=x, verbose=TRUE])
test(1579.31, any(grepl("GForce optimized", ans)), TRUE)
options(datatable.optimize = Inf)
# test for #1419, rleid doesn't remove names attribute
x = c("a"=TRUE, "b"=FALSE)
nx = copy(names(x))
r = rleid(x)
test(1580, nx, names(x))
# FR #971, partly addressed (only subsets in 'i')
# make sure GForce kicks in and the results are identical
dt = dt[, .(x, d1, d2)]
options(datatable.optimize=1L)
test(1581.01, ans1 <- dt[x %in% letters[15:20],
c(.N, lapply(.SD, sum, na.rm=TRUE),
lapply(.SD, min, na.rm=TRUE),
lapply(.SD, max, na.rm=TRUE),
lapply(.SD, mean, na.rm=TRUE),
lapply(.SD, median, na.rm=TRUE)
), by=x, verbose=TRUE],
output = "(GForce FALSE)")
options(datatable.optimize=2L)
test(1581.02, ans2 <- dt[x %in% letters[15:20],
c(.N, lapply(.SD, sum, na.rm=TRUE),
lapply(.SD, min, na.rm=TRUE),
lapply(.SD, max, na.rm=TRUE),
lapply(.SD, mean, na.rm=TRUE),
lapply(.SD, median, na.rm=TRUE)
), by=x, verbose=TRUE],
output = "GForce optimized j")
test(1581.03, ans1, ans2)
# subsets in 'i' for head and tail
options(datatable.optimize=1L)
test(1581.04, ans1 <- dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE],
output = "(GForce FALSE)")
options(datatable.optimize=2L)
test(1581.05, ans2 <- dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE],
output = "GForce optimized j")
test(1581.06, ans1, ans2)
options(datatable.optimize=1L)
test(1581.07, ans1 <- dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE],
output = "(GForce FALSE)")
options(datatable.optimize=2L)
test(1581.08, ans2 <- dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE],
output = "GForce optimized j")
test(1581.09, ans1, ans2)
options(datatable.optimize=1L)
test(1581.10, ans1 <- dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE],
output = "(GForce FALSE)")
options(datatable.optimize=2L)
test(1581.11, ans2 <- dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE],
output = "GForce optimized j")
test(1581.12, ans1, ans2)
options(datatable.optimize = Inf)
# #3209 g[[
options(datatable.optimize=1L)
test(1581.13, ans1 <- dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE],
output = "(GForce FALSE)")
options(datatable.optimize=Inf)
test(1581.14, ans2 <- dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE],
output = "GForce optimized j")
test(1581.15, ans1, ans2)
# also, block GForce for non-atomic input, #4159
dt = data.table(a=1:3)
dt[ , l := .(list(1, 2, 3))]
test(1581.16, dt[ , .(l = l[[1L]]), by=a, verbose=TRUE],
dt[ , l := unlist(l)], output='(GForce FALSE)')
# make sure GForce is not applied when `[[` is applied to a nested call, #4413
DT = data.table(f1=c("a","b"), f2=c("x","y"))
l = list(a = c(x = "ax", y = "ay"), b = c(x = "bx", y = "by"))
test(1581.17, DT[ , as.list(l[[f1]])[[f2]], by=c("f1","f2")],
data.table(f1 = c("a", "b"), f2 = c("x", "y"), V1 = c("ax", "by")))
test(1581.18, DT[, v:=l[[f1]][f2], by=c("f1","f2")],
data.table(f1=c("a","b"), f2=c("x","y"), v=c("ax", "by")))
# When the object being subset with [[ is in parent.frame(), not x,
# eval needs to have enclos=parent.frame(), #4612
DT = data.table(id = c(1, 1, 2), value = c("a", "b", "c"))
DT0 = copy(DT)
fun = function (DT, tag = c("A", "B")) DT[, var := tag[[.GRP]], by = "id"]
fun(DT)
test(1581.19, DT, DT0[ , var := c('A', 'A', 'B')])
# handle NULL value correctly #1429
test(1582, uniqueN(NULL), 0L)
# bug fix #1461
dt = data.table(x=c(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN))
# make sure gforce is on
options(datatable.optimize=Inf)
ans1 = suppressWarnings(dt[, base::min(y, na.rm=TRUE), by=x])
ans2 = suppressWarnings(dt[, base::max(y, na.rm=TRUE), by=x])
test(1583.1, dt[, min(y, na.rm=TRUE), by=x], ans1, warning="No non-missing values found")
test(1583.2, dt[, max(y, na.rm=TRUE), by=x], ans2, warning="No non-missing values found")
ans3 = suppressWarnings(dt[, base::min(y), by=x])
ans4 = suppressWarnings(dt[, base::max(y), by=x])
test(1583.3, dt[, min(y), by=x], ans3)
test(1583.4, dt[, max(y), by=x], ans4)
# Fixed a minor bug in fread when blank.lines.skip=TRUE
f1 <- function(x, f=TRUE, b=FALSE) fread(x, fill=f, blank.lines.skip=b, data.table=FALSE, logical01=FALSE)
f2 <- function(x, f=TRUE, b=FALSE) read.table(x, fill=f, blank.lines.skip=b, sep=",", header=TRUE, stringsAsFactors=FALSE)
test(1584.1, f1(testDir("fread_blank.txt"), f=FALSE, b=TRUE), f2(testDir("fread_blank.txt"), f=FALSE, b=TRUE))
test(1584.2, f1(testDir("fread_blank2.txt"), f=FALSE, b=TRUE), f2(testDir("fread_blank2.txt"), f=FALSE, b=TRUE))
test(1584.3, f1(testDir("fread_blank3.txt"), f=FALSE, b=TRUE), f2(testDir("fread_blank3.txt"), f=FALSE, b=TRUE))
# fread fill=TRUE, #536. Also takes care of #1124.
# the appended [-28,], [-9,] and [-29,] remove the final all-NA rows due to repeated eol ending the file
test(1585.1, f1(testDir("536_fread_fill_1.txt")), f2(testDir("536_fread_fill_1.txt"))[-28,])
test(1585.2, f1(testDir("536_fread_fill_1.txt"), b=TRUE), f2(testDir("536_fread_fill_1.txt"), b=TRUE))
test(1585.3, f1(testDir("536_fread_fill_2.txt")), f2(testDir("536_fread_fill_2.txt")))
test(1585.4, f1(testDir("536_fread_fill_2.txt"), b=TRUE), f2(testDir("536_fread_fill_2.txt"), b=TRUE))
test(1585.5, f1(testDir("536_fread_fill_3_extreme.txt")), f2(testDir("536_fread_fill_3_extreme.txt"))[-9,])
test(1585.6, f1(testDir("536_fread_fill_3_extreme.txt"), b=TRUE), f2(testDir("536_fread_fill_3_extreme.txt"), b=TRUE))
# No warning about bumping type: when fill=TRUE, column type detection starts at the first non-empty line (which makes sense).
test(1585.7, f1(testDir("536_fread_fill_4.txt")), f2(testDir("536_fread_fill_4.txt"))[-29,])
test(1585.8, f1(testDir("536_fread_fill_4.txt"), b=TRUE), f2(testDir("536_fread_fill_4.txt"), b=TRUE))
# fix for #721
text="x,y\n1,a\n2,b\n"
test(1586.1, fread(text, colClasses=c("integer", "factor")), data.table(x=1:2, y=factor(letters[1:2])))
test(1586.2, fread(text, colClasses=c(x="factor")), data.table(x=factor(1:2), y=letters[1:2]))
# FR #590
text="x,y\n2,a\n1,q\n3,c\n"
test(1587, fread(text, key="y"), setDT(fread(text), key="y"))
# fix for #1361
dt = data.table(i=1:10, f=as.factor(1:10))
test(1588.1, dt[f %in% 3:4], dt[3:4])
test(1588.2, dt[f == 3], dt[3])
test(1588.3, dt[3 == f], dt[3])
test(1588.4, dt[i == 3], dt[3])
# test for reallyReal RHS
test(1588.5, dt[i == 3.5], dt[0L])
test(1588.6, dt[2.4 == 3.5], dt[0L])
dt = data.table(ch=letters[1:6])
test(1588.7, dt[ch>"c"], dt[4:6]) # coverage of a return(NULL) in .prepareFastSubset
# data.table operates consistently independent of locale; it's R that changes and is sensitive to it,
# because keys and indexes depend on a sort order. If a data.table is stored on disk with a key created in a
# locale-sensitive order and then loaded by another R session in a different locale, the ability to reuse the
# existing sortedness would break because the order would depend on the locale. That is why data.table is
# deliberately C-locale only: for consistency, and for simpler, more robust internals that reduce the chance of
# errors and avoid that class of bug. It would be possible to have locale-sensitive keys and indexes
# but we've, so far, decided not to, for those reasons.
# R is usually started in the regional non-C locale; e.g. en_US.UTF-8 for Matt, en_IN for Jan (#2771) and English_United States.1252 on Windows in US (#2856)
oldlocale = Sys.getlocale()
ctype = Sys.getlocale("LC_CTYPE")
collate = Sys.getlocale("LC_COLLATE")
Sys.setlocale("LC_CTYPE","C")
Sys.setlocale("LC_COLLATE","C")
# Same as Sys.setlocale("LC_ALL","C") but done like this because it's not possible to return LC_ALL to its previous state (only to the default)
# Both LC_CTYPE and LC_COLLATE need to be set (as is more normally done with LC_ALL) before base::order changes behaviour in tests 1590.04 and 1590.07
x1 = "fa\xE7ile"
Encoding(x1) = "latin1"
x2 = iconv(x1, "latin1", "UTF-8")
test(1590.01, identical(x1,x2))
test(1590.02, x1==x2)
test(1590.03, forderv( c(x2,x1,x1,x2)), integer()) # desirable consistent result given identical(x1, x2)
# ^^ data.table consistent over time regardless of which version of R or locale
baseR = base::order(c(x2,x1,x1,x2))
# Even though we are in the C locale and identical(x1,x2) is TRUE, base R <= 4.0.0 considers the encoding too; i.e. it groups by encoding, ordering x2 (UTF-8) before x1 (latin1).
# Then around May 2020, R-devel (but just on Windows) started either respecting identical() as data.table has always done, or putting latin1 before UTF-8.
# Jan emailed R-devel on 23 May 2020.
# We relaxed 1590.04 and 1590.07 (tests of base R behaviour) rather than remove them, PR#4492 and its follow-up. But these two tests
# are now so relaxed that they barely test anything. It appears base R behaviour is undefined in this rare case of identical strings in different encodings.
test(1590.04, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4)) || identical(baseR, 1:4))
Encoding(x2) = "unknown"
test(1590.05, x1!=x2)
test(1590.06, forderv( c(x2,x1,x1,x2)), INT(1,4,2,3)) # consistent with Windows-1252 result, tested further below
baseR = base::order(c(x2,x1,x1,x2))
test(1590.07, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4)) || identical(baseR, 1:4))
Sys.setlocale("LC_CTYPE", ctype)
Sys.setlocale("LC_COLLATE", collate)
test(1590.08, Sys.getlocale(), oldlocale) # checked restored locale fully back to how it was before this test
# Now test default locale on all platforms: Windows-1252 on AppVeyor and win-builder, UTF-8 on Linux, and users running test.data.table() in their locale
x1 = "fa\xE7ile"
Encoding(x1) = "latin1"
x2 = iconv(x1, "latin1", "UTF-8")
test(1590.09, identical(x1,x2))
test(1590.10, x1==x2)
test(1590.11, forderv( c(x2,x1,x1,x2)), integer())
# don't test base R as it might change, and base is sensitive to locale (tested in cran_release.cmd) ... test(1590.13, base::order(c(x2,x1,x1,x2)), 1:4)
Encoding(x2) = "unknown"
if (x1==x2) {
# Linux and Mac where locale is usually UTF8
# NB: x1==x2 is a condition in base R, independent of data.table
test(1590.12, forderv( c(x2,x1,x1,x2)), integer())
# don't test base ... test(1590.13, base::order(c(x2,x1,x1,x2)), 1:4)
} else {
# Windows-1252, #2856
test(1590.14, forderv( c(x2,x1,x1,x2)), INT(1,4,2,3))
# don't test base ... test(1590.15, base::order(c(x2,x1,x1,x2)), INT(1,4,2,3))
}
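# Illustrative sketch, not executed: the point of the comments above is that forderv (and therefore
# keys/indexes) always uses C collation, while base::order follows the session locale. The exact
# base::order result below therefore depends on the user's locale; the vector is hypothetical and
# only for illustration.
if (FALSE) {
  v = c("a", "B")
  forderv(v)       # c(2L, 1L): in the C locale "B" (0x42) sorts before "a" (0x61)
  base::order(v)   # locale-dependent; e.g. typically 1:2 in en_US.UTF-8
}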
# #1432 test
list_1 = list(a = c(44,47), dens = c(2331,1644))
list_2 = list(a=66, dens= 1890)
list_3 = list(a=c(44,46,48,50), dens=c(8000,1452,1596,7521))
mylist = list(list_1, list_2, list_3)
setattr(mylist, 'names', c("ID_1","ID_2","ID_3"))
ans = data.table(id=rep(c("ID_1","ID_2","ID_3"), c(2,1,4)),
a=c(44,47,66,44,46,48,50),
dens=c(2331,1644,1890,8000,1452,1596,7521))
test(1591, rbindlist(mylist, idcol="id"), ans)
# FR #1443
DT <- data.table(x = 1:3, y = 4:6, z = 7:9)
test(1592.1, setnames(DT, -5, "bla"), error="'old' is length 3 but 'new' is length 1")
test(1592.2, names(setnames(DT, -1, c("m", "n"))), c("x", "m", "n"))
# fix for #1513
test(1593, CJ(c(1,2,2), c(1,2,3)), data.table(V1=rep(c(1,2), c(3,6)), V2=c(1,2,3,1,1,2,2,3,3), key=c("V1", "V2")))
# FR #523, var, sd and prod
options(datatable.optimize = Inf) # ensure gforce is on
DT = data.table(x=sample(5, 100, TRUE),
y1=sample(6, 100, TRUE),
y2=sample(c(1:10,NA), 100, TRUE),
z1=runif(100),
z2=sample(c(runif(10),NA,NaN), 100, TRUE))
test(1594.01, DT[, lapply(.SD, var, na.rm=FALSE), by=x], DT[, lapply(.SD, stats::var, na.rm=FALSE), by=x])
test(1594.02, DT[, lapply(.SD, var, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::var, na.rm=TRUE), by=x])
test(1594.03, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gvar")
# coverage: default group .N=1 case
idx=DT[ , .I[1L], by=x]$V1
out=data.table(x=DT[(idx), x], V1=NA_real_)
test(1594.05, DT[(idx), var(y1), by=x], out)
test(1594.06, DT[(idx), var(y1, na.rm=TRUE), by=x], out)
test(1594.07, DT[(idx), var(z1), by=x], out)
test(1594.08, DT[(idx), var(z1, na.rm=TRUE), by=x], out)
test(1594.09, DT[, lapply(.SD, sd, na.rm=FALSE), by=x], DT[, lapply(.SD, stats::sd, na.rm=FALSE), by=x])
test(1594.10, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x])
test(1594.11, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gsd")
test(1594.12, DT[, lapply(.SD, prod, na.rm=FALSE), by=x], DT[, lapply(.SD, base::prod, na.rm=FALSE), by=x])
test(1594.13, DT[, lapply(.SD, prod, na.rm=TRUE), by=x], DT[, lapply(.SD, base::prod, na.rm=TRUE), by=x])
test(1594.14, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gprod")
# FR #1517
dt1 = data.table(x=c(1,1,2), y=1:3)
dt2 = data.table(x=c(2,3,4), z=4:6)
test(1595, merge(dt1,dt2), merge(dt1,dt2, by="x"))
# FR 1512, drop argument for dcast.data.table
DT <- data.table(v1 = c(1.1, 1.1, 1.1, 2.2, 2.2, 2.2),
v2 = factor(c(1L, 1L, 1L, 3L, 3L, 3L), levels=1:3),
v3 = factor(c(2L, 3L, 5L, 1L, 2L, 6L), levels=1:6),
v4 = c(3L, 2L, 2L, 5L, 4L, 3L))
ans1 <- dcast(DT, v1+v2~v3, value.var="v4", drop=FALSE)
test(1596.1, dcast(DT, v1+v2~v3, value.var="v4", drop=c(FALSE, TRUE)), ans1[, -6, with=FALSE])
test(1596.2, dcast(DT, v1+v2~v3, value.var="v4", drop=c(TRUE, FALSE)), ans1[c(1,6)])
# bug fix #1495
dt = data.table(id=1:30, nn = paste0('A', 1:30))
smp = sample(30, size =10)
lgl = dt$id %in% smp
test(1597, dt[lgl, ], dt[id %in% smp])
# FR #643
vv = sample(letters[1:3], 10, TRUE)
test(1599.1, data.table(x=vv, y=1:10, stringsAsFactors=TRUE)$x, factor(vv))
vv = sample(c(letters[1:3], NA), 10, TRUE)
test(1599.2, data.table(x=vv, y=1:10, stringsAsFactors=TRUE)$x, factor(vv))
# bug #1477 fix
DT <- data.table(a = 0L:1L, b = c(1L, 1L))
test(1600.1, DT[ , lapply(.SD, function(x) if (all(x)) x)], data.table(b=c(1L, 1L)))
# This fix wasn't entirely clean as it introduced another issue.
# It's fixed now, but we add a test for that issue as well to catch it early next time.
set.seed(17022016L)
DT1 = data.table(id1 = c("c", "a", "b", "b", "b", "c"),
z1 = sample(100L, 6L),
z2 = sample(letters, 6L))
DT2 = data.table(id1=c("c", "w", "b"), val=50:52)
test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id1"]), c("id1", "val", "bla"))
# warn when merging an empty data.table, #597
test(1601.1, merge(data.table(a=1),data.table(a=1), by="a"), data.table(a=1, key="a"))
test(1601.2, tryCatch(merge(data.table(a=1),data.table(NULL), by="a"), warning = function(w) w$message), "You are trying to join data.tables where 'y' argument is 0 columns data.table.")
test(1601.3, tryCatch(merge(data.table(NULL),data.table(a=1), by="a"), warning = function(w) w$message), "You are trying to join data.tables where 'x' argument is 0 columns data.table.")
test(1601.4, tryCatch(merge(data.table(NULL),data.table(NULL), by="a"), warning = function(w) w$message), "You are trying to join data.tables where 'x' and 'y' arguments are 0 columns data.table.")
# fix for #1549
d1 <- data.table(v1=1:2,x=x)
d2 <- data.table(v1=3:4)
test(1603.1, rbindlist(list(d2, d1), fill=TRUE), rbindlist(list(d1,d2), fill=TRUE)[c(3:4, 1:2)])
# fix for #1440
DT = data.table(a=1:3, b=4:6)
myCol = "b"
test(1604, DT[,.(myCol),with=FALSE], error="When with=FALSE,")
# fix for segfault #1531
DT = data.table(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9)
test(1605, DT[order(-x, "D")], error="Column 2 is length 1 which differs")
# fix for #1503, fread's fill argument polishing
test(1606, fread("2,\n1,a,b", fill=TRUE), data.table(V1=2:1, V2=c("","a"), V3=c("","b")))
# fix for #1476
dt = data.table(resp=c(1:5))
wide = copy(list(metrics = dt))$metrics # copy() here copies the list containing the data.table and therefore the result wasn't over-allocated before the fix
test(1607, wide[, id := .I], data.table(resp = 1:5, id = 1:5))
wide = copy(list(a = list(b = dt)))$a$b # check again on doubly nested list
test(1607.1, wide[, id := .I], data.table(resp = 1:5, id = 1:5))
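# Illustrative sketch, not executed: the over-allocation that := relies on can be inspected with
# truelength() and restored with setalloccol(); the fix for #1476 means [.data.table now copes with
# an under-allocated table itself. Object names are hypothetical.
if (FALSE) {
  innerDT  = data.table(resp=1:5)
  copiedDT = copy(list(metrics=innerDT))$metrics
  truelength(copiedDT)     # may show little or no spare column space after the list copy()
  setalloccol(copiedDT)    # re-over-allocates so a later := can add columns by reference
  truelength(copiedDT)     # now comfortably larger than ncol(copiedDT)
}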
# better fix for #1462, plus an improved error message (in case this better fix fails)
# no need for quote="" and sep="\t" any more
if (test_R.utils) test(1608, dim(fread(testDir('issue_1462_fread_quotes.txt.gz'), header=FALSE)), c(4L, 224L), warning="resolved improper quoting")
# fix for #1164
test(1609, fread(testDir("issue_1164_json.txt")), data.table(json1='{""f1"":""value1"",""f2"":""double quote escaped with a backslash [ \\"" ]""}', string1="string field"))
# set of enhancements to print.data.table for #1523
# dplyr-like column summary
icol = 1L:3L
Dcol = as.Date(paste0("2016-01-0", 1:3))
DT1 = data.table(lcol = list(list(1:3), list(1:3), list(1:3)),
icol, ncol = as.numeric(icol), ccol = c("a", "b", "c"),
xcol = as.complex(icol), ocol = factor(icol, ordered = TRUE),
fcol = factor(icol))
test(1610.1, capture.output(print(DT1, class=TRUE)),
c(" lcol icol ncol ccol xcol ocol fcol",
" <list> <int> <num> <char> <cplx> <ord> <fctr>",
"1: <list[1]> 1 1 a 1+0i 1 1",
"2: <list[1]> 2 2 b 2+0i 2 2",
"3: <list[1]> 3 3 c 3+0i 3 3"))
DT2 = data.table(
Dcol = as.Date('2016-01-01') + 0:2,
Pcol = as.POSIXct('2016-01-01 01:00:00', tz = 'UTC') + 86400L*(0:2),
gcol = TRUE, Icol = as.IDate(16801) + 0:2,
ucol = `class<-`(1:3, 'asdf')
)
test(1610.2, capture.output(print(DT2, class=TRUE)),
c(" Dcol Pcol gcol Icol ucol",
" <Date> <POSc> <lgcl> <IDat> <asdf>",
"1: 2016-01-01 2016-01-01 01:00:00 TRUE 2016-01-01 1",
"2: 2016-01-02 2016-01-02 01:00:00 TRUE 2016-01-02 2",
"3: 2016-01-03 2016-01-03 01:00:00 TRUE 2016-01-03 3"))
# fix for #833
l1 = list(a=seq_len(5), matrix(seq_len(25),ncol = 5, nrow = 5))
l2 = list(seq_len(5), matrix(seq_len(25),ncol = 5, nrow = 5))
test(1611.1, as.data.table(l1), setnames(setDT(as.data.frame(l1)), c("a", paste("V", 1:5, sep=""))))
test(1611.2, as.data.table(l2), setnames(setDT(as.data.frame(l2)), c("V1", "V1.1", paste("V", 2:5, sep=""))))
# fix for #646
# tz= is explicitly specified otherwise CRAN's Solaris machines (both sparc and x86) fail. It may not be Solaris per se
# but something related to the timezone of those two machines. I guess one or the other of as.POSIXct or
# as.POSIXlt creates the 'tzone' attribute differently for the default tz="", just on Solaris. I checked that test.data.table
# already uses all.equal(), not identical(), so I don't think it is an accuracy problem. But I could be wrong.
ll = list(a=as.POSIXlt("2015-01-01", tz='UTC'), b=1:5)
test(1612.1, as.data.table(ll), data.table(a=as.POSIXct("2015-01-01", tz='UTC'), b=1:5), warning="POSIXlt column type detected")
dt = data.table(d1="1984-03-17")
ans = data.table(d1="1984-03-17", d2=as.POSIXct("1984-03-17", tz='UTC'))
test(1612.2, dt[, d2 := strptime(d1, "%Y-%m-%d", tz='UTC')], ans, warning="strptime() usage detected and wrapped with as.POSIXct()")
ll = list(a=as.POSIXlt("2015-01-01"), b=2L)
test(1612.3, setDT(ll), error="Column 1 is of POSIXlt type")
# tests for all.equal.data.table #1106
# diff nrow
DT1 <- data.table(a = 1:4, b = letters[1:4])
DT2 <- data.table(a = c(1:4,4L), b = letters[c(1:4,4L)])
test(1613.01, all.equal(DT1, DT2), "Different number of rows")
# diff ncol
DT1 <- data.table(a = 1:4, b = letters[1:4])
DT2 <- data.table(a = 1:4)
test(1613.02, all.equal(DT1, DT2), c("Different number of columns", "Different column names"))
# diff colnames
DT1 <- data.table(a = 1:4, b = letters[1:4])
DT2 <- data.table(aa = 1:4, bb = letters[1:4])
test(1613.03, all.equal(DT1, DT2), "Different column names")
# diff column order
DT1 <- data.table(a = 1:4, b = letters[1:4])
DT2 <- data.table(b = letters[1:4], a = 1:4)
test(1613.04, all.equal(DT1, DT2), "Different column order")
test(1613.05, all.equal(DT1, DT2, ignore.col.order=TRUE), TRUE)
# diff row order
DT1 <- data.table(a = 1:4, b = letters[1:4])
DT2 <- data.table(a = 4:1, b = letters[4:1])
test(1613.06, all.equal(DT1, DT2), "Column 'a': Mean relative difference: 0.8")
test(1613.07, all.equal(DT1, DT2, ignore.row.order=TRUE), TRUE)
# diff column order and diff row order
DT1 <- data.table(a = 1:4, b = letters[1:4])
DT2 <- data.table(b = letters[4:1], a = 4:1)
test(1613.08, all.equal(DT1, DT2), "Different column order")
test(1613.09, all.equal(DT1, DT2, ignore.row.order=TRUE), "Different column order")
test(1613.10, all.equal(DT1, DT2, ignore.col.order=TRUE), "Column 'a': Mean relative difference: 0.8")
test(1613.11, all.equal(DT1, DT2, ignore.row.order=TRUE, ignore.col.order=TRUE), TRUE)
# non-overlapping duplicates
DT1 <- data.table(a = c(1:4,1:2), b = letters[c(1:4,1:2)])
DT2 <- data.table(a = c(1:4,3:4), b = letters[c(1:4,3:4)])
test(1613.12, all.equal(DT1, DT2), "Column 'a': Mean relative difference: 1.333333")
test(1613.13, all.equal(DT1, DT2, ignore.row.order=TRUE), "Dataset 'current' has rows not present in 'target' or present in different quantity")
# overlapping duplicates
DT1 <- data.table(a = c(1:4,1:2), b = letters[c(1:4,1:2)])
DT2 <- data.table(a = c(1:4,2:1), b = letters[c(1:4,2:1)])
test(1613.14, all.equal(DT1, DT2), "Column 'a': Mean relative difference: 0.6666667")
test(1613.15, all.equal(DT1, DT2, ignore.row.order=TRUE), TRUE)
# mixed overlapping duplicates
DT1 <- data.table(a = c(1:4,1:2), b = letters[c(1:4,1:2)])
DT2 <- data.table(a = c(1:4,2:3), b = letters[c(1:4,2:3)])
test(1613.16, all.equal(DT1, DT2, ignore.row.order = TRUE), "Dataset 'current' has rows not present in 'target' or present in different quantity")
# overlapping duplicates not equal in count
DT1 <- data.table(a = c(1:4, rep(1L,3), rep(2L,2)), b = letters[c(1:4, rep(1L,3), rep(2L,2))])
DT2 <- data.table(a = c(1:4, rep(1L,2), rep(2L,3)), b = letters[c(1:4, rep(1L,2), rep(2L,3))])
test(1613.17, all.equal(DT1, DT2, ignore.row.order = TRUE), "Dataset 'current' has rows not present in 'target' or present in different quantity")
# overlapping duplicates equal in count
DT1 <- data.table(a = c(1:4, 1L, 2L, 1L, 2L), b = letters[c(1:4, 1L, 2L, 1L, 2L)])
DT2 <- data.table(a = c(2L, 1L, 1L, 2L, 1:4), b = letters[c(2L, 1L, 1L, 2L, 1:4)])
test(1613.18, all.equal(DT1, DT2, ignore.row.order = TRUE), TRUE)
# subset with overlapping duplicates
DT1 <- data.table(a = c(1:3,3L), b = letters[c(1:3,3L)])
DT2 <- data.table(a = c(1:4), b = letters[c(1:4)])
test(1613.19, all.equal(DT1, DT2, ignore.row.order = TRUE), "Dataset 'target' has duplicate rows while 'current' doesn't")
# different number of unique rows
DT1 <- data.table(a = c(1:3,2:3), b = letters[c(1:3,2:3)])
DT2 <- data.table(a = c(1L,1:4), b = letters[c(1L,1:4)])
test(1613.20, all.equal(DT1, DT2, ignore.row.order = TRUE), "Dataset 'current' has rows not present in 'target' or present in different quantity")
test(1613.21, all.equal(DT2, DT1, ignore.row.order = TRUE), "Dataset 'current' has rows not present in 'target' or present in different quantity")
# test attributes: key
DT1 <- data.table(a = 1:4, b = letters[1:4], key = "a")
DT2 <- data.table(a = 1:4, b = letters[1:4])
test(1613.22, all.equal(DT1, DT2), "Datasets has different keys. 'target': a. 'current' has no key.")
test(1613.23, all.equal(DT1, DT2, check.attributes = FALSE), TRUE)
test(1613.24, all.equal(DT1, setkeyv(DT2, "a"), check.attributes = TRUE), TRUE)
# test attributes: index
DT1 <- data.table(a = 1:4, b = letters[1:4])
DT2 <- data.table(a = 1:4, b = letters[1:4])
setindexv(DT1, "b")
test(1613.25, all.equal(DT1, DT2), "Datasets has different indexes. 'target': b. 'current' has no index.")
test(1613.26, all.equal(DT1, DT2, check.attributes = FALSE), TRUE)
test(1613.27, all.equal(DT1, setindexv(DT2, "a")), "Datasets has different indexes. 'target': b. 'current': a.")
test(1613.28, all.equal(DT1, setindexv(DT2, "b")), "Datasets has different indexes. 'target': b. 'current': a, b.")
test(1613.29, all.equal(DT1, setindexv(setindexv(DT2, NULL), "b")), TRUE)
# test custom attribute
DT1 <- data.table(a = 1:4, b = letters[1:4])
DT2 <- data.table(a = 1:4, b = letters[1:4])
setattr(DT1, "custom", 1L)
test(1613.30, all.equal(DT1, DT2), "Datasets has different number of (non-excluded) attributes: target 3, current 2")
test(1613.31, all.equal(DT1, DT2, check.attributes = FALSE), TRUE)
setattr(DT2, "custom2", 2L)
test(1613.32, all.equal(DT1, DT2), "Datasets has attributes with different names: custom, custom2")
setattr(DT1, "custom2", 2L)
setattr(DT2, "custom", 0L)
test(1613.33, all.equal(DT1, DT2), paste0("Attributes: < Component ", dQuote("custom"), ": Mean relative difference: 1 >"))
setattr(DT2, "custom", 1L)
test(1613.34, all.equal(DT1, DT2), TRUE)
# trim.levels
dt1 <- data.table(A = factor(letters[1:10])[1:4]) # 10 levels
dt2 <- data.table(A = factor(letters[1:5])[1:4]) # 5 levels
test(1613.35, all.equal(dt1, dt2))
test(1613.36, !isTRUE(all.equal(dt1, dt2, trim.levels = FALSE)))
test(1613.37, !isTRUE(all.equal(dt1, dt2, trim.levels = FALSE, check.attributes = FALSE)))
test(1613.38, all.equal(dt1, dt2, trim.levels = FALSE, ignore.row.order = TRUE))
test(1613.39, length(levels(dt1$A)) == 10L && length(levels(dt2$A)) == 5L, TRUE) # dt1 and dt2 not updated by reference
# unsupported column types: list
dt = data.table(V1 = 1:4, V2 = letters[1:4], V3 = lapply(1:4, function(x) new.env()))
test(1613.40, all.equal(dt, dt), TRUE)
test(1613.41, all.equal(dt, dt, ignore.row.order = TRUE), error = "Datasets to compare with 'ignore.row.order' must not have unsupported column types: [list]")
# unsupported type in set-ops: complex, raw
dt = data.table(V1 = 1:4, V2 = letters[1:4], V3 = as.complex(1:4), V4 = as.raw(1:4), V5 = lapply(1:4, function(x) NULL))
test(1613.42, all.equal(dt, dt), TRUE)
test(1613.43, all.equal(dt, dt, ignore.row.order = TRUE), error = "Datasets to compare with 'ignore.row.order' must not have unsupported column types: [raw, complex, list]")
# supported types multi column test
dt = data.table(
V1 = 1:4,
V2 = as.numeric(1:4),
V3 = letters[rep(1:2, 2)],
V4 = factor(c("a","a","b","b")),
V5 = as.POSIXct("2016-03-05 12:00:00", origin="1970-01-01")+(1:4)*3600,
V6 = as.Date("2016-03-05", origin="1970-01-01")+(1:4)
)[, V7 := as.IDate(V6)
][, V8 := as.ITime(V5)]
test(1613.441, all.equal(dt, dt), TRUE)
test(1613.442, all.equal(dt, dt, ignore.row.order = TRUE), TRUE)
test(1613.443, all.equal(dt[c(1:4,1L)], dt[c(1:4,1L)]), TRUE)
test(1613.444, all.equal(dt[c(1:4,1L)], dt[c(1L,1:4)]), "Column 'V1': Mean relative difference: 0.6")
test(1613.445, all.equal(dt[c(1:4,1L)], dt[c(1L,1:4)], ignore.row.order = TRUE), TRUE)
test(1613.45, all.equal(dt[c(1:4,1:2)], dt[c(1L,1L,1:4)], ignore.row.order = TRUE), c("Both datasets have duplicate rows, they also have numeric columns, together with ignore.row.order this force 'tolerance' argument to 0", "Dataset 'current' has rows not present in 'target' or present in different quantity"))
test(1613.46, all.equal(dt[c(1:2,1:4,1:2)], dt[c(1:2,1:2,1:4)], ignore.row.order = TRUE), TRUE)
# supported type all.equal: integer64
if (test_bit64) {
dt = data.table(V1 = 1:4, V2 = letters[1:4], V3 = bit64::as.integer64("90000000000")+1:4)
test(1613.47, all.equal(dt, dt), TRUE)
test(1613.48, all.equal(dt, dt, ignore.row.order = TRUE), TRUE)
test(1613.49, all.equal(dt[c(1:4,1L)], dt[c(1:4,1L)]), TRUE)
test(1613.50, all.equal(dt[c(1:4,1L)], dt[c(1L,1:4)]), "Column 'V1': Mean relative difference: 0.6")
test(1613.51, all.equal(dt[c(1:4,1L)], dt[c(1L,1:4)], ignore.row.order = TRUE), TRUE)
test(1613.52, all.equal(dt[c(1:4,1:2)], dt[c(1L,1L,1:4)], ignore.row.order = TRUE), c("Both datasets have duplicate rows, they also have numeric columns, together with ignore.row.order this force 'tolerance' argument to 0","Dataset 'current' has rows not present in 'target' or present in different quantity"))
test(1613.53, all.equal(dt[c(1:2,1:4,1:2)], dt[c(1:2,1:2,1:4)], ignore.row.order = TRUE), TRUE)
}
# all.equal - new argument 'tolerance' #1737
x = data.table(1) # test numeric after adding 'tolerance' argument
y = data.table(2)
test(1613.5411, !isTRUE(all.equal(x, y, ignore.row.order = FALSE)))
test(1613.5412, !isTRUE(all.equal(x, y, ignore.row.order = TRUE)))
x = data.table(c(1,1))
y = data.table(c(2,2))
test(1613.5421, !isTRUE(all.equal(x, y, ignore.row.order = FALSE)))
test(1613.5422, !isTRUE(all.equal(x, y, ignore.row.order = TRUE)))
x = data.table(c(1,2))
y = data.table(c(2,2))
test(1613.5431, !isTRUE(all.equal(x, y, ignore.row.order = FALSE)))
test(1613.5432, !isTRUE(all.equal(x, y, ignore.row.order = TRUE)))
x = data.table(as.factor(1)) # test factor after adding 'tolerance' argument
y = data.table(as.factor(2))
test(1613.5511, !isTRUE(all.equal(x,y)))
test(1613.5512, !isTRUE(all.equal(x, y, ignore.row.order = FALSE)))
test(1613.5513, !isTRUE(all.equal(x, y, ignore.row.order = TRUE)))
x = data.table(as.factor(c(1,1)))
y = data.table(as.factor(c(2,2)))
test(1613.5521, !isTRUE(all.equal(x, y, ignore.row.order = FALSE)))
test(1613.5522, !isTRUE(all.equal(x, y, ignore.row.order = TRUE)))
x = data.table(as.factor(c(1,2)))
y = data.table(as.factor(c(2,2)))
test(1613.5531, !isTRUE(all.equal(x, y, ignore.row.order = FALSE)))
test(1613.5532, !isTRUE(all.equal(x, y, ignore.row.order = TRUE)))
x = data.table(-0.000189921844659375) # tolerance in action
y = data.table(-0.000189921844655161)
test(1613.561, all(all.equal(x, y, ignore.row.order = FALSE), all.equal(x, y, ignore.row.order = TRUE)))
test(1613.562, all(is.character(all.equal(x, y, ignore.row.order = FALSE, tolerance = 0)), is.character(all.equal(x, y, ignore.row.order = TRUE, tolerance = 0))))
test(1613.563, all(
all.equal(rbind(x,y), rbind(y,y), ignore.row.order=FALSE),
all.equal(rbind(x,y), rbind(y,y), ignore.row.order=TRUE),
all.equal(rbind(y,y), rbind(x,y), ignore.row.order=TRUE)
))
test(1613.564, all(is.character(all.equal(rbind(x,y), rbind(y,y), ignore.row.order = FALSE, tolerance = 0)), is.character(all.equal(rbind(x,y), rbind(y,y), ignore.row.order = TRUE, tolerance = 0))))
test(1613.565, all(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = FALSE), is.character(r<-all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = TRUE)) && any(grepl("force 'tolerance' argument to 0", r)))) # no match because duplicate rows together with numeric columns force tolerance=0
test(1613.566, all(all.equal(rbind(x,y,y), rbind(x,y,y), ignore.row.order = FALSE, tolerance = 0), all.equal(rbind(x,y,y), rbind(x,y,y), ignore.row.order = TRUE, tolerance = 0)))
test(1613.567, all(is.character(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = FALSE, tolerance = 0)), is.character(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = TRUE, tolerance = 0))))
test(1613.571, all(all.equal(cbind(x, factor(1)), cbind(y, factor(1)), ignore.row.order = FALSE), is.character(r<-all.equal(cbind(x, factor(1)), cbind(y, factor(1)), ignore.row.order = TRUE)) && any(grepl("force 'tolerance' argument to 0", r)))) # no match because the factor column forces tolerance=0
test(1613.572, all(all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = FALSE), all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = TRUE))) # x to x with factor equality
test(1613.573, all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = TRUE, tolerance = 1), error = "Factor columns and ignore.row.order cannot be used with non 0 tolerance argument") # error due to provided non zero tolerance
test(1613.581, all(all.equal(x, y, ignore.row.order = FALSE, tolerance = 1), all.equal(x, y, ignore.row.order = TRUE, tolerance = 1)))
test(1613.582, all(all.equal(x, y, ignore.row.order = FALSE, tolerance = sqrt(.Machine$double.eps)/2), all.equal(x, y, ignore.row.order = TRUE, tolerance = sqrt(.Machine$double.eps)/2)), warning = "Argument 'tolerance' was forced")
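# Illustrative sketch, not a test: 'tolerance' is passed through to the underlying numeric
# comparison, so near-equal doubles compare TRUE unless tolerance is tightened to 0.
# (ex_a/ex_b are throwaway objects created only for this illustration and removed below.)
ex_a = data.table(v = 1)
ex_b = data.table(v = 1 + 1e-9)
all.equal(ex_a, ex_b)                 # TRUE: within the default tolerance of about 1.5e-8
all.equal(ex_a, ex_b, tolerance = 0)  # character vector describing the difference
rm(ex_a, ex_b)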
# fix for #4042
test(1613.59, all.equal.data.table(1L, 2L), error = "is.data.table(target) is not TRUE")
test(1613.601, all.equal(data.table(a=1), data.frame(a=1)), "target is data.table, current is data.frame")
test(1613.602, all.equal(data.table(a=1), data.frame(a=1), check.attributes = FALSE))
test(1613.603, all.equal(data.table(a=1), list(a=1), check.attributes = FALSE))
test(1613.604, all.equal(data.table(a=1), 1, check.attributes = FALSE))
test(1613.605, all.equal(data.table(a=1), try(stop('this wont work'), silent = TRUE), check.attributes = FALSE), "target is data.table but current is not and failed to be coerced to it")
L1 = list(a = data.table(1), b = setattr("foo1613", "tbl", data.table(1)))
L2 = list(a = 1, b = setattr("foo1613", "tbl", 1))
test(1613.606, all(grepl("target is data.table, current is numeric", all.equal(L1, L2))))
as.data.table.foo1613 = function(x) { # test as.data.table coercion of the 'current' argument
  if (!length(x)) warning("empty foo1613")
  as.data.table(unclass(x))  # strip the foo1613 class and coerce the underlying list
}
registerS3method("as.data.table", "foo1613", as.data.table.foo1613)
foo1613 = structure(list(NULL), class="foo1613")
test(1613.607, all.equal(data.table(), foo1613, check.attributes=FALSE))
foo1613 = structure(list(), class="foo1613")
test(1613.608, all.equal(data.table(), foo1613, check.attributes=FALSE), warning="empty")
rm(as.data.table.foo1613, foo1613)
DT1 <- data.table(a = 1:4, b = letters[1:4], .seqn = 5L)
DT2 <- data.table(a = 4:1, b = letters[4:1], .seqn = 5L)
test(1613.61, all.equal(DT1, DT2, ignore.row.order = TRUE), error = "column named '.seqn'")
DT1[ , .seqn := NULL]
DT2[ , .seqn := NULL]
DT3 = DT1[c(1L, 1:3)]
test(1613.62, grepl("'target' has duplicate rows while 'current' doesn't",
all.equal(DT3, DT2, ignore.row.order = TRUE)))
### force numeric
DT3[ , a := a + .001]
test(1613.63, all.equal(DT3, DT3, ignore.row.order = TRUE, tolerance = .01),
error = 'Duplicate rows in datasets, numeric columns and ignore.row.order')
if (test_bit64) {
# fix for #1405, handles roll with -ve int64 values properly
dt = data.table(x=as.integer64(c(-1000, 0)), y=c(5,10))
val = c(-1100,-900,100)
ans = data.table(x=val)
test(1614.1, dt[.(val), roll=Inf, on="x"], ans[, y:=c(NA,5,10)])
test(1614.2, dt[.(val), roll=Inf, on="x", rollends=TRUE], ans[, y:=c(5,5,10)])
test(1614.3, dt[.(val), roll=-Inf, on="x"], ans[, y:=c(5,10,NA)])
test(1614.4, dt[.(val), roll=-Inf, on="x", rollends=TRUE], ans[, y:=c(5,10,10)])
}
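# Illustrative sketch, not a test, of how roll= resolves unmatched join values: roll=Inf carries
# the previous observation forward, roll=-Inf rolls the next observation backward, and rollends=
# controls whether values beyond either end are filled as well. (ex is a throwaway object.)
ex = data.table(x = c(10L, 20L), y = c("a", "b"))
ex[.(15L), on = "x", roll = Inf]   # y="a", rolled forward from x=10
ex[.(15L), on = "x", roll = -Inf]  # y="b", rolled backward from x=20
ex[.(5L), on = "x", roll = Inf]    # before the first x: y=NA unless rollends allows filling
rm(ex)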
# fix for #1571
x = data.table(c(1,1,2,7,2,3,4,4,7), 1:9)
y = data.table(c(2,3,4,4,4,5))
test(1615.1, x[!y, on="V1", mult="first"], data.table(V1=c(1,7), V2=INT(c(1,4))))
test(1615.2, x[!y, on="V1", mult="last"], data.table(V1=c(1,7), V2=INT(c(2,9))))
test(1615.3, x[!y, on="V1", mult="all"], data.table(V1=c(1,1,7,7), V2=INT(c(1,2,4,9))))
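# Illustrative sketch, not a test: x[!y, on=...] is an anti-join (rows of x with no match in y),
# and mult= then selects the first/last/all rows within each remaining group of the join columns.
# (ex_x/ex_y are throwaway objects.)
ex_x = data.table(k = c(1, 1, 2), v = 1:3)
ex_y = data.table(k = 2)
ex_x[!ex_y, on = "k"]                  # both k==1 rows (mult="all" is the default)
ex_x[!ex_y, on = "k", mult = "first"]  # only the first k==1 row
rm(ex_x, ex_y)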
# fix for #1287 and #1271
set.seed(1L)
dt = data.table(a=c(1,1,2), b=sample(10,3), c=sample(10,3))
test(1616.1, dt[.(1:2), if (c-b > 0L) b, on="a", by=.EACHI, mult="first"], data.table(a=1:2, V1=c(3L,5L)))
test(1616.2, dt[.(1:2), if (c-b > 0L) b, on="a", by=.EACHI, mult="last"], data.table(a=2L, V1=5L))
test(1616.3, dt[.(1:2), c := if (c-b > 0L) b, by=.EACHI, mult="first", on="a"],
data.table(a=dt$a, b=dt$b, c=c(3L,2L,5L)) )
# fix for #1281
x <- 3 > 0
ans = setattr(copy(x), "foo", "bar")
test(1617, setattr(x, "foo", "bar"), ans, warning = "Input is a length=1 logical that")
# fix for #1445
test(1618.1, fread("a,c,b\n1,2,3", select=c("b", "c")), data.table(b=3L, c=2L))
test(1618.2, fread("a,c,b\n1,2,3", select=c("c", "b")), data.table(c=2L, b=3L))
test(1618.3, fread("a,c,b\n1,2,3", select=c(3,2)), data.table(b=3L, c=2L))
test(1618.4, fread("a,c,b\n1,2,3", select=c(2:3)), data.table(c=2L, b=3L))
test(1618.5, fread("a,c,b\n1,2,3", select=c("b", "c"), col.names=c("q", "r")), data.table(q=3L, r=2L))
test(1618.6, fread("a,c,b\n1,2,3", select=c("b", "z")), data.table(b=3L), warning="Column name 'z' not found.*skipping")
# Additional test for 1445 for non-monotonic integer select
select1618.8 <- c(4, 9, 8, 23, 1, 21, 5, 18, 11, 13)
test(1618.8, names(fread("a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z\na,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z", select = select1618.8)), letters[select1618.8])
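# Illustrative sketch, not a test: select= keeps only the requested columns, and the result's
# column order follows the order requested (by name or by position), not the file order.
fread("a,c,b\n1,2,3", select = c("b", "a"))   # b first, then a
fread("a,c,b\n1,2,3", select = c(3L, 1L))     # the same selection by column positions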
# fix for #1270. There were problems here with R before vs after 3.1.0, but it is now ok in all R versions.
DT = data.table(x=1:2, y=5:6)
test(1619.1, DT[, .BY, by=x]$BY, as.list(1:2))
test(1619.2, DT[, bycol := .BY, by=x]$bycol, as.list(1:2))
# fix for #473
DT = data.frame(x=1, y=2)
setattr(DT, 'class', c('data.table', 'data.frame')) # simulates over-allocation lost scenario
if (!truelength(DT)) test(1620, truelength(as.data.table(DT)), 1026L)
# fix for #1116, (#1239 and #1201)
if (test_R.utils) {
test(1621.1, fread(testDir("issue_1116_fread_few_lines.txt.gz"), logical01=FALSE),
setDT(read.delim(testDir("issue_1116_fread_few_lines.txt.gz"), stringsAsFactors=FALSE, sep=",", check.names=FALSE)))
test(1621.2, fread(testDir("issue_1116_fread_few_lines_2.txt.gz"), logical01=FALSE),
setDT(read.delim(testDir("issue_1116_fread_few_lines_2.txt.gz"), stringsAsFactors=FALSE, sep=",", check.names=FALSE)))
}
# fix for #1573
ans1 = fread(testDir("issue_1573_fill.txt"), fill=TRUE, na.strings="")
ans2 = setDT(read.table(testDir("issue_1573_fill.txt"), header=TRUE, fill=TRUE, stringsAsFactors=FALSE, na.strings=""))
date_cols = c('SD2', 'SD3', 'SD4')
ans2[ , (date_cols) := lapply(.SD, as.IDate), .SDcols = date_cols]
test(1622.1, ans1, ans2)
test(1622.2, ans1, fread(testDir("issue_1573_fill.txt"), fill=TRUE, sep=" ", na.strings=""))
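# Illustrative sketch, not a test: fill=TRUE pads rows that have fewer fields than the header
# with NA instead of raising an error.
fread("a b c\n1 2 3\n4 5\n", fill = TRUE)   # second data row gets c=NA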
# fix for #989
# error_msg = if (base::getRversion() < "3.4") "can not be a directory name" else "does not exist"
# Until R v3.3, file.info("~") returned TRUE for isdir. In current devel it returns NA. However, it still
# correctly identifies that "~" is not a file, which leads to a different error message. So the expected error
# message is left empty so that it errors properly on both versions. This seems fine since we just need it to error. Tested.
test(1623, fread("~"), error="")
# testing print.rownames option, #1097 (part of #1523)
options(datatable.print.rownames = FALSE)
DT <- data.table(a = 1:3)
test(1624, capture.output(print(DT)), c(" a", " 1", " 2", " 3"))
options(datatable.print.rownames = TRUE)
# fix for #1575
text = "colA: dataA\ncolB: dataB\ncolC: dataC\n\nColA: dataA\nColB: dataB\nColC: dataC"
test(1625.1, fread(text, header=FALSE, sep=":", blank.lines.skip=TRUE, strip.white=FALSE),
setDT(read.table(text=text, header=FALSE, sep=":", blank.lines.skip=TRUE, stringsAsFactors=FALSE)))
test(1625.2, fread(text, header=FALSE, sep=":", blank.lines.skip=TRUE),
setDT(read.table(text=text, header=FALSE, sep=":", blank.lines.skip=TRUE, stringsAsFactors=FALSE, strip.white=TRUE)))
# set-operators #547
# setops basic check all
x = data.table(c(1,2,2,2,3,4,4))
y = data.table(c(2,3,4,4,4,5))
test(1626.01, fintersect(x, y), data.table(c(2,3,4))) # intersect
test(1626.02, fintersect(x, y, all=TRUE), data.table(c(2,3,4,4))) # intersect all
test(1626.03, fsetdiff(x, y), data.table(c(1))) # setdiff (except)
test(1626.04, fsetdiff(x, y, all=TRUE), data.table(c(1,2,2))) # setdiff all (except all)
test(1626.05, funion(x, y), data.table(c(1,2,3,4,5))) # union
test(1626.06, funion(x, y, all=TRUE), data.table(c(1,2,2,2,3,4,4,2,3,4,4,4,5))) # union all
test(1626.07, fsetequal(x, y), FALSE) # setequal
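# Illustrative sketch, not a test: with all=TRUE the set operations use multiset (bag) semantics,
# i.e. duplicates are matched by count rather than being collapsed first. (Throwaway objects below.)
ex_x = data.table(v = c(1, 1, 1, 2))
ex_y = data.table(v = c(1, 2, 2))
fintersect(ex_x, ex_y, all = TRUE)  # one 1 and one 2: the minimum of the per-value counts
fsetdiff(ex_x, ex_y, all = TRUE)    # two 1s: count in x minus count in y, floored at zero
rm(ex_x, ex_y)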
# setops check two cols
x = data.table(c(1,2,2,2,3,4,4), c(1,1,1,3,3,3,3))
y = data.table(c(2,3,4,4,4,5), c(1,1,2,3,3,3))
test(1626.08, fintersect(x, y), data.table(c(2,4), c(1,3))) # intersect
test(1626.09, fintersect(x, y, all=TRUE), data.table(c(2,4,4), c(1,3,3))) # intersect all
test(1626.10, fsetdiff(x, y), data.table(c(1,2,3), c(1,3,3))) # setdiff (except)
test(1626.11, fsetdiff(x, y, all=TRUE), data.table(c(1,2,2,3), c(1,1,3,3))) # setdiff all (except all)
test(1626.12, funion(x, y), data.table(c(1,2,2,3,4,3,4,5), c(1,1,3,3,3,1,2,3))) # union
test(1626.13, funion(x, y, all=TRUE), data.table(c(1,2,2,2,3,4,4,2,3,4,4,4,5), c(1,1,1,3,3,3,3,1,1,2,3,3,3))) # union all
test(1626.14, fsetequal(x, y), FALSE) # setequal
# setops on unique sets
x = unique(x)
y = unique(y)
test(1626.15, fintersect(x, y), data.table(c(2,4), c(1,3))) # intersect
test(1626.16, fintersect(x, y, all=TRUE), data.table(c(2,4), c(1,3))) # intersect all
test(1626.17, fsetdiff(x, y), data.table(c(1,2,3), c(1,3,3))) # setdiff (except)
test(1626.18, fsetdiff(x, y, all=TRUE), data.table(c(1,2,3), c(1,3,3))) # setdiff all (except all)
test(1626.19, funion(x, y), data.table(c(1,2,2,3,4,3,4,5), c(1,1,3,3,3,1,2,3))) # union
test(1626.20, funion(x, y, all=TRUE), data.table(c(1,2,2,3,4,2,3,4,4,5), c(1,1,3,3,3,1,1,2,3,3))) # union all
test(1626.21, fsetequal(x, y), FALSE) # setequal
# intersect precise duplicate handling
dt = data.table(a=1L)
test(1626.22, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,0)])), 0L)
test(1626.23, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,0)], all=TRUE)), 0L)
test(1626.24, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,1)])), 1L)
test(1626.25, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,1)], all=TRUE)), 1L)
test(1626.26, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,2)])), 1L)
test(1626.27, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,2)], all=TRUE)), 2L)
test(1626.28, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,3)])), 1L)
test(1626.29, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,3)], all=TRUE)), 3L)
test(1626.30, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,4)])), 1L)
test(1626.31, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,4)], all=TRUE)), 4L)
test(1626.32, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,5)])), 1L)
test(1626.33, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,5)], all=TRUE)), 4L)
# setdiff precise duplicate handling
dt = data.table(a=1L)
test(1626.34, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,0)])), 1L)
test(1626.35, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,0)], all=TRUE)), 4L)
test(1626.36, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,1)])), 0L)
test(1626.37, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,1)], all=TRUE)), 3L)
test(1626.38, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,2)])), 0L)
test(1626.39, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,2)], all=TRUE)), 2L)
test(1626.40, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,3)])), 0L)
test(1626.41, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,3)], all=TRUE)), 1L)
test(1626.42, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,4)])), 0L)
test(1626.43, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,4)], all=TRUE)), 0L)
test(1626.44, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,5)])), 0L)
test(1626.45, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,5)], all=TRUE)), 0L)
# unsupported type in set-ops: list (except UNION ALL)
dt = data.table(V1 = 1:4, V2 = letters[1:4], V3 = lapply(1:4, function(x) new.env()))
x = dt[c(2:4,2L,2L)]
y = dt[c(1:3,2L)]
test(1626.46, fintersect(x, y), error = "unsupported column type found in x or y: [list]")
test(1626.47, fintersect(x, y, all=TRUE), error = "unsupported column type found in x or y: [list]")
test(1626.48, fsetdiff(x, y), error = "unsupported column type found in x or y: [list]")
test(1626.49, fsetdiff(x, y, all=TRUE), error = "unsupported column type found in x or y: [list]")
test(1626.50, funion(x, y), error = "unsupported column type found in x or y: [list]")
test(1626.51, funion(x, y, all=TRUE), dt[c(2:4,2L,2L,1:3,2L)])
test(1626.52, fsetequal(x, y), error = "unsupported column type found in x or y: [list]")
test(1626.53, fsetequal(dt[c(1:2,2L)], dt[c(1:2,2L)]), error = "unsupported column type found in x or y: [list]")
# unsupported type in set-ops: complex, raw
dt = data.table(V1 = 1:4, V2 = letters[1:4], V3 = as.complex(1:4), V4 = as.raw(1:4), V5 = lapply(1:4, function(x) NULL))
x = dt[c(2:4,2L,2L)]
y = dt[c(1:3,2L)]
test(1626.54, fintersect(x, y), error = "unsupported column types found in x or y: [raw, complex, list]")
test(1626.55, fintersect(x, y, all=TRUE), error = "unsupported column types found in x or y: [raw, complex, list]")
test(1626.56, fsetdiff(x, y), error = "unsupported column types found in x or y: [raw, complex, list]")
test(1626.57, fsetdiff(x, y, all=TRUE), error = "unsupported column types found in x or y: [raw, complex, list]")
test(1626.58, funion(x, y), error = "unsupported column types found in x or y: [raw, complex, list]")
test(1626.59, funion(x, y, all=TRUE), error = "unsupported column types found in x or y: [raw, complex]") # no 'list' here which is supported for `all=TRUE`
test(1626.60, fsetequal(x, y), error = "unsupported column types found in x or y: [raw, complex, list]")
test(1626.61, fsetequal(dt[c(1:2,2L)], dt[c(1:2,2L)]), error = "unsupported column types found in x or y: [raw, complex, list]")
# supported types multi column test
dt = data.table(
V1 = 1:4,
V2 = as.numeric(1:4),
V3 = letters[rep(1:2, 2)],
V4 = factor(c("a","a","b","b")),
V5 = as.POSIXct("2016-03-05 12:00:00", origin="1970-01-01")+(1:4)*3600,
V6 = as.Date("2016-03-05", origin="1970-01-01")+(1:4)
)[, V7 := as.IDate(V6)
][, V8 := as.ITime(V5)]
x = dt[c(2:4,2L,2L)]
y = dt[c(1:3,2L)]
test(1626.62, fintersect(x, y), dt[2:3])
test(1626.63, fintersect(x, y, all=TRUE), dt[c(2:3,2L)])
test(1626.64, fsetdiff(x, y), dt[4L])
test(1626.65, fsetdiff(x, y, all=TRUE), dt[c(4L,2L)])
test(1626.66, funion(x, y), dt[c(2:4,1L)])
test(1626.67, funion(x, y, all=TRUE), dt[c(2:4,2L,2L,1:3,2L)])
test(1626.68, fsetequal(x, y), FALSE)
test(1626.69, fsetequal(dt[c(2:3,3L)], dt[c(2:3,3L)]), TRUE)
# supported type in set-ops: integer64
if (test_bit64) {
dt = data.table(V1 = 1:4, V2 = letters[1:4], V3 = bit64::as.integer64("90000000000")+1:4)
x = dt[c(2:4,2L,2L)]
y = dt[c(1:3,2L)]
test(1626.70, fintersect(x, y), dt[2:3])
test(1626.71, fintersect(x, y, all=TRUE), dt[c(2:3,2L)])
test(1626.72, fsetdiff(x, y), dt[4L])
test(1626.73, fsetdiff(x, y, all=TRUE), dt[c(4L,2L)])
test(1626.74, funion(x, y), dt[c(2:4,1L)])
test(1626.75, funion(x, y, all=TRUE), dt[c(2:4,2L,2L,1:3,2L)])
test(1626.76, fsetequal(x, y), FALSE)
test(1626.77, fsetequal(dt[c(2:3,3L)], dt[c(2:3,3L)]), TRUE)
}
# fix for #2968 fsetequal with all = FALSE should treat rows as set elements
x = data.table(c(1,2,2,2,3,4,4), c(1,1,1,3,3,3,3))
x2 = unique(x)
y = data.table(c(2,3,4,4,4,5), c(1,1,2,3,3,3))
test(1626.79, fsetequal(x, x2, all = FALSE), TRUE)
test(1626.80, fsetequal(x, y, all = FALSE), FALSE)
# unit test for #3133
DT = data.table(A=1)
test(1626.81, funion(DT, DT, all=TRUE), data.table(A=c(1, 1)))
x = data.table(c(1,2,2,2,3,4,4))
y = data.table(c(2,3,4,4,4,5))
# with .set_ops_arg_check, these tests will also cover the similar
# cases of fsetdiff, funion, fsetequal
test(1626.82, fintersect(x, y, all = 1+3i), error = "'all' should be logical")
test(1626.83, fintersect(x, y, all = c(TRUE, FALSE)), error = 'logical of length one')
setDF(x)
test(1626.84, fintersect(x, y), error = 'x and y must both be data.tables')
setDT(x)
setDF(y)
test(1626.85, fintersect(x, y), error = 'x and y must both be data.tables')
setDT(y)
y[ , b := 2]
test(1626.86, fintersect(x, y), error = 'x and y must have the same column names')
x[ , b := 2]
setcolorder(x, 2:1)
test(1626.87, fintersect(x, y), error = 'x and y must have the same column order')
setcolorder(x, 2:1)
x[ , b := .POSIXct(b)]
test(1626.88, fintersect(x, y), error = "Item 2 of x is 'POSIXct' but the corresponding item of y is 'numeric'")
x[ , b := NULL]
y[ , b := NULL]
setnames(x, '.seqn')
setnames(y, '.seqn')
test(1626.89, fintersect(x, y), error = "column named '.seqn'")
# empty x short-circuit
x = y = data.table(a = 1)
test(1626.90, fsetdiff(x[0L], y), x[0L])
# slightly relaxed .set_ops_arg_check.
# Since v1.12.4, i's type is retained which caused corpustools to fail via this fsetdiff.
DT = data.table(id=factor(c("a","b","b","c")), v=1:4, key="id")
test(1626.91, fsetdiff(DT, DT["b"]), DT[c(1,4)])
# fix for #1087 and #1465
test(1627.1, charToRaw(names(fread(testDir("issue_1087_utf8_bom.csv")))[1L]), as.raw(97L))
test(1627.2, names(fread(testDir("issue_1087_utf8_bom.csv"), verbose=TRUE))[1L], "a", output="UTF-8 byte order mark EF BB BF found")
test(1627.3, names(fread(testDir("gb18030.txt")))[1L], "x", warning="GB-18030 encoding detected")
test(1627.4, fread(testDir("utf16le.txt")), error="File is encoded in UTF-16")
test(1627.5, fread(testDir("utf16be.txt")), error="File is encoded in UTF-16")
# uniqueN gains na.rm argument, #1455
set.seed(1L)
dt = data.table(x=sample(c(1:3,NA),25,TRUE), y=sample(c(NA,"a", "b"), 25,TRUE), z=sample(2,25,TRUE))
test(1628.1, uniqueN(dt, by=1:2, na.rm=TRUE), nrow(na.omit(dt[, .N, by=.(x,y)])))
test(1628.2, uniqueN(dt, na.rm=TRUE), nrow(na.omit(dt[, .N, by=.(x,y,z)])))
test(1628.3, dt[, uniqueN(y, na.rm=TRUE), by=z], dt[, length(unique(na.omit(y))), by=z])
test(1628.4, dt[, uniqueN(.SD, na.rm=TRUE), by=z], dt[, nrow(na.omit(.SD[, .N, by=.(x,y)])), by=z])
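# Illustrative sketch, not a test: na.rm=TRUE removes NAs before counting distinct values
# (or, for a data.table, removes rows containing NA in the counted columns). (ex is throwaway.)
ex = c(1L, 1L, NA, 2L)
uniqueN(ex)               # 3: NA counts as one distinct value
uniqueN(ex, na.rm = TRUE) # 2
rm(ex)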
# fix for long standing FR/bug, #495
# most likely I'm missing some tests, but we'll fix/add them as we go along.
dt = data.table(grp=c(2,3,3,1,1,2,3), v1=1:7, v2=7:1, v3=10:16)
test(1629.01, dt[, .SD*v1, .SDcols=v2:v3], dt[, .(v2=v2*v1, v3=v3*v1)])
test(1629.02, dt[, lapply(.SD, function(x) x*v1), .SDcols=v2:v3], dt[, .(v2=v2*v1, v3=v3*v1)])
test(1629.03, dt[, lapply(.SD, function(x) mean(x)*sum(v1)), .SDcols=v2:v3], data.table(v2=112, v3=364))
test(1629.04, dt[, c(sum(v1), lapply(.SD, mean)), .SDcols=v2:v3], data.table(V1=28L, v2=4, v3=13))
test(1629.05, dt[, c(v1=sum(v1), lapply(.SD, mean)), .SDcols=v2:v3], data.table(v1=28L, v2=4, v3=13))
test(1629.06, dt[, .(v1=sum(v1), lapply(.SD, mean)), .SDcols=v2:v3], data.table(v1=28L, V2=list(4,13)))
test(1629.07, dt[0][, .SD*v1, .SDcols=v2:v3], dt[0][, .SD, .SDcols=v2:v3])
# add/update
dt2 = copy(dt)
test(1629.08, dt2[, c("v2", "v3") := .SD*v1, .SDcols=v2:v3], dt[, .(grp, v1, v2=v2*v1, v3=v3*v1)])
# grouping operations
options(datatable.optimize = 1L) # no gforce
test(1629.09, dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL])
ans1 = dt[, sum(v1), by=grp]
ans2 = dt[, base::max(.SD), by=grp, .SDcols=v2:v3]
test(1629.10, dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)])
test(1629.11, dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp],
dt[, .(v1=weighted.mean(v1,w=v2), v3=weighted.mean(v3, w=v2)), by=grp])
test(1629.12, dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp])
# gforce
options(datatable.optimize = Inf) # Inf
test(1629.13, dt[, c(v1=max(v1), lapply(.SD, min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp])
# even more complex, shouldn't run any optimisation
dt[, v4 := v1/2]
test(1629.14, dt[, c(.(v1=v1*min(v4)), lapply(.SD, function(x) x*max(v4))), by=grp, .SDcols=v2:v3],
dt[, .(v1=v1*min(v4), v2=v2*max(v4), v3=v3*max(v4)), by=grp])
test(1629.15, copy(dt)[, c("a", "b", "c") := c(min(v1), lapply(.SD, function(x) max(x)*min(v1))), by=grp, .SDcols=v3:v4],
copy(dt)[, c("a", "b", "c") := .(min(v1), max(v3)*min(v1), max(v4)*min(v1)), by=grp])
options(datatable.optimize = Inf)
# by=.EACHI and operations with 'i'
test(1629.16, dt[.(c(2,3)), c(.(sum(v1)), lapply(.SD, function(x) mean(x)*min(v1))), by=.EACHI, .SDcols=v2:v3, on="grp"],
dt[grp %in% 2:3, c(.(sum(v1)), lapply(.SD, function(x) mean(x)*min(v1))), by=grp, .SDcols=v2:v3])
test(1629.17, dt[.(2:3), c(sum(v1), lapply(.SD, function(x) mean(x)*v1)), .SDcols=v2:v3, on="grp"][order(V1,v2,v3)],
dt[grp %in% 2:3, c(sum(v1), lapply(.SD, function(x) mean(x)*v1)), .SDcols=v2:v3][order(V1,v2,v3)])
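# Illustrative sketch, not a test (#495): columns outside .SDcols (here v1) can be referenced
# alongside .SD in j, both row-wise and within by= groups. (ex is a throwaway object.)
ex = data.table(g = c(1, 1, 2), v1 = 1:3, v2 = 4:6, v3 = 7:9)
ex[, .SD * v1, .SDcols = v2:v3]                                       # row-wise, no grouping
ex[, lapply(.SD, function(x) x * sum(v1)), by = g, .SDcols = v2:v3]   # per-group scaling
rm(ex)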
# #759, add new cols on :=
dt1 <- data.table(id = 1:2, x = 3:4)
dt2 <- data.table(id = 3:4, y = c(5,6))
# when updating using :=, nomatch = 0 or NA should make no difference; i.e. new columns should always
# be added. Otherwise the number of columns in the result would inconsistently depend on the data.
ans = copy(dt1)[,z:=NA_real_] # NA_real_ because :=2 below is type double
test(1630.01, copy(dt1)[id>5, z:=2, nomatch=0L], ans, warning="ignoring nomatch")
test(1630.02, copy(dt1)[dt2, z:=2, on="id", nomatch=0L], ans, warning="ignoring nomatch")
test(1630.03, copy(dt1)[dt2, z:=y, on="id", nomatch=0L], ans, warning="ignoring nomatch")
test(1630.04, copy(dt1)[dt2, z:=y, on="id", by=.EACHI, nomatch=0L], ans, warning="ignoring nomatch")
test(1630.05, copy(dt1)[id>5, z:=2, nomatch=NA], ans, warning="ignoring nomatch")
test(1630.06, copy(dt1)[dt2, z:=2, on="id", nomatch=NA], ans, warning="ignoring nomatch")
test(1630.07, copy(dt1)[dt2, z:=y, on="id", nomatch=NA], ans, warning="ignoring nomatch")
test(1630.08, copy(dt1)[dt2, z:=y, on="id", by=.EACHI, nomatch=NA], ans, warning="ignoring nomatch")
test(1630.09, copy(dt1)[id>5, z:=2L, nomatch=0L], copy(dt1)[,z:=NA_integer_], warning="ignoring nomatch")
test(1630.10, copy(dt1)[id>5, z:=2L, nomatch=NA], copy(dt1)[,z:=NA_integer_], warning="ignoring nomatch")
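# Illustrative sketch, not a test: when j is :=, a new column is always created even if nothing
# in i matches, so the shape of the result does not depend on the data. (ex is a throwaway object.)
ex = data.table(id = 1:2)
ex[id > 5L, z := 2]   # no rows match; z is still added, filled with NA
rm(ex)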
# fix for #1268, on= retains keys correctly.
A = data.table(site=rep(c("A","B"), each=3), date=rep(1:3, times=2), x=rep(1:3*10, times=2), key="site,date")
B = data.table(x=c(10,20), y=c(100,200), key="x")
test(1631, key(A[B, on="x"]), NULL)
# fix for #1479, secondary keys are removed when necessary
TFvec = c(FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE)
dt = data.table(a = rep(TFvec, 3), b = c("x", "y", "z"))
setindex(dt, a)
dt[, a := as.logical(sum(a)), by = b]
test(1632.1, names(attributes(attr(dt, 'index'))), NULL)
dt = data.table(a = rep(TFvec, 3), b = c("x", "y", "z"))
setindex(dt, b)
dt[, a := as.logical(sum(a)), by = b]
test(1632.2, names(attributes(attr(dt, 'index'))), "__b")
dt = data.table(a = rep(TFvec, 3), b = c("x", "y", "z"))
test(1632.3, copy(dt)[, c := !a, by=b], copy(dt)[, c := rep(!TFvec,3L)])
# by accepts colA:colB for interactive scenarios, #1395
dt = data.table(x=rep(1,18), y=rep(1:2, each=9), z=rep(1:3,each=6), a=rep(1:6, each=3))[, b := 6]
test(1633.1, dt[, sum(b), by=x:a], dt[, sum(b), by=.(x,y,z,a)])
test(1633.2, dt[, sum(b), by=y:a], dt[, sum(b), by=.(y,z,a)])
test(1633.3, dt[, sum(b), by=a:y], dt[, sum(b), by=.(a,z,y)])
test(1633.4, dt[, .SD, by=1:nrow(dt)], data.table(nrow=1:nrow(dt), dt)) # make sure this works
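# Illustrative sketch, not a test: by=colA:colB groups by the contiguous range of columns,
# equivalent to listing them explicitly in .(...). (ex is a throwaway object.)
ex = data.table(a = 1:4, b = rep(1:2, 2), c = rep(1:2, each = 2), d = 1)
ex[, sum(d), by = b:c]   # same grouping as by=.(b, c)
rm(ex)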
# reuse secondary indices
dt = data.table(x=sample(3, 10, TRUE), y=1:10)
v1 = capture.output(ans1 <- dt[.(3:2), on="x", verbose=TRUE])
setindex(dt, x)
v2 = capture.output(ans2 <- dt[.(3:2), on="x", verbose=TRUE])
test(1634.1, any(grepl("ad hoc", v1)), TRUE)
test(1634.2, any(grepl("existing index", v2)), TRUE)
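# Illustrative sketch, not a test: an on= join with no usable index builds an ad hoc one each
# time; after setindex() the stored secondary index is reused. (ex is a throwaway object.)
ex = data.table(x = c(2L, 1L, 3L), y = 1:3)
setindex(ex, x)
indices(ex)           # "x"
ex[.(3L), on = "x"]   # with verbose=TRUE this would report that the existing index is used
rm(ex)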
# fread's fill argument detects separator better in complex cases as well, #1573
# if pasted into the console these tests won't work, but they do work when sourced because the separators in 'text' are tabs, not spaces
text = "a b c d e f g h i j k l\n1 P P;A;E; Y YW; H(). 1-3 pro\n2 Q9 a;a;a;a; YB YH; M(). 13 pn ba\n1 P3 P; Y Y; R(). 14 p\n53 P P6;B;D;0;5;a;X;a;4R; Y Y; H(). 13 pe e\n1 P P;O;O;a;a;a; HLA-A HLA-A;; H(). HcIha,A-n\n102 P P;O;P;P;P;P;P;P;a;a;a;a;a;a;a;a;a;a; H-A H-A;; H(). HcIha,A"
test(1635.1, ans1 <- fread(text, fill=TRUE), setDT(read.table(text=text, stringsAsFactors=FALSE, fill=TRUE, sep="\t", header=TRUE)))
text = "a b c d e\n1 P P;A;E; Y YW; H(). 1-3 pro\n2 Q9 a;a;a;a; YB YH; M(). 13 pn ba\n1 P3 P; Y Y; R(). 14 p\n53 P P6;B;D;0;5;a;X;a;4R; Y Y; H(). 13 pe e\n1 P P;O;O;a;a;a; HLA-A HLA-A;; H(). HcIha,A-n\n102 P P;O;P;P;P;P;P;P;a;a;a;a;a;a;a;a;a;a; H-A H-A;; H(). HcIha,A"
test(1635.2, fread(text, fill=TRUE), setnames(ans1[, 1:7], c(letters[1:5], paste("V", 6:7, sep=""))))
# testing function type in dt, #518
dt = data.table(x=1, y=sum)
test(1636.1, class(dt$y), "list")
test(1636.2, print(dt), output="1: 1 <function[1]>")
dt = data.table(x=1:2, y=sum)
test(1636.3, class(dt$y), "list")
test(1636.4, print(dt), output="2: 2 <function[1]>")
dt = data.table(x=1:2, y=c(sum, min))
test(1636.5, class(dt$y), "list")
test(1636.6, print(dt), output="2: 2 <function[1]>")
# #484 fix (related to #495 fix above)
dt = data.table(a = 1, b = 1)
test(1637.1, dt[, data.table(a, .SD), by = cumsum(a)], data.table(cumsum=1, a=1, b=1))
test(1637.2, dt[, data.table(a, .SD), by = cumsum(a), .SDcols=a:b], data.table(cumsum=1, a=1, a=1, b=1))
test(1637.3, dt[, data.table(a, .SD), by = a], data.table(a=1,a=1,b=1))
test(1637.4, dt[, data.table(b, .SD), by = cumsum(a)], data.table(cumsum=1, b=1, b=1))
test(1637.5, dt[, data.table(a, b), by = cumsum(a)], data.table(cumsum=1, a=1, b=1))
# when datatable.optimize<1, no optimisation of j should take place:
options(datatable.optimize=0L)
dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2))
test(1638, dt[, .SD, by=z, verbose=TRUE], output="All optimizations are turned off")
options(datatable.optimize=Inf)
#1389 - split.data.table - big chunk of unit tests
set.seed(123)
dt = data.table(x1 = rep(letters[1:2], 6), x2 = rep(letters[3:5], 4), x3 = rep(letters[5:8], 3), y = rnorm(12))
dt = dt[sample(.N)]
df = as.data.frame(dt)
# - [x] split by factor the same as `split.data.frame` - `f` argument ----
test(1639.001, lapply(split(df, as.factor(1:2)), setDT), split(dt, as.factor(1:2))) # drop=FALSE on same factor
test(1639.002, lapply(split(df, as.factor(1:2), drop=TRUE), setDT), split(dt, as.factor(1:2), drop=TRUE)) # drop=TRUE on same factor
test(1639.003, lapply(split(df, as.factor(1:4)[3:2]), setDT), split(dt, as.factor(1:4)[3:2])) # drop=FALSE on same factor with empty levels
test(1639.004, lapply(split(df, as.factor(1:4)[3:2], drop=TRUE), setDT), split(dt, as.factor(1:4)[3:2], drop=TRUE)) # drop=TRUE on same factor with empty levels
test(1639.005, lapply(split(df, as.factor(1:12)), setDT), split(dt, as.factor(1:12))) # drop=FALSE factor length of nrow
test(1639.006, lapply(split(df, as.factor(1:12), drop=TRUE), setDT), split(dt, as.factor(1:12), drop=TRUE)) # drop=TRUE factor length of nrow
ord = sample(2:13)
test(1639.007, lapply(split(df, as.factor(1:14)[ord]), setDT), split(dt, as.factor(1:14)[ord])) # drop=FALSE factor length of nrow with empty levels
test(1639.008, lapply(split(df, as.factor(1:14)[ord], drop=TRUE), setDT), split(dt, as.factor(1:14)[ord], drop=TRUE)) # drop=TRUE factor length of nrow with empty levels
test(1639.009, lapply(split(df, list(as.factor(1:2), as.factor(3:2))), setDT), split(dt, list(as.factor(1:2), as.factor(3:2)))) # `f` list object drop=FALSE
test(1639.010, lapply(split(df, list(as.factor(1:2), as.factor(3:2)), drop=TRUE), setDT), split(dt, list(as.factor(1:2), as.factor(3:2)), drop=TRUE)) # `f` list object drop=TRUE
test(1639.011, split(dt, as.factor(integer())), error = "group length is 0 but data nrow > 0") # factor length 0L
test(1639.012, split(dt, as.factor(integer()), drop=TRUE), error = "group length is 0 but data nrow > 0")
test(1639.013, split(dt, as.factor(1:2)[0L]), error = "group length is 0 but data nrow > 0") # factor length 0L with empty levels
test(1639.014, split(dt, as.factor(1:2)[0L], drop=TRUE), error = "group length is 0 but data nrow > 0")
# - [x] edge cases for `f` argument ----
test(1639.015, split(df, as.factor(NA)), split(dt, as.factor(NA))) # factor NA
test(1639.016, split(df, as.factor(NA), drop=TRUE), split(dt, as.factor(NA), drop=TRUE))
test(1639.017, lapply(split(df, as.factor(1:2)[0L][1L]), setDT), split(dt, as.factor(1:2)[0L][1L])) # factor NA with empty levels
test(1639.018, split(df, as.factor(1:2)[0L][1L], drop=TRUE), split(dt, as.factor(1:2)[0L][1L], drop=TRUE))
test(1639.019, lapply(split(df, as.factor(c(1L,NA,2L))), setDT), split(dt, as.factor(c(1L,NA,2L)))) # factor has NA
test(1639.020, lapply(split(df, as.factor(c(1L,NA,2L)), drop=TRUE), setDT), split(dt, as.factor(c(1L,NA,2L)), drop=TRUE))
test(1639.021, lapply(split(df, as.factor(c(1L,NA,2:4))[1:3]), setDT), split(dt, as.factor(c(1L,NA,2:4))[1:3])) # factor has NA with empty levels
test(1639.022, lapply(split(df, as.factor(c(1L,NA,2:4))[1:3], drop=TRUE), setDT), split(dt, as.factor(c(1L,NA,2:4))[1:3], drop=TRUE))
test(1639.023, lapply(split(df, letters[c(1L,NA,2L)]), setDT), split(dt, letters[c(1L,NA,2L)])) # character as `f` arg
test(1639.024, lapply(split(df, letters[c(1L,NA,2L)], drop=TRUE), setDT), split(dt, letters[c(1L,NA,2L)], drop=TRUE))
test(1639.025, lapply(split(df, "z"), setDT), split(dt, "z")) # character as `f` arg, length 1L
test(1639.026, lapply(split(df, "z", drop=TRUE), setDT), split(dt, "z", drop=TRUE))
test(1639.027, lapply(split(df, letters[c(1L,NA)]), setDT), split(dt, letters[c(1L,NA)])) # character as `f` arg, length 1L of non-NA
test(1639.028, lapply(split(df, letters[c(1L,NA)], drop=TRUE), setDT), split(dt, letters[c(1L,NA)], drop=TRUE))
test(1639.029, lapply(split(df[0L,], "z"), setDT), split(dt[0L], "z")) # nrow 0, f length 1-2
test(1639.030, lapply(split(df[0L,], c("z1","z2")), setDT), split(dt[0L], c("z1","z2")))
test(1639.031, lapply(split(df[0L,], "z", drop=TRUE), setDT), split(dt[0L], "z", drop=TRUE))
test(1639.032, lapply(split(df[0L,], c("z1","z2"), drop=TRUE), setDT), split(dt[0L], c("z1","z2"), drop=TRUE))
test(1639.033, lapply(split(df[1L,], "z"), setDT), split(dt[1L], "z")) # nrow 1, f length 1-2
test(1639.034, lapply(suppressWarnings(split(df[1L,], c("z1","z2"))), setDT), suppressWarnings(split(dt[1L], c("z1","z2"))))
test(1639.035, lapply(split(df[1L,], "z", drop=TRUE), setDT), split(dt[1L], "z", drop=TRUE) )
test(1639.036, lapply(suppressWarnings(split(df[1L,], c("z1","z2"), drop=TRUE)), setDT), suppressWarnings(split(dt[1L], c("z1","z2"), drop=TRUE)))
test(1639.037, lapply(split(df[0L,], as.factor(NA_character_)), setDT), split(dt[0L], as.factor(NA_character_))) # nrow 0, f factor length 1L NA
test(1639.038, lapply(split(df[0L,], as.factor(NA_character_), drop=TRUE), setDT), split(dt[0L], as.factor(NA_character_), drop=TRUE))
test(1639.039, lapply(split(df[0L,], as.factor(1:2)[0L][1L]), setDT), split(dt[0L], as.factor(1:2)[0L][1L])) # nrow 0, f factor length 1L NA with empty levels
test(1639.040, lapply(split(df[0L,], as.factor(1:2)[0L][1L], drop=TRUE), setDT), split(dt[0L], as.factor(1:2)[0L][1L], drop=TRUE))
test(1639.041, lapply(split(df[0L,], as.factor(integer())), setDT), split(dt[0L], as.factor(integer()))) # nrow 0, f factor length 0L
test(1639.042, lapply(split(df[0L,], as.factor(integer()), drop=TRUE), setDT), split(dt[0L], as.factor(integer()), drop=TRUE))
test(1639.043, lapply(split(df[0L,], as.factor(1:2)[0L]), setDT), split(dt[0L], as.factor(1:2)[0L])) # nrow 0, f factor length 0L with empty levels
test(1639.044, lapply(split(df[0L,], as.factor(1:2)[0L], drop=TRUE), setDT), split(dt[0L], as.factor(1:2)[0L], drop=TRUE))
test(1639.045, lapply(split(df[0L,], as.factor(1:3)[c(2L,NA,3L)]), setDT), split(dt[0L], as.factor(1:3)[c(2L,NA,3L)])) # nrow 0, f factor with empty levels and NA
test(1639.046, lapply(split(df[0L,], as.factor(1:3)[c(2L,NA,3L)], drop=TRUE), setDT), split(dt[0L], as.factor(1:3)[c(2L,NA,3L)], drop=TRUE))
test(1639.047, lapply(split(df[0L,], NA_character_), setDT), split(dt[0L], NA_character_)) # nrow 0, f character length 1L NA
test(1639.048, lapply(split(df[0L,], NA_character_, drop=TRUE), setDT), split(dt[0L], NA_character_, drop=TRUE))
test(1639.049, lapply(split(df[0L,], letters[c(NA,1:3)]), setDT), split(dt[0L], letters[c(NA,1:3)])) # nrow 0, f length > 1L, with NA
test(1639.050, lapply(split(df[0L,], letters[c(NA,1:3)], drop=TRUE), setDT), split(dt[0L], letters[c(NA,1:3)], drop=TRUE))
# - [x] split by reference to column names - `by` - for factor column ----
fdt = dt[, c(lapply(.SD, as.factor), list(y=y)), .SDcols=x1:x3]
l = split(fdt, by = "x1", flatten=FALSE) # single col
test(1639.051, TRUE, all(is.list(l), identical(names(l), c("b","a")), sapply(l, is.data.table), sapply(l, nrow) == c(b=6L, a=6L), sapply(l, ncol) == c(b=4L, a=4L)))
l = split(fdt, by = "x2", flatten=FALSE)
test(1639.052, TRUE, all(is.list(l), identical(names(l), c("d","e","c")), sapply(l, is.data.table), sapply(l, nrow) == c(d=4L, e=4L, c=4L), sapply(l, ncol) == c(d=4L, e=4L, c=4L)))
l = split(fdt, by = "x3", flatten=FALSE)
test(1639.053, TRUE, all(is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, is.data.table), sapply(l, nrow) == c(h=3L, f=3L, g=3L, e=3L), sapply(l, ncol) == c(h=4L, f=4L, g=4L, e=4L)))
l = split(fdt, by = c("x1","x2"), flatten=FALSE) # multi col
test(1639.054, TRUE, all(
is.list(l), identical(names(l), c("b","a")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(b=c("d","e","c"), a=c("e","d","c"))),
sapply(l, sapply, nrow) == rep(2L, 6),
sapply(l, sapply, ncol) == rep(4L, 6)
))
l = split(fdt, by = c("x1","x3"), flatten=FALSE) # empty levels appear due to subsetting x3 within x1 groups
test(1639.055, TRUE, all(
is.list(l), identical(names(l), c("b","a")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(b=c("h","f","e","g"), a=c("g","e","f","h"))),
sapply(l, sapply, nrow) == rep(c(3L,3L,0L,0L), 2),
sapply(l, sapply, ncol) == rep(4L, 8)
))
l = split(fdt, by = c("x2","x3"), flatten=FALSE)
test(1639.056, TRUE, all(
is.list(l), identical(names(l), c("d","e","c")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(d=c("h","f","e","g"), e=c("h","f","g","e"), c=c("f","h","e","g"))),
sapply(l, sapply, nrow) == rep(1L, 12),
sapply(l, sapply, ncol) == rep(4L, 4)
))
l = split(fdt, by = c("x1","x2","x3"), flatten=FALSE) # empty levels in x3 after subset are expanded
test(1639.057, TRUE, all(
is.list(l), identical(names(l), c("b","a")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, lapply, names), list(b=list(d=c("h","f","e","g"), e=c("h","f","e","g"), c=c("f","h","e","g")), a=list(e=c("g","e","f","h"), d=c("e","g","f","h"), c=c("e","g","f","h")))),
sapply(l, sapply, sapply, nrow) == rep(c(1L,1L,0L,0L), 6),
sapply(l, sapply, sapply, ncol) == rep(4L, 24)
))
l = split(fdt, by = c("x3","x1"), drop=TRUE, flatten=FALSE) # multi col rev
test(1639.058, TRUE, all(
is.list(l), identical(names(l), c("h","f","g","e")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(h=c("b"), f=c("b"), g=c("a"), e=c("a"))),
sapply(l, sapply, nrow) == rep(3L, 4),
sapply(l, sapply, ncol) == rep(4L, 8)
))
l = split(fdt, by = c("x3","x1"), flatten=FALSE) # x1 has empty levels after split on x3 first
test(1639.059, TRUE, all(
is.list(l), identical(names(l), c("h","f","g","e")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(h=c("b","a"), f=c("b","a"), g=c("a","b"), e=c("a","b"))),
sapply(l, sapply, nrow) == rep(c(3L,0L), 4),
sapply(l, sapply, ncol) == rep(4L, 8)
))
l = split(fdt, by = c("x3","x2","x1"), drop = TRUE, flatten=FALSE)
test(1639.060, TRUE, all(
is.list(l), identical(names(l), c("h","f","g","e")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, lapply, names), list(h=list(d=c("b"), e=c("b"), c=c("b")), f=list(e=c("b"), c=c("b"), d=c("b")), g=list(e=c("a"), d=c("a"), c=c("a")), e=list(e=c("a"), d=c("a"), c=c("a")))),
sapply(l, sapply, sapply, nrow) == rep(1L, 12),
sapply(l, sapply, sapply, ncol) == rep(4L, 12)
))
sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x3)) # split.data.frame match
test(1639.061, unlist(split(fdt, by = c("x1","x3"), sorted = TRUE, flatten=FALSE), recursive = FALSE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=FALSE
sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x3), drop=TRUE)
test(1639.062, unlist(split(fdt, by = c("x1","x3"), sorted = TRUE, drop=TRUE, flatten=FALSE), recursive = FALSE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=TRUE
fdt = dt[, .(x1 = as.factor(c(as.character(x1), "c"))[-13L], # empty levels in factor and drop=FALSE
x2 = as.factor(c("a", as.character(x2)))[-1L],
x3 = as.factor(c("a", as.character(x3), "z"))[c(-1L,-14L)],
y = y)]
l = split(fdt, by = "x1")
test(1639.063, TRUE, all(is.list(l), identical(names(l), c("b","a","c")), sapply(l, is.data.table), sapply(l, nrow) == c(b=6L, a=6L, c=0L), sapply(l, ncol) == c(b=4L, a=4L, c=4L)))
l = split(fdt, by = "x2")
test(1639.064, TRUE, all(is.list(l), identical(names(l), c("d","e","c","a")), sapply(l, is.data.table), sapply(l, nrow) == c(d=4L, e=4L, c=4L, a=0L), sapply(l, ncol) == c(d=4L, e=4L, c=4L, a=4L)))
l = split(fdt, by = c("x3","x1"), flatten=FALSE)
test(1639.065, TRUE, all(
is.list(l), identical(names(l), c("h","f","g","e","a","z")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(h=c("b","a","c"), f=c("b","a","c"), g=c("a","b","c"), e=c("a","b","c"), a=c("a","b","c"), z=c("a","b","c"))),
sapply(l, sapply, nrow) == c(rep(c(3L,0L,0L), 4), rep(0L, 6)),
sapply(l, sapply, ncol) == rep(4L, 18)
))
l = split(fdt, by = "x1", drop=TRUE) # empty levels in factor and drop=TRUE
test(1639.066, TRUE, all(is.list(l), identical(names(l), c("b","a")), sapply(l, is.data.table), sapply(l, nrow) == c(b=6L, a=6L), sapply(l, ncol) == c(b=4L, a=4L)))
l = split(fdt, by = "x2", drop=TRUE)
test(1639.067, TRUE, all(is.list(l), identical(names(l), c("d","e","c")), sapply(l, is.data.table), sapply(l, nrow) == c(d=4L, e=4L, c=4L), sapply(l, ncol) == c(d=4L, e=4L, c=4L)))
l = split(fdt, by = c("x3","x1"), drop=TRUE, flatten=FALSE)
test(1639.068, TRUE, all(
is.list(l), identical(names(l), c("h","f","g","e")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(h=c("b"), f=c("b"), g=c("a"), e=c("a"))),
sapply(l, sapply, nrow) == rep(3L, 4),
sapply(l, sapply, ncol) == rep(4L, 4)
))
l = split(fdt, by = c("x3","x1"), sorted=TRUE, flatten=FALSE) # test order for empty levels in factor and drop=FALSE
test(1639.069, TRUE, all(
is.list(l), identical(names(l), c("a","e","f","g","h","z")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), setNames(rep(list(c("a","b","c")), 6), c("a","e","f","g","h","z"))),
sapply(l, sapply, nrow) == c(0L,0L,0L,3L,0L,0L,0L,3L,0L,3L,0L,0L,0L,3L,0L,0L,0L,0L),
sapply(l, sapply, ncol) == rep(4L, 18)
))
l = split(fdt, by = c("x3","x1"), sorted=TRUE, drop=TRUE, flatten=FALSE) # test order for empty levels in factor and drop=TRUE
test(1639.070, TRUE, all(
is.list(l), identical(names(l), c("e","f","g","h")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(e=c("a"), f=c("b"), g=c("a"), h=c("b"))),
sapply(l, sapply, nrow) == rep(3L, 4),
sapply(l, sapply, ncol) == rep(4L, 4)
))
sdf = split(as.data.frame(fdt), list(fdt$x3, fdt$x1)) # split.data.frame match on by = 2L and empty levels, drop=FALSE
test(1639.071, unlist(split(fdt, by = c("x3","x1"), sorted=TRUE, flatten=FALSE), recursive = FALSE), lapply(sdf[sort(names(sdf))], setDT))
sdf = split(as.data.frame(fdt), list(fdt$x3, fdt$x1), drop=TRUE) # split.data.frame match on by = 2L and empty levels, drop=TRUE
test(1639.072, unlist(split(fdt, by = c("x3","x1"), sorted=TRUE, drop=TRUE, flatten=FALSE), recursive = FALSE), lapply(sdf[sort(names(sdf))], setDT))
# - [x] split by reference to column names - `by` - factor and character column ----
fdt = dt[, .(x1 = x1,
x2 = x2,
x3 = as.factor(x3),
y = y)]
l = split(fdt, by = c("x2","x3"), flatten=FALSE)
test(1639.073, TRUE, all(
is.list(l), identical(names(l), c("d","e","c")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(d=c("h","f","e","g"), e=c("h","f","g","e"), c=c("f","h","e","g"))),
sapply(l, sapply, nrow) == rep(1L, 12),
sapply(l, sapply, ncol) == rep(4L, 4)
))
l = split(fdt, by = c("x1","x2","x3"), flatten=FALSE) # empty levels in x3 after subset on x1, x2
test(1639.074, TRUE, all(
is.list(l), identical(names(l), c("b","a")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, lapply, names), list(b=list(d=c("h","f","e","g"), e=c("h","f","e","g"), c=c("f","h","e","g")), a=list(e=c("g","e","f","h"), d=c("e","g","f","h"), c=c("e","g","f","h")))),
sapply(l, sapply, sapply, nrow) == rep(c(1L,1L,0L,0L), 6),
sapply(l, sapply, sapply, ncol) == rep(4L, 24)
))
l = split(fdt, by = c("x1","x2","x3"), drop=TRUE, flatten=FALSE)
test(1639.075, TRUE, all(
is.list(l), identical(names(l), c("b","a")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, lapply, names), list(b=list(d=c("h","f"), e=c("h","f"), c=c("f","h")), a=list(e=c("g","e"), d=c("e","g"), c=c("e","g")))),
sapply(l, sapply, sapply, nrow) == rep(1L, 12),
sapply(l, sapply, sapply, ncol) == rep(4L, 12)
))
l = split(fdt, by = c("x3","x1"), flatten=FALSE) # multi col rev
test(1639.076, TRUE, all(
is.list(l), identical(names(l), c("h","f","g","e")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(h=c("b"), f=c("b"), g=c("a"), e=c("a"))),
sapply(l, sapply, nrow) == rep(3L, 4),
sapply(l, sapply, ncol) == rep(4L, 4)
))
l = split(fdt, by = c("x3","x2","x1"), flatten=FALSE)
test(1639.077, TRUE, all(
is.list(l), identical(names(l), c("h","f","g","e")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, lapply, names), list(h=list(d=c("b"), e=c("b"), c=c("b")), f=list(e=c("b"), c=c("b"), d=c("b")), g=list(e=c("a"), d=c("a"), c=c("a")), e=list(e=c("a"), d=c("a"), c=c("a")))),
sapply(l, sapply, sapply, nrow) == rep(1L, 12),
sapply(l, sapply, sapply, ncol) == rep(4L, 12)
))
fdt = dt[, .(x1 = x1, # empty levels in factor and drop=FALSE
x2 = x2,
x3 = as.factor(c("a", as.character(x3), "z"))[c(-1L,-14L)],
y = y)]
l = split(fdt, by = c("x3","x1"), flatten=FALSE) # empty levels in factor and drop=FALSE
test(1639.078, TRUE, all(
is.list(l), identical(names(l), c("h","f","g","e","a","z")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(h=c("b"), f=c("b"), g=c("a"), e=c("a"), a=character(), z=character())),
identical(lapply(l, lapply, nrow), list(h=list(b=3L), f=list(b=3L), g=list(a=3L), e=list(a=3L), a=structure(list(), .Names = character(0)), z=structure(list(), .Names = character(0)))),
identical(lapply(l, lapply, ncol), list(h=list(b=4L), f=list(b=4L), g=list(a=4L), e=list(a=4L), a=structure(list(), .Names = character(0)), z=structure(list(), .Names = character(0))))
))
l = split(fdt, by = c("x3","x1"), drop=TRUE, flatten=FALSE) # empty levels in factor and drop=TRUE
test(1639.079, TRUE, all(
is.list(l), identical(names(l), c("h","f","g","e")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(h=c("b"), f=c("b"), g=c("a"), e=c("a"))),
sapply(l, sapply, nrow) == rep(3L, 4),
sapply(l, sapply, ncol) == rep(4L, 4)
))
l = split(fdt, by = c("x3","x1"), sorted=TRUE, flatten=FALSE) # test order for empty levels in factor and drop=FALSE
test(1639.080, TRUE, all(
is.list(l), identical(names(l), c("a","e","f","g","h","z")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(a=character(), e=c("a"), f=c("b"), g=c("a"), h=c("b"), z=character())),
identical(lapply(l, lapply, nrow), list(a=structure(list(), .Names = character(0)), e=list(a=3L), f=list(b=3L), g=list(a=3L), h=list(b=3L), z=structure(list(), .Names = character(0)))),
identical(lapply(l, lapply, ncol), list(a=structure(list(), .Names = character(0)), e=list(a=4L), f=list(b=4L), g=list(a=4L), h=list(b=4L), z=structure(list(), .Names = character(0))))
))
l = split(fdt, by = c("x3","x1"), sorted=TRUE, drop=TRUE, flatten=FALSE) # test order for empty levels in factor and drop=TRUE
test(1639.081, TRUE, all(
is.list(l), identical(names(l), c("e","f","g","h")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(e=c("a"), f=c("b"), g=c("a"), h=c("b"))),
sapply(l, sapply, nrow) == rep(3L, 4),
sapply(l, sapply, ncol) == rep(4L, 4)
))
# - [x] split by reference to column names - `by` - for character column ----
l = split(dt, by = "x1") # single col
test(1639.082, TRUE, all(is.list(l), identical(names(l), c("b","a")), sapply(l, is.data.table), sapply(l, nrow) == c(b=6L, a=6L), sapply(l, ncol) == c(b=4L, a=4L)))
l = split(dt, by = "x2")
test(1639.083, TRUE, all(is.list(l), identical(names(l), c("d","e","c")), sapply(l, is.data.table), sapply(l, nrow) == c(d=4L, e=4L, c=4L), sapply(l, ncol) == c(d=4L, e=4L, c=4L)))
l = split(dt, by = "x3")
test(1639.084, TRUE, all(is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, is.data.table), sapply(l, nrow) == c(h=3L, f=3L, g=3L, e=3L), sapply(l, ncol) == c(h=4L, f=4L, g=4L, e=4L)))
l = split(dt, by = c("x1","x2"), flatten=FALSE) # multi col
test(1639.085, TRUE, all(
is.list(l), identical(names(l), c("b","a")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(b=c("d","e","c"), a=c("e","d","c"))),
sapply(l, sapply, nrow) == rep(2L, 6),
sapply(l, sapply, ncol) == rep(4L, 6)
))
l = split(dt, by = c("x1","x3"), flatten=FALSE)
test(1639.086, TRUE, all(
is.list(l), identical(names(l), c("b","a")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(b=c("h","f"), a=c("g","e"))),
sapply(l, sapply, nrow) == rep(3L, 4),
sapply(l, sapply, ncol) == rep(4L, 4)
))
l = split(dt, by = c("x2","x3"), flatten=FALSE)
test(1639.087, TRUE, all(
is.list(l), identical(names(l), c("d","e","c")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(d=c("h","f","e","g"), e=c("h","f","g","e"), c=c("f","h","e","g"))),
sapply(l, sapply, nrow) == rep(1L, 12),
sapply(l, sapply, ncol) == rep(4L, 4)
))
l = split(dt, by = c("x1","x2","x3"), flatten=FALSE)
test(1639.088, TRUE, all(
is.list(l), identical(names(l), c("b","a")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, lapply, names), list(b=list(d=c("h","f"), e=c("h","f"), c=c("f","h")), a=list(e=c("g","e"), d=c("e","g"), c=c("e","g")))),
sapply(l, sapply, sapply, nrow) == rep(1L, 12),
sapply(l, sapply, sapply, ncol) == rep(4L, 12)
))
l = split(dt, by = c("x3","x1"), flatten=FALSE) # multi col rev
test(1639.089, TRUE, all(
is.list(l), identical(names(l), c("h","f","g","e")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(h=c("b"), f=c("b"), g=c("a"), e=c("a"))),
sapply(l, sapply, nrow) == rep(3L, 4),
sapply(l, sapply, ncol) == rep(4L, 4)
))
l = split(dt, by = c("x3","x2","x1"), flatten=FALSE)
test(1639.090, TRUE, all(
is.list(l), identical(names(l), c("h","f","g","e")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, lapply, names), list(h=list(d="b", e="b", c="b"), f=list(e="b", c="b", d="b"), g=list(e="a", d="a", c="a"), e=list(e="a",d="a",c="a"))),
sapply(l, sapply, sapply, nrow) == rep(1L, 12),
sapply(l, sapply, sapply, ncol) == rep(4L, 12)
))
# - [x] allow keeping or dropping the columns on which we split - `keep.by` argument ----
l = split(dt, by = "x1", keep.by = FALSE)
test(1639.091, TRUE, all(is.list(l), identical(names(l), c("b","a")), sapply(l, is.data.table), sapply(l, nrow) == c(b=6L, a=6L), sapply(l, ncol) == c(b=3L, a=3L)))
l = split(dt, by = "x2", keep.by = FALSE)
test(1639.092, TRUE, all(is.list(l), identical(names(l), c("d","e","c")), sapply(l, is.data.table), sapply(l, nrow) == c(d=4L, e=4L, c=4L), sapply(l, ncol) == c(d=3L, e=3L, c=3L)))
l = split(dt, by = "x3", keep.by = FALSE)
test(1639.093, TRUE, all(is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, is.data.table), sapply(l, nrow) == c(h=3L, f=3L, g=3L, e=3L), sapply(l, ncol) == c(h=3L, f=3L, g=3L, e=3L)))
l = split(dt, by = c("x1","x2"), keep.by = FALSE, flatten=FALSE) # multi col
test(1639.094, TRUE, all(
is.list(l), identical(names(l), c("b","a")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(b=c("d","e","c"), a=c("e","d","c"))),
sapply(l, sapply, nrow) == rep(2L, 6),
sapply(l, sapply, ncol) == rep(2L, 6)
))
l = split(dt, by = c("x1","x3"), keep.by = FALSE, flatten=FALSE)
test(1639.095, TRUE, all(
is.list(l), identical(names(l), c("b","a")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(b=c("h","f"), a=c("g","e"))),
sapply(l, sapply, nrow) == rep(3L, 4),
sapply(l, sapply, ncol) == rep(2L, 4)
))
l = split(dt, by = c("x2","x3"), keep.by = FALSE, flatten=FALSE)
test(1639.096, TRUE, all(
is.list(l), identical(names(l), c("d","e","c")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(d=c("h","f","e","g"), e=c("h","f","g","e"), c=c("f","h","e","g"))),
sapply(l, sapply, nrow) == rep(1L, 12),
sapply(l, sapply, ncol) == rep(2L, 12)
))
l = split(dt, by = c("x1","x2","x3"), keep.by = FALSE, flatten=FALSE)
test(1639.097, TRUE, all(
is.list(l), identical(names(l), c("b","a")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, lapply, names), list(b=list(d=c("h","f"), e=c("h","f"), c=c("f","h")), a=list(e=c("g","e"), d=c("e","g"), c=c("e","g")))),
sapply(l, sapply, sapply, nrow) == rep(1L, 12),
sapply(l, sapply, sapply, ncol) == rep(1L, 12)
))
l = split(dt, by = c("x3","x1"), keep.by = FALSE, flatten=FALSE) # multi col rev
test(1639.098, TRUE, all(
is.list(l), identical(names(l), c("h","f","g","e")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, names), list(h=c("b"), f=c("b"), g=c("a"), e=c("a"))),
sapply(l, sapply, nrow) == rep(3L, 4),
sapply(l, sapply, ncol) == rep(2L, 4)
))
l = split(dt, by = c("x3","x2","x1"), keep.by = FALSE, flatten=FALSE)
test(1639.099, TRUE, all(
is.list(l), identical(names(l), c("h","f","g","e")),
sapply(l, function(x) !is.data.table(x) && is.list(x)),
sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)),
identical(lapply(l, lapply, names), list(h=list(d="b", e="b", c="b"), f=list(e="b", c="b", d="b"), g=list(e="a", d="a", c="a"), e=list(e="a",d="a",c="a"))),
sapply(l, sapply, sapply, nrow) == rep(1L, 12),
sapply(l, sapply, sapply, ncol) == rep(1L, 12)
))
# - [x] support recursive split into nested lists for `length(by) > 1L` when `flatten=FALSE`; the default `flatten=TRUE` produces a non-nested list of data.table ----
fdt = dt[, c(lapply(.SD, as.factor), list(y=y)), .SDcols=x1:x3] # factors, flatten consistent to non-flatten length(by)==1L
test(1639.100, split(fdt, by = "x1"), split(fdt, by = "x1", flatten = FALSE)) # length(by) == 1L should be same as flatten=FALSE # ref data already checked in above test
test(1639.101, split(fdt, by = "x2"), split(fdt, by = "x2", flatten = FALSE))
test(1639.102, split(fdt, by = "x3"), split(fdt, by = "x3", flatten = FALSE))
test(1639.103, split(fdt, by = "x1", sorted = TRUE), split(fdt, by = "x1", flatten = FALSE, sorted = TRUE))
test(1639.104, split(fdt, by = "x3", sorted = TRUE), split(fdt, by = "x3", flatten = FALSE, sorted = TRUE))
test(1639.105, split(fdt, by = "x1", sorted = TRUE, drop = TRUE), split(fdt, by = "x1", flatten = FALSE, sorted = TRUE, drop = TRUE))
test(1639.106, split(fdt, by = "x1", sorted = TRUE, keep.by = FALSE), split(fdt, by = "x1", flatten = FALSE, sorted = TRUE, keep.by = FALSE))
test(1639.107, unlist(split(fdt, by = c("x1","x2"), sorted = TRUE, flatten = FALSE), recursive = FALSE), split(fdt, by = c("x1","x2"), sorted = TRUE)) # by two variables - match after unlist nested one # sorted=TRUE
test(1639.108, unlist(split(fdt, by = c("x1","x2"), sorted = FALSE, flatten = FALSE), recursive = FALSE), split(fdt, by = c("x1","x2"), sorted = FALSE)) # sorted=FALSE
test(1639.109, unlist(split(fdt, by = c("x1","x2"), sorted = TRUE, keep.by = FALSE, flatten = FALSE), recursive = FALSE), split(fdt, by = c("x1","x2"), sorted = TRUE, keep.by = FALSE)) # drop.by=TRUE
sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2)) # vs split.data.frame by 2L # this will dispatch to `interaction(x1, x2)`, which results in a different order, see: levels(interaction(1:2,1:2)) vs CJ(1:2,1:2)
test(1639.110, split(fdt, by = c("x1","x2"), sorted = TRUE), lapply(sdf[sort(names(sdf))], setDT))# vs split.data.frame by 2L drop=FALSE
test(1639.111, unlist(split(fdt, by = c("x1","x2"), flatten = FALSE, sorted = TRUE), recursive = FALSE), lapply(sdf[sort(names(sdf))], setDT))# vs split.data.frame by 2L drop=FALSE, flatten=FALSE + unlist
sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2), drop=TRUE)
test(1639.112, split(fdt, by = c("x1","x2"), sorted = TRUE, drop=TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=TRUE
sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2, fdt$x3)) # vs split.data.frame by 3L
test(1639.113, split(fdt, by = c("x1","x2","x3"), flatten = TRUE, sorted = TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 3L drop=FALSE
sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2, fdt$x3), drop=TRUE)
test(1639.114, split(fdt, by = c("x1","x2","x3"), flatten = TRUE, sorted = TRUE, drop=TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 3L drop=TRUE
fdt = dt[, .(x1 = as.factor(c(as.character(x1), "c"))[-13L], # empty levels in factors
x2 = as.factor(c("a", as.character(x2)))[-1L],
x3 = as.factor(c("a", as.character(x3), "z"))[c(-1L,-14L)],
y = y)]
sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2)) # vs split.data.frame by 2L # this will dispatch to `interaction(x1, x2)`, which results in a different order, see: levels(interaction(1:2,1:2)) vs CJ(1:2,1:2)
test(1639.115, split(fdt, by = c("x1","x2"), sorted = TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=FALSE
test(1639.116, unlist(split(fdt, by = c("x1","x2"), flatten = FALSE, sorted = TRUE), recursive = FALSE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=FALSE, flatten=FALSE + unlist
sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2), drop=TRUE)
test(1639.117, split(fdt, by = c("x1","x2"), sorted = TRUE, drop=TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=TRUE
sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2, fdt$x3)) # vs split.data.frame by 3L
test(1639.118, split(fdt, by = c("x1","x2","x3"), flatten = TRUE, sorted = TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 3L drop=FALSE
sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2, fdt$x3), drop=TRUE)
test(1639.119, split(fdt, by = c("x1","x2","x3"), flatten = TRUE, sorted = TRUE, drop=TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 3L drop=TRUE
sdf = split(as.data.frame(fdt[, .SD, .SDcols=c("x3","y")]), f=list(fdt$x1, fdt$x2)) # flatten, keep.by=FALSE and empty lists # this will dispatch to `interaction(x1, x2)`, which results in a different order, see: levels(interaction(1:2,1:2)) vs CJ(1:2,1:2)
test(1639.120, split(fdt, by = c("x1","x2"), sorted = TRUE, keep.by = FALSE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=FALSE
test(1639.121, unlist(split(fdt, by = c("x1","x2"), flatten = FALSE, sorted = TRUE, keep.by = FALSE), recursive = FALSE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=FALSE, flatten=FALSE + unlist
sdf = split(as.data.frame(fdt[, .SD, .SDcols=c("x3","y")]), f=list(fdt$x1, fdt$x2), drop=TRUE)
test(1639.122, split(fdt, by = c("x1","x2"), sorted = TRUE, drop=TRUE, keep.by = FALSE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=TRUE
# - [x] edge cases for `by` and `sorted`, 0 rows, 1 unique value in cols, drop ----
test(1639.123, length(split(dt[0L], by = "x1")), 0L) # drop=FALSE vs split.data.frame: expanding the list for empty levels won't work on character columns; use a factor with defined levels, including those unused.
test(1639.124, length(split(as.data.frame(dt[0L]), df$x1)), 2L) # unlike data.frame because character != factor
fdt = dt[, c(lapply(.SD, as.factor), list(y=y)), .SDcols=x1:x3] # factors no empty levels
test(1639.125, length(split(fdt[0L], by = "x1")), 2L)
test(1639.126, length(split(as.data.frame(fdt[0L]), df$x1)), 2L) # matching on factors works
test(1639.127, split(fdt[0L], by = "x1"), lapply(split(as.data.frame(fdt[0L]), df$x1), setDT)) # we also match on the complete structure
fdt = dt[, .(x1 = as.factor(c(as.character(x1), "c"))[-13L], # factors empty levels
x2 = as.factor(c("a", as.character(x2)))[-1L],
x3 = as.factor(c("a", as.character(x3), "z"))[c(-1L,-14L)],
y = y)]
sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2)) # vs split.data.frame by 2L # this will dispatch to `interaction(x1, x2)` which results in a different order, see: levels(interaction(1:2,1:2)) vs CJ(1:2,1:2)
test(1639.128, split(fdt, by = c("x1","x2"), sorted = TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=FALSE
sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2), drop=TRUE)
test(1639.129, split(fdt, by = c("x1","x2"), sorted = TRUE, drop=TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=TRUE
test(1639.130, split(dt[0L], by = "x1"), structure(list(), .Names = character(0))) # 0 nrow: the character column has no empty levels to expand, so the result is an empty named list
test(1639.131, split(fdt[0L], by = "x1"), lapply(c(a=1L,b=2L,c=3L), function(i) data.table(x1=factor(levels = c("a","b","c")),x2=factor(levels = c("a","c","d","e")),x3=factor(levels = c("a","e","f","g","h","z")),y=numeric()))) # expand empty levels
test(1639.132, split(dt[0L], by = "x1", sorted = TRUE), structure(list(), .Names = character(0)))
test(1639.133, split(fdt[0L], by = "x1", sorted = TRUE), lapply(c(a=1L,b=2L,c=3L), function(i) data.table(x1=factor(levels = c("a","b","c")),x2=factor(levels = c("a","c","d","e")),x3=factor(levels = c("a","e","f","g","h","z")),y=numeric()))) # same as none sorted as all appended on the end in sorted order due to lack of data
dt2 = copy(dt)[, "l" := lapply(1:12, function(i) i)] # non-atomic type to 'by' should raise error
test(1639.134, split(dt2, by = "l"), error = "Argument 'by' must refer only to atomic-type columns, but the following columns are non-atomic: [l]")
# - [x] additional tests for names consistency with data.frame, and current examples in SO
df = data.frame(product = c("b", "a", "b", "a"),
value = c(sample(1:10,4)),
year = c(2001, 2001, 2000, 2000))
tmp = as.data.table(df)[, list(grp=list(.SD)), by=.(product, year), .SDcols=names(df)] # http://stackoverflow.com/a/33068928/2490497
setattr(ans <- tmp$grp, 'names', paste(tmp$product, tmp$year, sep="."))
dt = as.data.table(df) # http://stackoverflow.com/q/33068791/2490497
dt[, grp := .GRP, by = list(product,year)]
setkey(dt, grp)
o2 = dt[, list(list(.SD)), by = grp]$V1
setattr(o2, 'names', paste(tmp$product, tmp$year, sep=".")) # names reused
test(1639.135, o2, ans)
lapply(ans, setattr, ".data.table.locked", NULL)
sort.by.names = function(x) x[sort(names(x))]
test(1639.136, sort.by.names(ans), sort.by.names(split(as.data.table(df), f=list(df$product, df$year))))
test(1639.137, sort.by.names(ans), sort.by.names(unlist(split(setDT(df), by=c("product","year"), flatten = FALSE), recursive = FALSE)))
test(1639.138, ans, split(as.data.table(df), by=c("product","year")))
test(1639.139, sort.by.names(ans), sort.by.names(unlist(split(as.data.table(df), by=c("product","year"), flatten=FALSE), recursive = FALSE)))
# test if split preallocate columns in results #1908
dt = data.table(x=rexp(100),y=rep(LETTERS[1:10], 10))
dtL = split(dt, by = "y")
test(1639.140, dim(dtL[[1]][, x2 := -x]), c(10L,3L))
test(1639.141, all(sapply(dtL, truelength) > 1000))
# test if split does not have scoping issues if one of the variables is 'x' #3151
dt <- data.table(x = factor("a"), y = 1)
test(1639.142, x = split(dt, by = "x"), y = list(a = dt))
test(1639.143, x = split(dt, by = "y"), y = list(`1` = dt))
# allow x's cols (specifically x's join cols) to be referred to using 'x.' syntax
# patch for #1615. Note that I specifically have not implemented x[y, aa, on=c(aa="bb")]
# to refer to x's join column as well because x[i, col] == x[i][, col] will not be TRUE anymore..
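# A minimal commented sketch of the 'x.' prefix semantics exercised below (illustration only, not a
# numbered test; uses the same x and y defined just after this comment):
#   x[y, aa,   on=c(aa="bb")]   # plain 'aa' returns the join values, i.e. y's 'bb' (3,4,5)
#   x[y, x.aa, on=c(aa="bb")]   # 'x.aa' returns x's own 'aa', NA where y's 'bb' finds no match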
x <- data.table(aa = 1:3, cc = letters[1:3])
y <- data.table(bb = 3:5, dd = 3:1)
test(1640.1, x[y, x.aa, on=c(aa="bb")], INT(3,NA,NA))
test(1640.2, x[y, c(.SD, .(x.aa=x.aa)), on=c(aa="bb")], data.table(aa=3:5, cc=c("c", NA,NA), x.aa=INT(3,NA,NA)))
# tests for non-equi joins
# function to create a random data.table with all necessary columns
nq_fun = function(n=100L) {
i1 = sample(sample(n, 10L), n, TRUE)
i2 = sample(-n/2:n/2, n, TRUE)
i3 = sample(-1e6:1e6, n, TRUE)
i4 = sample(c(NA_integer_, sample(-n:n, 10L, FALSE)), n, TRUE)
d1 = sample(rnorm(10L), n, TRUE)
d2 = sample(rnorm(50), n, TRUE)
d3 = sample(c(Inf, -Inf, NA, NaN, runif(10L)), n, TRUE)
d4 = sample(c(NA, NaN, rnorm(10L)), n, TRUE)
c1 = sample(letters[1:5], n, TRUE)
c2 = sample(LETTERS[1:15], n, TRUE)
dt = data.table(i1,i2,i3,i4, d1,d2,d3,d4, c1,c2)
if (test_bit64) {
I1 = as.integer64(sample(sample(n, 10L), n, TRUE))
I2 = as.integer64(sample(-n/2:n/2, n, TRUE))
I3 = as.integer64(sample(-1e6:1e6, n, TRUE))
I4 = as.integer64(sample(c(NA_integer_, sample(-n:n, 10L, FALSE)), n, TRUE))
dt = cbind(dt, data.table(I1,I2,I3,I4))
}
dt
}
nqjoin_test <- function(x, y, k=1L, test_no, mult="all") {
ops = c("==", ">=", "<=", ">", "<")
xclass = sapply(x, class)
runcmb = combn(names(x), k)
runcmb = as.data.table(runcmb[, 1:min(100L, ncol(runcmb)), drop=FALSE]) # max 100 combinations to test
runops = lapply(runcmb, function(cols) {
thisops = sample(ops, k, TRUE)
thisops[substring(cols,1,1)=="c"] = "=="
thisops
})
is_only_na <- function(x) is.na(x) & !is.nan(x)
construct <- function(cols, vals, ops) {
expr = lapply(seq_along(cols), function(i) {
GT_or_LT = ops[i]==">" || ops[i]=="<"
if (inherits(vals[[i]], "integer64")) {
if (is.na.integer64(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is.na.integer64), as.name(cols[[i]])))
else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), as.integer(vals[[i]])))
# don't know how to construct a call with int64 -- vals[[i]] gets converted to NAN
} else {
if (is.nan(vals[[i]])) if (GT_or_LT) quote(logical(0)) else as.call(list(quote(is.nan), as.name(cols[[i]])))
else if (is_only_na(vals[[i]])) if (GT_or_LT) quote(logical()) else as.call(list(quote(is_only_na), as.name(cols[[i]])))
else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), vals[[i]]))
}
})
Reduce(function(x,y)call("&",x,y), expr)
}
check <- function(x, y, cols, ops, mult="all") {
# gather just row numbers here and then select all rows once afterwards, rather than rbindlist
rowNums = unlist(lapply(1:nrow(y), function(i) {
e = construct(cols, y[i, ..cols], ops)
rowNums = which(with(x, eval(e))) # raw expression, isolated from both [.data.table overhead and subset optimization
if (!length(rowNums) || mult=="all")
rowNums
else if (mult=="first")
rowNums[1L]
else # mult=="last"
rowNums[length(rowNums)]
}))
x[rowNums]
}
nq <- function(x, y, cols, ops, nomatch=0L, mult="all") {
sd_cols = c(paste0("x.", cols), setdiff(names(x), cols))
ans = x[y, mget(sd_cols, as.environment(-1)), on = paste0(cols, ops, cols), allow.cartesian=TRUE, nomatch=nomatch, mult=mult]
setnames(ans, gsub("^x[.]", "", names(ans)))
setcolorder(ans, names(x))[]
}
for (i in seq_along(runcmb)) {
thiscols = runcmb[[i]]
thisops = runops[[i]]
# cat("k = ", k, "\ti = ", i, "\t thiscols = [", paste0(thiscols,collapse=","), "]\t thisops = [", paste0(thisops,collapse=","), "]\t ", sep="")
ans1 = nq(x, y, thiscols, thisops, 0L, mult=mult)
ans2 = check(x, y, thiscols, thisops, mult=mult)
test_no = test_no + .001
test(test_no, ans1, ans2)
}
gc() # no longer needed but left in place just in case, no harm
}
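# Commented illustration of the pattern nqjoin_test() exercises (toy data assumed for illustration
# only, not a numbered test): a non-equi join via on= versus the brute-force subset that check() mirrors.
#   X = data.table(a=1:5, v=letters[1:5])
#   Y = data.table(lo=2L, hi=4L)
#   X[Y, on=.(a>=lo, a<=hi)]   # non-equi join: matches rows of X whose 'a' falls within [lo, hi]
#   X[a>=2L & a<=4L]           # equivalent raw logical subset, as built by construct()/check()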
dt1 = nq_fun(400L)
dt2 = nq_fun(50L)
x = na.omit(dt1)
y = na.omit(dt2)
if (.Machine$sizeof.pointer>4) {
# temporarily off due to hitting 2GB limit on 32bit, #2767
# turn off temporarily using FALSE when using valgrind, too, as very slow
set.seed(1509611616L)
# this fixed seed is to test branch bmerge.c:433 for consistent coverage, issue #2346
# 2nd pass with random seed too removed as taking too long, contributing towards win-builder/cran limits
# without NAs in x and i
nqjoin_test(x, y, 1L, 1641, mult="all")
nqjoin_test(x, y, 2L, 1642, mult="all")
nqjoin_test(x, y, 1L, 1643, mult="first")
nqjoin_test(x, y, 2L, 1644, mult="first")
nqjoin_test(x, y, 1L, 1645, mult="last")
nqjoin_test(x, y, 2L, 1646, mult="last")
# with NAs in x and i
nqjoin_test(dt1, dt2, 1L, 1647, mult="all")
nqjoin_test(dt1, dt2, 2L, 1648, mult="all")
nqjoin_test(dt1, dt2, 1L, 1649, mult="first")
nqjoin_test(dt1, dt2, 2L, 1650, mult="first")
nqjoin_test(dt1, dt2, 1L, 1651, mult="last")
nqjoin_test(dt1, dt2, 2L, 1652, mult="last")
}
# tested, but takes quite some time, so commenting out for now
# nqjoin_test(x, y, 3L,1643.0)
# nqjoin_test(dt1,dt2,3L,1652.0)
# nqjoin_test( x,dt2,1L,1644.0) # without NA only in x
# nqjoin_test( x,dt2,2L,1645.0)
# nqjoin_test( x,dt2,3L,1646.0)
# nqjoin_test(dt1, y,1L,1647.0) # without NA only in i
# nqjoin_test(dt1, y,2L,1648.0)
# nqjoin_test(dt1, y,3L,1649.0)
# test for the issues Jan spotted...
dt = data.table(id="x", a=as.integer(c(3,8,8,15,15,15,16,22,22,25,25)), b=as.integer(c(9,10,25,19,22,25,38,3,9,7,28)), c=as.integer(c(22,33,44,14,49,44,40,25,400,52,77)))
set.seed(1L)
dt=dt[sample(.N)]
test(1653.1, uniqueN(dt[dt, .(x.id, x.a, x.b, x.c, i.id, i.a, i.b, i.c), which=FALSE, on = c("id==id","a>=a","b>=b"), allow.cartesian=TRUE]), 42L)
test(1653.2, x[y, .(x.i1, x.i2, x.i3, x.i4, x.d1, x.d2, x.d3, x.d4, x.c1, x.c2, i.i1, i.i2, i.i3, i.i4, i.d1, i.d2, i.d3, i.d4, i.c1, i.c2), on = c("i4==i4", "i1>=i1", "d4<=d4", "i3==i3", "d3>d3", "i2>i2", "d2>=d2", "d1>d1"), allow.cartesian = TRUE], x[y, .(x.i1, x.i2, x.i3, x.i4, x.d1, x.d2, x.d3, x.d4, x.c1, x.c2, i.i1, i.i2, i.i3, i.i4, i.d1, i.d2, i.d3, i.d4, i.c1, i.c2), on = c("i4==i4", "i1>=i1", "d4<=d4", "i3==i3", "d3>d3", "i2>i2", "d2>=d2", "d1>d1"), allow.cartesian = TRUE]) # ensuring there are no warnings here really..
# error on any op other than "==" on char type
dt1 = data.table(x=sample(letters[1:2], 10, TRUE), y=sample(c(1L,5L,7L), 10, TRUE), z=1:10, k=11:20)
dt2 = data.table(x=c("b", "a"), y=c(1L, 9L))
test(1654, dt1[dt2, on="x>x"], error="Only '==' operator")
# on= with .() syntax, #1257
dt1 = data.table(x=sample(letters[1:2], 10, TRUE), y=sample(c(1L,5L,7L), 10, TRUE), z=1:10, k=11:20)
dt2 = data.table(x=c("b", "a"), y=c(1L, 9L))
test(1655.1, dt1[dt2, on=.(x)], dt1[dt2, on="x"])
test(1655.2, dt1[dt2, on=.(x==x)], dt1[dt2, on=c("x==x")])
test(1655.3, dt1[dt2, on=.(x==x)], dt1[dt2, on=c("x"="x")])
test(1655.4, dt1[dt2, on=.(y>=y)], dt1[dt2, on=c("y>=y")])
test(1655.5, dt1[dt2, on=.(x==x, y>=y)], dt1[dt2, on=c("x==x", "y>=y")])
# Patching another issue spotted by Jan
dt = data.table(id="x", a=as.integer(c(3,8,8,15,15,15,16,22,22,25,25)),
b=as.integer(c(9,10,25,19,22,25,38,3,9,7,28)),
c=as.integer(c(22,33,44,14,49,44,40,25,400,52,77)))
set.seed(1L)
dt=dt[sample(.N)][, row_id := 1:.N]
test(1656, nrow(dt[dt, .(x.id, x.a, x.b, x.c, x.row_id, i.id, i.a, i.b, i.c, i.row_id), on = .(c,b<=b,id,a>=a), allow.cartesian = TRUE]), 12L) # just to check that there's no warning
# between is vectorised, #534
set.seed(1L)
dt = data.table(x=sample(3,10,TRUE), y=sample(2,10,TRUE), z=1L+sample(2,10,TRUE))
test(1657, dt[x %between% list(y,z)], dt[x>=y & x<=z])
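# Commented sketch of the equivalent functional form (illustration only, not a numbered test):
#   between(dt$x, dt$y, dt$z)                   # same as dt$x >= dt$y & dt$x <= dt$z
#   between(dt$x, dt$y, dt$z, incbounds=FALSE)  # strict bounds: dt$x > dt$y & dt$x < dt$z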
## fwrite tests
oldverbose = options(datatable.verbose=FALSE) # using output= for these tests so turn off verbose mode
# without quoting
test(1658.01, fwrite(data.table(a=c(NA, 2, 3.01), b=c('foo', NA, 'bar'))),
output=c("a,b", ",foo", "2,", "3.01,bar"))
# with quoting and qmethod="escape"
test(1658.02, fwrite(data.table(
a=c(NA, 2, 3.01),
`other column`=c('foo bar', NA, 'quote" and \\ bs \n and newline')),
quote=TRUE, qmethod="escape"),
output='"a","other column"\n,"foo bar"\n2,\n3.01,"quote\\" and \\\\ bs \n and newline"')
# with quoting and qmethod="double" (default)
test(1658.03, fwrite(data.table(
a=c(NA, 1.2e-100, 3.01),
"other \"column"=c('foo bar', NA, 'quote" and \\ bs')),
quote=TRUE, qmethod="double"),
output='"a","other ""column"\n,"foo bar"\n1.2e-100,\n3.01,"quote"" and \\ bs"')
# presence of " triggers auto quoting as well, #1925
test(1658.04, fwrite(data.table(a=1:4, b=c('"foo','ba"r','baz"','a "quoted" region'))),
output='a,b\n1,"""foo"\n2,"ba""r"\n3,"baz"""\n4,"a ""quoted"" region"')
test(1658.05, fwrite(data.table(a=1:4, b=c('"foo','ba"r','baz"','a "quoted" region')), qmethod='escape'),
output='a,b\n1,"\\"foo"\n2,"ba\\"r"\n3,"baz\\""\n4,"a \\"quoted\\" region"')
# NB: sep2[2] triggering quoting when list columns are present is tested in test 1736
# changing sep
DT = data.table(a="foo", b="ba\"r")
ans = '"a";"b"\n"foo";"ba""r"'
test(1658.06, fwrite(DT, sep=";", quote=TRUE, qmethod="double"), output=ans)
test(1658.07, write.table(DT, sep=";", qmethod="double", row.names=FALSE), output=ans)
ans = '"a";"b"\n"foo";"ba\\"r"'
test(1658.08, fwrite(DT, sep=";", quote=TRUE, qmethod="escape"), output=ans)
test(1658.09, write.table(DT, sep=";", qmethod="escape", row.names=FALSE), output=ans)
if (.Platform$OS.type=="unix") {
# on Linux we can create Windows-format files if we want
test(1658.10, fwrite(data.table(a="foo", b="bar"), eol="\r\n", quote=TRUE),
output = '"a","b"\n"foo","bar"')
}
# changing NA
test(1658.11, fwrite(data.table(a=c("foo", NA), b=c(1, NA)), na="NA", quote=TRUE),
output='"a","b"\n"foo",1\nNA,NA')
# no col.names
test(1658.12, fwrite(data.table(a="foo", b="bar"), col.names=FALSE, quote=TRUE),
output='"foo","bar"')
test(1658.13, fwrite(data.table(a=c(1:5), b=c(1:5)), quote=TRUE),
output='"a","b"\n1,1\n2,2\n3,3\n4,4\n5,5')
# block size equal to number of rows
test(1658.14, fwrite(data.table(a=c(1:3), b=c(1:3)), quote=TRUE),
output='"a","b"\n1,1\n2,2\n3,3')
# block size one bigger than number of rows
test(1658.15, fwrite(data.table(a=c(1:3), b=c(1:3)), quote=TRUE),
output='"a","b"\n1,1\n2,2\n3,3')
# block size one less than number of rows
test(1658.16, fwrite(data.table(a=c(1:3), b=c(1:3)), quote=TRUE),
output='"a","b"\n1,1\n2,2\n3,3')
# writing a data.frame
test(1658.17, fwrite(data.frame(a="foo", b="bar"), quote=TRUE),
output='"a","b"\n"foo","bar"')
# single-column data.table
test(1658.18, fwrite(data.table(a=c(1,2,3)), quote=TRUE),
output='"a"\n1\n2\n3')
# single-column data.frame
test(1658.19, fwrite(data.frame(a=c(1,2,3)), quote=TRUE),
output='"a"\n1\n2\n3')
# different column types
test(1658.20, fwrite(data.table(
factor1=as.factor(c('foo', 'bar')),
factor2=as.factor(c(NA, "baz")),
bool=c(TRUE,NA),
ints=as.integer(c(NA, 5))), na='na', quote=TRUE, logical01=FALSE),
output='"factor1","factor2","bool","ints"\n"foo",na,TRUE,na\n"bar","baz",na,5')
# empty data table (headers but no rows)
empty_dt <- data.table(a=1, b=2)[0,]
test(1658.21, fwrite(empty_dt, quote=TRUE), output='"a","b"')
# data.table with duplicate column names
test(1658.22, fwrite(data.table(a=1, a=2), quote=TRUE), output='"a","a"\n1,2')
# number of significant digits = 15
test(1658.23, fwrite(data.table(a=1/0.9), quote=TRUE), output='"a"\n1.11111111111111')
# test append
f = tempfile()
fwrite(data.table(a=c(1,2), b=c('a', 'b')), f, quote=TRUE)
fwrite(data.table(a=c(3,4), b=c('c', 'd')), f, append=TRUE, quote=TRUE)
test(1658.24, readLines(f), c('"a","b"','1,"a"','2,"b"','3,"c"','4,"d"'))
unlink(f)
# simple data table (reference for the error cases below)
ok_dt <- data.table(foo="bar")
test(1658.25, fwrite(ok_dt, quote=TRUE), output='"foo"\n"bar"')
# integer NA
DT = data.table(A=c(2L,NA,3L), B=c(NA,4:5))
test(1658.26, fwrite(DT), output='A,B\n2,\n,4\n3,5')
test(1658.27, fwrite(DT, na="NA", verbose=TRUE), output='Writing bom .false., yaml .0 characters. and column names .true.*"A","B".*2,NA\nNA,4\n3,5')
# wrong argument types
test(1658.28, fwrite(ok_dt, 1), error="is.character\\(file\\).*not TRUE")
test(1658.29, fwrite(ok_dt, quote=123), error="identical\\(quote.*auto.*FALSE.*TRUE")
test(1658.30, fwrite(ok_dt, sep="..."), error="nchar(sep)")
test(1658.31, fwrite(ok_dt, qmethod=c("double", "double")), error="length(qmethod)")
test(1658.32, fwrite(ok_dt, col.names="foobar"), error="isTRUEorFALSE(col.names)")
# null data table (no columns)
test(1658.33, fwrite(data.table(NULL)), NULL, warning="Nothing to write")
test(1658.34, fwrite(data.table(id=c("A","B","C"), v=c(1.1,0.0,9.9))), output="id,v\nA,1.1\nB,0\nC,9.9")
# logical NA as "NA" when logical01=TRUE, instead of the default na="" which writes all types including <NA> in character column as ,, consistently.
test(1658.35, fwrite(data.table(id=1:3,bool=c(TRUE,NA,FALSE)),na="NA",logical01=TRUE), output="\"id\",\"bool\"\n1,1\n2,NA\n3,0")
# POSIXct
test(1658.36, fwrite(data.table(D = as.POSIXct(seq.Date(as.Date("2038-01-19"), as.Date("2038-01-20"), by = "day")))),
output="D\n2038-01-19T00:00:00Z\n2038-01-20T00:00:00Z")
# input is of class matrix
test(1658.37, fwrite(matrix("foo"), quote=TRUE), output='"V1"\n.*"foo"', message = "x being coerced from class: matrix to data.table")
# ^^ this is to pass testing with verbose=TRUE; there is some verbose output between column names and the data
test(1658.38, fwrite(matrix(1:4, nrow=2, ncol=2), quote = TRUE), output = '"V1","V2"\n.*1,3\n2,4', message = "x being coerced from class: matrix to data.table")
test(1658.39, fwrite(matrix(1:3, nrow=3, ncol=1), quote = TRUE), output = '"V1"\n.*1\n2\n3', message = "x being coerced from class: matrix to data.table")
test(1658.40, fwrite(matrix(1:4, nrow=2, ncol=2, dimnames = list(c("ra","rb"),c("ca","cb"))), quote = TRUE), output = '"ca","cb"\n.*1,3\n2,4', message = "x being coerced from class: matrix to data.table")
# fwrite compress
test(1658.41, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), output='a,b\n1,1\n2,2\n3,3') # compress ignored on console
DT = data.table(a=rep(1:2,each=100), b=rep(1:4,each=25))
test(1658.421, fwrite(DT, file=f1<-tempfile(fileext=".gz"), verbose=TRUE), NULL,
output="args.nrow=200 args.ncol=2.*maxLineLen=5[12].*Writing 200 rows in 1 batches of 200 rows.*nth=1") # [12] for Windows where eolLen==2
test(1658.422, fwrite(DT, file=f2<-tempfile()), NULL)
test(1658.423, file.info(f1)$size < file.info(f2)$size) # 74 < 804 (file.size() isn't available in R 3.1.0)
if (test_R.utils) test(1658.43, fread(f1), DT) # use fread to decompress gz (works cross-platform)
fwrite(DT, file=f3<-tempfile(), compress="gzip") # compress to filename not ending .gz
test(1658.44, file.info(f3)$size, file.info(f1)$size)
unlink(c(f1,f2,f3))
DT = data.table(a=1:3, b=list(1:4, c(3.14, 100e10), c("foo", "bar", "baz")))
test(1658.45, fwrite(DT), output=c("a,b","1,1|2|3|4","2,3.14|1e+12","3,foo|bar|baz"))
DT[3,b:=as.raw(0:2)]
test(1658.46, fwrite(DT), error="Row 3 of list column is type 'raw'")
DT[3,b:=factor(letters[1:3])]
test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'")
# fwrite bom
DT = data.table(l=letters, n=1:26)
fwrite(DT, f1<-tempfile(), bom=TRUE)
f1con = file(f1, encoding="UTF-8") # Windows readLines needs to be told the encoding, otherwise it thinks n_lines==1
test(1658.48, length(readLines(f1con)), 27L)
test(1658.49, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e)))
close(f1con)
fwrite(DT, f2<-tempfile(), bom=FALSE)
test(1658.50, readBin(f2, raw(), 3L), as.raw(c(0x6c, 0x2c, 0x6e)))
# re-write to the same file should overwrite.
# Windows seems to cache the connection to f2 and fails on a subsequent read, hence using file(,encoding="UTF-8")
fwrite(DT, f2, bom=TRUE)
f2con = file(f2, encoding="UTF-8")
test(1658.51, length(readLines(f2con)), 27L)
close(f2con)
test(1658.52, file.info(f1)$size, file.info(f2)$size)
unlink(c(f1, f2))
# compression error -5 due to only 3 bytes (bom) in first block; #3599
DT = data.table(l=letters, n=1:26)
test(1658.53, fwrite(DT, file=f<-tempfile(fileext=".gz"), bom=TRUE, col.names=FALSE), NULL)
if (test_R.utils) test(1658.54, fread(f), setnames(DT,c("V1","V2")))
unlink(f)
# complex column support for fwrite, part of #3690
DT = data.table(a=1:3, z=0:2 - (2:0)*1i)
test(1658.55, fwrite(DT), output='a,z\n1,0-2i\n2,1-1i\n3,2+0i')
test(1658.56, fwrite(data.table(exp(1) - pi*1i)), output='2.718[0-9]*-3.141[0-9]*i')
## formerly 1658.46
DT = data.table(a=1:3, b=list(1:4, c(3.14, 100e10), c(3i,4i,5i)))
test(1658.57, fwrite(DT), output='0+3i|0+4i|0+5i')
DT[ , b := c(1i, -1-1i, NA)]
test(1658.58, fwrite(DT), output='a,b\n1,0\\+1i\n2,-1-1i\n3,$')
# more coverage
test(1658.59, fwrite(data.table(a=list('a')), verbose=TRUE),
output='fields will be quoted if the field contains either sep.*sep2.*list column')
test(1658.60, fwrite(data.table(r=as.raw(0))), error = "'raw' - not yet implemented")
options(oldverbose)
## End fwrite tests
# tests for #679, inrange(), FR #707
dt = data.table(a=c(8,3,10,7,-10), val=runif(5))
range = data.table(start = 1:5, end = 6:10)
test(1659.1, dt[a %inrange% range], dt[1:4])
test(1659.2, dt[inrange(a, range$start, range$end)], dt[1:4])
test(1659.3, dt[inrange(a, range$start, range$end, incbounds=FALSE)], dt[c(1,2,4)])
range[4, `:=`(start=-12L, end=-4L)]
test(1659.4, dt[a %inrange% range], dt)
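# Commented note on %inrange% semantics (illustration only, not a numbered test): the RHS supplies
# vectors of lower and upper bounds, and the result is TRUE when the value falls inside any range.
#   7  %inrange% list(c(1, 20), c(10, 30))   # TRUE : 7 is inside [1,10]
#   15 %inrange% list(c(1, 20), c(10, 30))   # FALSE: 15 is in neither [1,10] nor [20,30]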
# tests for non-equi joins returning columns correctly when j is missing
dt1 = fread('Chr Start End Region
chr6 3324 3360 Region1
chr4 2445 2455 Region2
chr1 1034 1090 Region4')
dt2 = fread('Site Chr Location Gene
Site1 chr4 2447 GeneB
Site2 chr9 1153 GeneT
Site3 chr6 3350 GeneM
Site4 chr1 1034 GeneC
Site5 chr1 2000 GeneU
Site6 chr6 3359 GeneF
Site7 chr7 1158 GeneI
Site8 chr4 2451 GeneO
Site9 chr6 3367 GeneZ ')
test(1660.1, names(dt2[dt1, on=.(Chr, Location>=Start, Location<=End)]), c(names(dt2), "Location.1", "Region"))
test(1660.2, names(dt1[dt2, on=.(Chr, Start<=Location, End>=Location)]), c(names(dt1), "Site", "Gene"))
# `names<-` should NOT modify by reference #1015
DT = data.table(x=1, y=2)
nn = names(DT)
test(1661.1, {names(DT) <- c("k", "m"); nn}, c("x","y"))
test(1661.2, names(DT), c("k","m"))
# rbindlist support for complex type
dt1 = data.table(x=1L, y=2+3i)
dt2 = data.table(x=0:101, y=3+sample(102)*1i)
test(1665.1, rbindlist(list(dt1,dt2)), setDT(rbind(as.data.frame(dt1), as.data.frame(dt2))))
# print method now works (when rows > 100 it uses rbind/rbindlist internally)
test(1665.2, ans <- capture.output(dt2), ans) # just checking that it doesn't error, really.
# Use existing index even when auto index is disabled #1422
d = data.table(k=3:1) # subset - no index
options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE)
test(1666.01, d[k==1L, verbose=TRUE], d[3L], output="Creating new index 'k'")
d = data.table(k=3:1)
options("datatable.use.index"=TRUE, "datatable.auto.index"=FALSE)
test(1666.02, grep("Creating new index", capture.output(d[k==1L, verbose=TRUE])), integer(0)) # do not create index
d = data.table(k=3:1)
options("datatable.use.index"=FALSE, "datatable.auto.index"=FALSE)
test(1666.03, grep("Creating new index", capture.output(d[k==1L, verbose=TRUE])), integer(0))
d = data.table(k=3:1)
options("datatable.use.index"=FALSE, "datatable.auto.index"=TRUE)
test(1666.04, grep("Creating new index", capture.output(d[k==1L, verbose=TRUE])), integer(0))
d = data.table(k=3:1) # subset - index
setindex(d, k)
options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE)
test(1666.05, d[k==1L, verbose=TRUE], d[3L], output="Optimized subsetting with index 'k'")
options("datatable.use.index"=TRUE, "datatable.auto.index"=FALSE)
test(1666.06, d[k==1L, verbose=TRUE], d[3L], output="Optimized subsetting with index 'k'")
options("datatable.use.index"=FALSE, "datatable.auto.index"=FALSE)
test(1666.07, grep("Using existing index", capture.output(d[k==1L, verbose=TRUE])), integer(0)) # not using existing index
options("datatable.use.index"=FALSE, "datatable.auto.index"=TRUE)
test(1666.08, grep("Using existing index", capture.output(d[k==1L, verbose=TRUE])), integer(0))
d1 = data.table(k=3:1) # join - no index
d2 = data.table(k=2:4)
options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE)
test(1666.09, d1[d2, on="k", verbose=TRUE], d1[d2, on="k"], output="ad hoc")
options("datatable.use.index"=TRUE, "datatable.auto.index"=FALSE)
test(1666.10, d1[d2, on="k", verbose=TRUE], d1[d2, on="k"], output="ad hoc")
options("datatable.use.index"=FALSE, "datatable.auto.index"=FALSE)
test(1666.11, grep("Looking for existing (secondary) index", capture.output(d1[d2, on="k", verbose=TRUE])), integer(0)) # not looking for index
options("datatable.use.index"=FALSE, "datatable.auto.index"=TRUE)
test(1666.12, grep("Looking for existing (secondary) index", capture.output(d1[d2, on="k", verbose=TRUE])), integer(0))
d1 = data.table(k=3:1,v1=10:12) # join - index
d2 = data.table(k=2:4,v2=20:22)
setindex(d1, k)
ans = data.table(k=2:4, v1=c(11L,10L,NA), v2=20:22)
options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE)
test(1666.13, d1[d2, on="k", verbose=TRUE], ans, output="existing index")
options("datatable.use.index"=TRUE, "datatable.auto.index"=FALSE)
test(1666.14, d1[d2, on="k", verbose=TRUE], ans, output="existing index")
options("datatable.use.index"=FALSE, "datatable.auto.index"=FALSE)
test(1666.15, d1[d2, on="k", verbose=TRUE], ans, output='ad hoc')
options("datatable.use.index"=FALSE, "datatable.auto.index"=TRUE)
test(1666.16, d1[d2, on="k", verbose=TRUE], ans, output='ad hoc')
# reset defaults
options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE)
#testing fix to #1654 (dcast should only error when _using_ duplicated names)
DT <- data.table(a = 1:4, a = 1:4, id = rep(1:4, 2), V1 = 8:1)
test(1667.1, dcast(DT, id ~ rowid(id), value.var = "V1"),
output = " id 1 2\n1: 1 8 4\n2: 2 7 3\n3: 3 6 2\n4: 4 5 1")
DT <- data.table(a = 1:4, id = 1:4, id = rep(1:4, 2), V1 = 8:1)
test(1667.2, dcast(DT, id ~ rowid(id), value.var = "V1"), error = "data.table to cast")
# fix for #1672
test(1668, chmatch(c("a","b"), c("a","c"), nomatch = integer()), c(1L, NA_integer_))
# fix for #1650, segfault in rolling joins resulting from fixing #1405.
x = data.table(Date = as.Date(c("2015-12-29", "2015-12-29", "2015-12-29", "2015-12-29", "2016-01-30", "2016-01-30",
"2016-01-30", "2016-01-30", "2016-02-29", "2016-02-29", "2016-02-29", "2016-02-29",
"2016-03-26", "2016-03-26", "2016-03-26", "2016-03-26")),
ID = c("A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D"),
Value = c("A201512", "B201512", "C201512", "D201512", "A201601", "B201601", "C201601", "D201601",
"A201602", "B201602", "C201602", "D201602", "A201603", "B201603", "C201603", "D201603"),
key = c('Date', 'ID'))
y = CJ(Date = as.Date(c("2015-12-31", "2016-01-31", "2016-02-29", "2016-03-31")), ID = unique(x$ID))
test(1669, x[y, on=c("ID", "Date"), roll=TRUE, which=TRUE], 1:16)
# 1680 fix, fread header encoding issue
x = "Stra\xdfe"
Encoding(x) = "latin1"
nm = names(fread(testDir("1680-fread-header-encoding.csv"), encoding="Latin-1"))
test(1670, nm[2], x)
# as.data.table must return a copy even if 'x' is a data.table
x = data.table(a=1, b=2)
test(1670.1, address(x) != address(as.data.table(x)), TRUE)
setattr(x, 'class', c('a', class(x)))
test(1670.2, class(as.data.table(x)), class(x)[2:3])
# #1676, `:=` with by shouldn't add cols of unsupported types
dt = data.table(x=1, y=2)
test(1671, dt[, z := sd, by=x], error="invalid type/length (closure/1)")
# 1683
DT <- data.table(V1 = rep(1:2, 3), V2 = 1:6)
test(1672.1, DT[ , .(.I[1L], V2[1L]), by = V1],
output = " V1 V1 V2\n1: 1 1 1\n2: 2 2 2")
#make sure GForce operating
test(1672.2, DT[ , .(.I[1L], V2[1L]), by = V1, verbose = TRUE],
output = "GForce optimized j")
#make sure GForce not operating for inversion
test(1672.3, DT[ , .(.I[-1L], V2[1L]), by = V1, verbose = TRUE],
output = "GForce FALSE")
#make sure works on .I by itself
test(1672.4, DT[ , .I[1L], by = V1],
output = " V1 V1\n1: 1 1\n2: 2 2")
#make sure GForce here as well
test(1672.5, DT[ , .I[1L], by = V1, verbose = TRUE],
output = "GForce optimized j")
#make sure works with order
test(1672.6, DT[order(V1), .I[1L], by = V1],
output = " V1 V1\n1: 1 1\n2: 2 2")
# should also work with subsetting
test(1672.7, DT[1:5, .(.I[1L], V2[1L]), by = V1],
output = " V1 V1 V2\n1: 1 1 1\n2: 2 2 2")
#tests for #1528
TT <- as.IDate("2016-04-25")
test(1673.1, TT + 4L, as.IDate("2016-04-29"))
test(1673.2, TT + 4, as.IDate("2016-04-29"))
test(1673.3, TT - 3, as.IDate("2016-04-22"))
test(1673.4, TT - 3L, as.IDate("2016-04-22"))
test(1673.5, as.IDate("2016-04-28") - as.IDate("2016-04-20"), 8L)
# test for radix integer order when MAXINT is present AND decreasing=TRUE AND na.last=FALSE
# https://bugs.r-project.org/bugzilla/show_bug.cgi?id=16925
# It seems this 'just' fails ASAN, but also results in seg fault under some compilers
# https://github.com/rstudio/shiny/issues/1200
test(1674, forderv(c(2147483645L, 2147483646L, 2147483647L, 2147483644L), order=-1L), INT(3,2,1,4))
# fix for #1718
# In R-devel somewhere between 12 June 2017 (r72786) and 27 June 2017 (r72859), the behaviour of factor() changed.
# Test updated minimally to create the previous representation directly instead of going via factor().
A = data.table(foo = c(1, 2, 3), bar = c(4, 5, 6))
A[, bar := factor(bar, levels = c(4, 5), labels = c("Boop", "Beep"), exclude = 6)]
B = data.table(foo = c(1, 2, 3, 4, 5, 6), bar = structure(c(3L, 3L, 3L, 1L, 2L, NA), .Label=c("Boop","Beep",NA), class="factor"))
test(1675.1, as.integer(B[A, bar := i.bar, on="foo"]$bar), c(1:2,NA,1:2,NA)) # removing the NA level is a change in v1.12.4
A = data.table(foo = c(1, 2, 3), bar = c(4, 5, 6))
B = data.table(foo = c(1, 2, 3, 4, 5, 6), bar = c(NA, NA, NA, 4, 5, 6))
A[, bar := factor(bar, levels = c(4, 5), labels = c("Boop", "Beep"), exclude = 6)]
B[, bar := factor(bar, levels = c(4, 5), labels = c("Boop", "Beep"), exclude = 6)]
test(1675.2, as.integer(B[A, bar := i.bar, on="foo"]$bar), c(1:2,NA,1:2,NA))
# fwrite na arg segfault fix, #1725
dt = data.table(x=1:2, y=c(NA,"a"))
f = tempfile()
test(1676.1, fwrite(dt, f, na=NULL), error="is not TRUE")
fwrite(dt, f, na=NA)
test(1676.2, fread(f), data.table(x=1:2, y=c(NA, "a")))
unlink(f)
# duplicate names in foverlaps #1730
a = data.table(start = 1:5, end = 2:6, c2 = rnorm(10), c2 = rnorm(10), key=c("start","end"))
b = data.table(start = 1:5, end = 2:6, c3 = rnorm(5), key=c("start","end"))
test(1677.1, foverlaps(a, b), error="x has some duplicated column")
test(1677.2, foverlaps(b, a), error="y has some duplicated column")
# na.omit.data.table removes indices #1734
dt = data.table(a=4:1, b=c(letters[c(1L,NA,2:3)]))
setindexv(dt, "a")
test(1678.1, indices(dt2 <- na.omit(dt, cols="b")), NULL)
setindexv(dt2, "a")
test(1678.2, indices(na.omit(dt2, cols="b")), "a")
# rleid gains `prefix` argument, similar to rowid
x = sample(3,10,TRUE)
test(1679.1, rleid(x, prefix="id"), paste0("id", rleid(x)))
test(1679.2, rleidv(x, prefix="id"), paste0("id", rleidv(x)))
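# Commented sketch of rleid() with prefix (illustration only, not a numbered test):
#   rleid(c(1,1,2,2,1))              # 1L 1L 2L 2L 3L  -- run-length group ids
#   rleid(c(1,1,2,2,1), prefix="id") # "id1" "id1" "id2" "id2" "id3"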
# calling melt.data.table with patterns() from within a function, #1749
x = data.table(x1=1:2, x2=3:4, y1=5:6, y2=7:8, z1=9:10, z2=11:12)
foo <- function(x) {
pats = c("^y", "z")
melt(x, measure.vars=patterns(pats))
}
test(1680.1, foo(x), melt(x, measure.vars=patterns("^y", "^z")))
# melt warning prints only first 5 cols, #1752
if (test_R.utils) {
DT = fread(testDir("melt-warning-1752.tsv.gz"))
ans = suppressWarnings(melt(DT[, names(DT) %like% "(^Id[0-9]*$)|GEOGRAPHIC AREA CODES", with=FALSE], id=1:2))
test(1681, melt(DT[, names(DT) %like% "(^Id[0-9]*$)|GEOGRAPHIC AREA CODES", with=FALSE], id.vars=1:2),
ans, warning="are not all of the same type")
}
# non-equi joins with by=.EACHI. Not as exhaustive, but given how exhaustive the previous
# tests were, this should be fine; we'll add tests as we go along.
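# Commented reminder of the semantics (illustration only): with by=.EACHI, j is evaluated once per
# row of i grouped by its join matches, e.g. for the dt1/dt2 defined just below
#   dt1[dt2, .N, by=.EACHI, on=.(x>=V1, y<=V2)]   # j evaluated within each dt2 row's set of matching dt1 rows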
set.seed(45L)
dt1 = data.table(x=sample(8,20,TRUE), y=sample(8,20,TRUE), z=1:20)
dt2 = data.table(c(2,5), c(5,7), c(2,4))
dt3 = data.table(c(12,5), c(15,7), c(2,4))
test(1682.1, dt1[dt2, .N, by=.EACHI, on=.(x>=V1, y<=V2)], dt1[dt2, on=.(x>=V1, y<=V2)][, .N, by=.(x,y)])
test(1682.2, dt1[dt2, sum(z), by=.EACHI, on=.(x>=V1, y<=V2)], dt1[dt2, on=.(x>=V1, y<=V2)][, sum(z), by=.(x,y)])
test(1682.3, dt1[dt2, as.numeric(median(z)), by=.EACHI, on=.(x>=V1, y<=V2)], dt1[dt2, on=.(x>=V1, y<=V2)][, median(z), by=.(x,y)])
test(1682.4, dt1[dt3, .N, by=.EACHI, on=.(x>=V1, y<=V2)], dt1[dt3, on=.(x>=V1, y<=V2)][, .(N=sum(!is.na(z))), by=.(x,y)])
test(1682.5, dt1[dt3, .N, by=.EACHI, on=.(x>=V1, y<=V2), nomatch=0L], dt1[dt3, on=.(x>=V1, y<=V2), nomatch=0L][, .N, by=.(x,y)])
test(1682.6, dt1[dt2, on=.(x>=V1, y<=V2), sum(z)*V3, by=.EACHI], dt1[dt2, on=.(x>=V1, y<=V2)][, sum(z)*V3[1L], by=.(x,y)])
test(1682.7, dt1[dt3, on=.(x>=V1, y<=V2), sum(z)*V3, by=.EACHI], dt1[dt3, on=.(x>=V1, y<=V2)][, sum(z)*V3[1L], by=.(x,y)])
# add test for update operation
idx = dt1[dt2[1], which=TRUE, on=.(x>=V1, y<=V2)]
test(1682.8, copy(dt1)[dt2[1], z := 2L*z, by=.EACHI, on=.(x>=V1, y<=V2)], copy(dt1)[(idx), z := 2L*z])
# test for add by reference
test(1682.9, copy(dt1)[dt2[1], foo := z, by=.EACHI, on=.(x>=V1, y<=V2)], copy(dt1)[(idx), foo := z])
# test for nomatch=0L with by=.EACHI fix for non-equi joins
dt = data.table(x=c(1,4,7,10), y=c(6,12,18,24), z=4:1)
test(1683.1, dt[.(c(2,15), c(100,25)), sum(z), on=.(x>=V1, y<=V2), by=.EACHI], data.table(x=c(2,15), y=c(100,25), V1=c(6L, NA)))
test(1683.2, dt[.(c(2,15), c(100,25)), sum(z), on=.(x>=V1, y<=V2), by=.EACHI, nomatch=0L], data.table(x=2, y=100, V1=6L))
# unique should remove index #1760
dt <- data.table(a = c("1", "1", "2", "2", "3", "4", "4", "4"),
b = letters[1:8],
d = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE))
dt[d == TRUE, `:=`(b = "M")] # create index
udt <- unique(dt, by = c("a", "b"))
test(1684, nrow(udt[d == TRUE]), 2L)
# #1758, data.table print issue
foo <- function(annot=c("a", "b")) {
dt = data.table(x=annot, y=NA)
ro = structure(list(dt=dt), class="dtu")
suppressWarnings(ro$dt[, flag := TRUE])
ro
}
old = options(datatable.verbose=FALSE)
test(1685, grep("dtu", capture.output(foo())), 7L)
options(old)
# fix for #1771
test(1685.1, uniqueN(1L), 1L)
test(1685.2, uniqueN(1L, na.rm=TRUE), 1L)
# fix for #1744
DT = data.table(ID = 1:2, A = 3:4, B = 5:6)
test(1686.1, DT[, .(A,B)], DT[, c(mget("A"), .SD), .SDcols="B"])
test(1686.2, DT[, .(V1=A,B)], DT[, c(.(get("A")), .SD), .SDcols="B"])
# tests for first
test(1687.1, first(1:5), 1L)
test(1687.2, first(data.table(x=1:5, y=6:10)), data.table(x=1L, y=6L))
test(1687.3, first(integer(0L)), integer(0L))
test(1687.4, first(1:5, 2), 1:2)
if (test_bit64) {
# fix for #1385 and part of #1459
x1 = data.table(id=1, value=as.integer64(1))
x2 = data.table(id=c(1,2))
test(1688.1, merge(x2, x1, by="id", all.x=TRUE)$value, as.integer64(c(1,NA)))
x1 = data.table(x = c(1),y = integer64(1))
x2 = data.table(x = c(1,2))
test(1688.2, merge(x1, x2, all=TRUE, by="x")$y, as.integer64(c(0, NA)))
}
# Check that tz is passed through to as.Date(), #1498
date_tz = as.POSIXct("2016/01/13 17:00", tz = "America/Los_Angeles")
test(1689.1, capture.output(IDateTime(date_tz)), c(" idate itime", "1: 2016-01-13 17:00:00"))
# and that as.IDate.POSIXct is exported too
test(1689.2, as.IDate(date_tz), output="2016-01-13")
date_tz = structure(1496275200.11903, class = c("POSIXct", "POSIXt"), tzone = "America/Los_Angeles")
test(1689.3, as.character(as.IDate(date_tz)), "2017-05-31")
test(1689.4, as.character(as.IDate(date_tz, tz="UTC")), "2017-06-01")
# fix for #1766 and #1704
A = data.table(i = 1:6, j = rep(1:2, 3), x = letters[1:6], key = "i")
B = data.table(j = 1:2, y = letters[1:2], key = "j")
test(1690.1, key(A[B, on = "j"]), NULL)
test(1690.2, key(A[B, on = "j"]), NULL)
dt <- data.table(
origin = c("A", "A", "A", "A", "A", "A", "B", "B", "A", "A", "C", "C", "B", "B", "B", "B", "B", "C", "C", "B", "A", "C", "C", "C", "C", "C", "A", "A", "C", "C", "B", "B"),
destination = c("A", "A", "A", "A", "B", "B", "A", "A", "C", "C", "A", "A", "B", "B", "B", "C", "C", "B", "B", "A", "B", "C", "C", "C", "A", "A", "C", "C", "B", "B", "C", "C"),
points_in_dest = c(5, 5, 5, 5, 4, 4, 5, 5, 3, 3, 5, 5, 4, 4, 4, 3, 3, 4, 4, 5, 4, 3, 3, 3, 5,5, 3, 3, 4, 4, 3, 3),
depart_time = c(7, 8, 16, 18, 7, 8, 16, 18, 7, 8, 16, 18, 7, 8, 16, 7, 8, 16, 18, 8, 16, 7, 8, 18, 7, 8, 16, 18, 7, 8, 16, 18),
travel_time = c(0, 0, 0, 0, 70, 10, 70, 10, 10, 10, 70, 70, 0, 0, 0, 70, 10, 10, 70, 70, 10, 0, 0, 0, 10, 70, 10, 70, 10, 70, 70, 10))
dt[ depart_time<=8 & travel_time < 60, condition1 := TRUE]
dt[ depart_time>=16 & travel_time < 60, condition2 := TRUE]
setkey(dt, origin, destination)
res <- unique(dt[(condition1)],by=key(dt))[unique(dt[(condition2)], by=key(dt)),
on = c(destination = "origin", origin = "destination"),
nomatch = 0L]
test(1690.3, res[, .(points = sum(points_in_dest)), keyby = origin], data.table(origin=LETTERS[1:3], points=c(9,7,12), key="origin"))
# fix for #1626 (so that rbind plays nicely with non-list inputs, e.g., package
# psych creates a list with the input data.frame/data.table and a matrix it
# creates...)
dt = data.table(x=1:5, y=6:10)
test(1691, rbind(dt, dt), rbind(dt, as.matrix(dt)))
# For #1783 -- subsetting a data.table by an ITime object
test(1692, capture.output(as.data.table(structure(57600L, class = "ITime"))),
c(" V1", "1: 16:00:00"))
# testing all time part extraction routines (subsumes #874)
t <- "2016-08-03 01:02:03.45"
test(1693.1, second(t), 3L)
test(1693.2, minute(t), 2L)
test(1693.3, hour(t), 1L)
test(1693.4, yday(t), 216L)
test(1693.5, wday(t), 4L)
test(1693.6, week(t), 31L)
test(1693.7, month(t), 8L)
test(1693.8, quarter(t), 3L)
test(1693.9, year(t), 2016L)
# fix for #1740 - sub-assigning NAs for factors
dt = data.table(x = 1:5, y = factor(c("","a","b","a", "")), z = 5:9)
ans = data.table(x = 1:5, y = factor(c(NA,"a","b","a", NA)), z = 5:9)
test(1694.0, dt[y=="", y := NA], ans)
# more tests for between()
x = c(NaN, NA, 1, 5, -Inf, Inf)
test(1695.01, x %between% c(3, 7), c(NA, NA, FALSE, TRUE, FALSE, FALSE))
test(1695.02, x %between% c(NA, 7), c(NA, NA, TRUE, TRUE, TRUE, FALSE))
test(1695.03, x %between% c(3, NA), c(NA, NA, FALSE, TRUE, FALSE, TRUE))
test(1695.04, x %between% c(NA, NA), c(NA, NA, TRUE, TRUE, TRUE,TRUE))
test(1695.05, x %between% c(NA_real_, NA_real_), c(NA, NA, TRUE, TRUE, TRUE,TRUE))
test(1695.06, x %between% list(c(1,2,3,4,5,6), 10), c(NA, NA, FALSE, TRUE, FALSE, FALSE))
x = c(NA, 1L, 5L)
test(1695.07, x %between% c(3, 7), c(NA, FALSE, TRUE))
test(1695.08, x %between% c(NA, 7), c(NA, TRUE, TRUE))
test(1695.09, x %between% c(3, NA), c(NA, FALSE, TRUE))
test(1695.11, x %between% c(NA, NA), c(NA, TRUE, TRUE))
x = rep(NA_integer_, 3)
test(1695.12, x %between% c(3, 7), rep(NA, 3L))
test(1695.13, x %between% c(NA, 7), rep(NA, 3L))
test(1695.14, x %between% c(3, NA), rep(NA, 3L))
test(1695.15, x %between% c(NA, NA), rep(NA, 3L))
x = integer(0)
test(1695.16, x %between% c(3, 7), logical(0))
test(1695.17, TRUE %between% c(3, 7), error="between has been passed an argument x of type logical")
x = c("foo","bar","paz")
test(1695.18, between(x, "bag", "fog"), c(FALSE, TRUE, FALSE))
test(1695.19, between(x, c("b","f","a"), "q"), c(TRUE, FALSE, TRUE))
test(1695.20, between(x, c("foo","baq","bar"), "paz", incbounds=TRUE), c(TRUE, TRUE, TRUE))
test(1695.21, between(x, c("foo","baq","bar"), "paz", incbounds=FALSE), c(FALSE, TRUE, FALSE))
x = c(3.14, 3.20, -42, Inf)
test(1695.22, between(x, c(3,4,-60,5), c(3.14,10,-30,Inf)), c(TRUE,FALSE,TRUE,TRUE))
test(1695.23, between(x, c(3,4,-60,5), c(3.14,10,-30,Inf), incbounds=FALSE), c(FALSE,FALSE,TRUE,FALSE))
test(1695.24, between(x, c(3,4,-60), 99), error="Incompatible vector lengths: length(x)==4 length(lower)==3 length(upper)==1. Each should be either length 1 or the length of the longest")
test(1695.25, between(x, c(3,4,-60,5), c(99,98)), error="Incompatible vector lengths: length(x)==4 length(lower)==4 length(upper)==2. Each should be either length 1 or the length of the longest")
test(1695.26, between(0, -5:2, -2:5), c(FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE))
test(1695.27, between(0:1, -5:2, -2:5), error="Incompatible vector lengths: length(x)==2 length(lower)==8 length(upper)==8. Each should be either length 1 or the length of the longest")
test(1695.28, between(0, -5:2, -2:4), error="Incompatible vector lengths: length(x)==1 length(lower)==8 length(upper)==7. Each should be either length 1 or the length of the longest")
# test for #1819, verbose message for bmerge
options(datatable.verbose = TRUE)
x = data.table(A = 10:17)
test(1696.0, x[A %inrange% 13:14], output="bmerge")
# restore verbosity
options(datatable.verbose = FALSE)
# adding a test for #1825 (it is not about timing but about correctness when
# joining keyed tables using the 'on' argument)
x = data.table(a=1:3, b=4:6, key="a")
y = data.table(a=2:4, c=7:9)
test(1697.1, x[y], x[y, on=key(x)])
y = data.table(m=2:4, c=7:9, key="m")
test(1697.2, x[y], x[y, on=c(a="m")])
# #1823, fix for 'on=' on keyed anti-joins losing the key
x = data.table(id = 1:10, val = letters[1:10], key = "id")
y = data.table(id = 3:6, key = "id")
test(1698.1, key(x[!y]), key(x[!y, on = "id"]))
# minor enhancement to dcast, #1821
dt = data.table(x=c(1,1,1,2,2,2), y=1:6, z=6:1)
test(1699.1, dcast(dt, x ~ ., value.var="z", fun.aggregate=list(sd, mean)), data.table(x=c(1,2), z_sd=1, z_mean=c(5,2), key="x"))
# minor enhancement to dcast, #1810
dt = data.table(
var1 = c("a","b","c","b","d","e","f"),
var2 = c("aa","bb","cc","dd","ee","ee","ff"),
subtype = c("1","2","2","2","1","1","2"),
type = c("A","A","A","A","B","B","B")
)
test(1700.1, dcast(dt, type ~ subtype, value.var = c("var1", "var2"), fun.aggregate = function(v) paste0(unique(v), collapse = "|")),
data.table(type=c("A","B"), var1_1=c("a", "d|e"), var1_2=c("b|c", "f"),
var2_1=c("aa", "ee"), var2_2=c("bb|cc|dd","ff"), key="type"))
# fixing a regression introduced when the 'x.' prefix functionality in 'j' was provided (for non-equi joins)
A = data.table(x=c(1,1,1,2,2), y=1:5, z=5:1)
B = data.table(x=c(2,3), val=4)
col1 = "y"
col2 = "x.y"
test(1701.1, A[, .(length(x), length(y)), by=x], data.table(x=c(1,2), V1=1L, V2=c(3:2)))
test(1701.2, A[, .(x), by=x], data.table(x=c(1,2), x=c(1,2)))
test(1701.3, A[B, x.x, on="x"], c(2,2,NA))
test(1701.4, A[B, x.y, on="x"], c(4:5,NA))
test(1701.5, A[B, .(get("x"), get("x.x")), on="x"], data.table(V1=c(2,2,3), V2=c(2,2,NA)))
test(1701.6, A[B, mget(c("x", "x.x")), on="x"], data.table(x=c(2,2,3), x.x=c(2,2,NA)))
# 1761 fix as well
test(1701.7, A[B, .(x.x, get("x.x"), x.y), on="x", by=.EACHI], data.table(x=c(2,2,3), x.x=c(2,2,NA), V2=c(2,2,NA), x.y=c(4:5,NA)))
dt = data.table(a=1L)
test(1701.8, dt[dt, .(xa=x.a, ia=i.a), .EACHI, on="a"], data.table(a=1L, xa=1L, ia=1L))
# ISO 8601-consistent week numbering, #1765
# test cases via https://en.wikipedia.org/wiki/ISO_week_date
# as well as specified in relation to #2407
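# Commented reminder of the ISO 8601 rule behind these expectations (illustration only): weeks start
# on Monday and week 1 is the week containing the first Thursday of the year (equivalently, the week
# containing January 4th); e.g. isoweek("2005-01-01") is 53 because that Saturday still belongs to
# the last ISO week of 2004.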
test_cases <- c("2005-01-01", "2005-01-02", "2005-12-31",
"2007-01-01", "2007-12-30", "2007-12-31",
"2008-01-01", "2008-12-28", "2008-12-29",
"2008-12-30", "2008-12-31", "2009-01-01",
"2009-12-31", "2010-01-01",
"2010-01-02", "2010-01-03",
#see https://stackoverflow.com/questions/43944430 & #2407
"2014-12-29", "2014-12-22", "2015-02-02")
test_values <- c(53L, 53L, 52L, 1L, 52L, 1L, 1L,
52L, 1L, 1L, 1L, 1L, 53L, 53L, 53L, 53L,
1L, 52L, 6L)
test(1702.1, isoweek(test_cases), test_values)
# calculating via character skirts timezone issues,
# but calculating from Date brings these into play, #2407
test(1702.2, isoweek(as.Date(test_cases)), test_values)
# *** OBSCURE ERROR WHEN Sys.timezone() = 'America/Argentina/Buenos_Aires' ***
test(1702.3, isoweek(as.POSIXct(test_cases)), test_values)
# 1% sample of a 400-year cycle of dates for extra robustness
if (test_R.utils) test(1702.4, isoweek((DT<-fread(testDir('isoweek_test.csv.bz2')))$input_date), DT$expected_output)
# fread, ensure no shell commands #1702
if (.Platform$OS.type=="unix") {
cat("a,b\n4,2", file=f<-tempfile())
cmd <- sprintf("cat %s", f)
options(datatable.fread.input.cmd.message = TRUE)
test(1703.01, fread(cmd), ans<-data.table(a=4L, b=2L), message="Please use fread.cmd=.*security concern.*Please read item 5 in the NEWS file for v1.11.6")
options(datatable.fread.input.cmd.message = NULL) # when option is missing as it is by default, then TRUE
test(1703.02, fread(cmd), ans, message="security concern")
options(datatable.fread.input.cmd.message = FALSE)
test(1703.03, tryCatch(fread(cmd), message=stop), ans)
options(datatable.fread.input.cmd.message = NULL)
test(1703.04, fread(cmd=cmd), ans)
test(1703.05, fread(file=cmd), error=sprintf("File '%s' does not exist", cmd))
unlink(f)
# Test 'text' argument
test(1703.06, fread(text="https://example.com,A\na,b"), data.table(`https://example.com`='a', A='b'))
# errors
test(1703.07, fread(input="A,B", text="A,B"), error="Used more than one of the arguments input=, file=, text= and cmd=")
test(1703.08, fread(file="A,B", text="A,B"), error="Used more than one of the arguments input=, file=, text= and cmd=")
test(1703.09, fread(input="A,B", file="A,B"), error="Used more than one of the arguments input=, file=, text= and cmd=")
test(1703.10, fread(text=1), error="'text=' is type double but must be character")
# Zero-length text
test(1703.11, fread(text = character(0)), data.table())
# 'text' input of length > 1
test(1703.12, fread(text=c("A B","C D"), sep=" ", data.table=FALSE), y=read.table(text=c("A B","C D"), as.is=TRUE, header=TRUE))
# test error if length>1 character passed to input, not text:
test(1703.13, fread(c("foo","bar","baz")), error="must be a single character string")
cat("a,b\n8,4", file=f<-tempfile("has space"))
test(1703.14, fread(f), data.table(a=8L, b=4L)) # if file with space exists, the file name takes precedence not command
unlink(f)
}
test(1703.15, fread("."), error="File '.' is a directory. Not yet implemented.")
# tmpdir argument
d = tempfile("dir")
test(1703.16, fread(text=c('a,b','1,2'), tmpdir=d), error="cannot open the connection", warning="No such file or directory")
dir.create(d)
test(1703.17, fread(text=c('a,b','1,2'), tmpdir=d), data.table(a=1L,b=2L))
test(1703.18, fread(text=c('a,b','1,2')), data.table(a=1L, b=2L))
unlink(d)
# Ensure all.equal respects 'check.attributes' w.r.t. column names, as testthat::check_equivalent relies on this
# and it is used by package popEpi in its tests
test(1704, all.equal(data.table( a=1:3, b=4:6 ), data.table( A=1:3, B=4:6 ), check.attributes=FALSE))
# all.equal.data.table should consider modes equal like base R (detected via Bioc's flowWorkspace tests)
test(1707.1, all.equal( data.frame(a=0L), data.frame(a=0) ) )
test(1707.2, all.equal( data.table(a=0L), data.table(a=0) ) )
test(1708.1, !isTRUE(all.equal( data.frame(a=0L), data.frame(a=FALSE) )))
test(1708.2, all.equal( data.table(a=0L), data.table(a=FALSE) ),
"Datasets have different column modes. First 3: a(numeric!=logical)")
x = data.frame(a=0L)
y = data.frame(a=0)
setattr(y[[1]],"class",c("hello","world"))
test(1709.1, !isTRUE(all.equal(x,y,check.attributes=TRUE))) # desired
test(1709.2, !isTRUE(all.equal(x,y,check.attributes=FALSE))) # not desired
x = as.data.table(x)
y = as.data.table(y)
test(1710.1, mode(x[[1]]) == mode(y[[1]]))
test(1710.2, storage.mode(x[[1]]) != storage.mode(y[[1]]))
test(1710.3, class(y[[1]]), c("hello","world"))
test(1710.4, all.equal(x,y,check.attributes=TRUE), # desired
"Datasets have different column classes. First 3: a(numeric!=hello;world)")
test(1710.5, isTRUE(all.equal(x,y,check.attributes=FALSE))) # desired
# Include tests as-is from #1252 (unexpected NA row from logical subsets with 1-row containing NA)
DT = data.table(a=1, d=NA)
test(1711, DT[!is.na(a) & d == "3"], DT[0])
DT = data.table(a = c(1,2), d = c(NA,3))
test(1712, DT[!is.na(a) & d == "3"], DT[2])
test(1713, DT[d==3], DT[2])
# Test new helpful error message suggested by Jan
notAColName = 1
test(1714.1, exists("notAColName")) # use a long column name to be sure it exists and unique
test(1714.2, !exists("notInCallingScope")) # use a long column name to be sure it exists and unique
DT = data.table(a=1:3, b=4:6)
test(1715, DT[,b], 4:6) # old behaviour for sure tested before but here for context
test(1716.1, DT[,notAColName], error="column name 'notAColName' is not found") # ensure it doesn't find it in the calling scope either
test(1716.2, DT[, ..notInCallingScope], error="Variable 'notInCallingScope' is not found in calling scope")
test(1716.3, DT[, notInCallingScope, with=FALSE], error="Variable 'notInCallingScope' is not found in calling scope")
# Test out-of-bounds error on numeric j
DT = data.table(a=1:3, b=4:6, c=7:9)
test(1717, DT[,4], error="Item 1 of j is 4 which is outside the column number range.*ncol=3")
test(1718, DT[,0], null.data.table())
test(1719, DT[,c(2,0,1)], data.table(b=4:6, a=1:3))
test(1720.1, DT[,c(-2,2)], error="j mixes positives and negatives")
test(1720.2, DT[,-2], DT[,c(1,3)])
test(1720.3, DT[,c(-2,-3)], DT[,1])
test(1720.4, DT[,-(1:3)], null.data.table())
test(1721, DT[,c(1,0,5)], error="Item 3 of j is 5 which.*ncol=3") # to check it says Item 3 even though 0 gets removed internally
# Tests to ensure the auto with=FALSE of ! and - only applies to symbols around : (i.e. DT[,!(colB:colE)]) and not to any other symbol usage inside ! and -. Thanks to Mark L #1864, confirmed by Michael C; both tests added as-is
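# Commented sketch (illustration only; 'b' and 'd' here are assumed column names, not from the DT below):
#   DT[, !(b:d)]      # column-range deselection: auto with=FALSE because ':' joins two bare symbols
#   DT[, !is.na(b)]   # any other expression inside ! is evaluated as ordinary j, returning a logical vector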
DT = data.table(FieldName = c(1,2,NA,4,NA,6), rowId=1:6, removalIndex=c(2,7,0,5,10,0))
test(1722.1, DT[,!is.na(as.numeric(FieldName))], c(TRUE,TRUE,FALSE,TRUE,FALSE,TRUE))
test(1722.2, DT[,(!is.na(as.numeric(FieldName)))], c(TRUE,TRUE,FALSE,TRUE,FALSE,TRUE))
test(1723.1, DT[removalIndex>0,rowId-(2*removalIndex-1)], c(-2,-11,-5,-14))
test(1723.2, DT[removalIndex>0,(rowId-(2*removalIndex-1))], c(-2,-11,-5,-14))
DT = data.table(FieldName = c("1", "2", "3", "four", "five", "6"))
test(1724.1, DT[, is.na(as.numeric(FieldName))], c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE), warning="NAs introduced by coercion")
test(1724.2, DT[, !is.na(as.numeric(FieldName))], c(TRUE,TRUE,TRUE,FALSE,FALSE,TRUE), warning="NAs introduced by coercion")
# Ensure NAs are added properly when a new column is added, not all the target rows are joined to, and the number of i
# rows is equal to or greater than the number of rows in the target table.
DT = data.table(a=1:3, key="a")
DT[.(4), add0:=1.1][] # didn't break due to 95e438c on 29 Sep 2016
DT[.(c(3,4)), add1:=1.1][] # didn't break
DT[.(c(3,3,4)), add2:=1.1][] # did break
DT[.(2:4), add3:=1.1][] # did break
test(1725, DT, data.table(a=1:3, add0=NA_real_, add1=c(NA,NA,1.1), add2=c(NA,NA,1.1), add3=c(NA,1.1,1.1), key="a"))
# keyby= runs groups in sorted order, #606. Only relevant when j does something that depends on previous group, perhaps
# by using <<-. To run in appearance order use by=. See also #1880.
# It wasn't useful to always run groups in appearance order. Now we have the option and it's consistent.
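# Commented sketch using the DT defined just below (illustration only, not a numbered test):
#   DT[, .N, keyby=grp]   # groups run and appear in sorted key order: grp 1,2,3
#   DT[, .N, by=grp]      # groups run in first-appearance order:      grp 3,2,1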
DT = data.table(grp=rep(3:1,each=3), val=1:9)
lastGrp = 0L
test(1726.1, DT[, {ans=mean(val)+lastGrp; lastGrp<<-min(val); .(ans, .GRP)}, keyby=grp],
data.table(grp=1:3, ans=c(8,12,6), GRP=1:3, key="grp") )
test(1726.2, lastGrp, 1L)
lastGrp = -1L
test(1726.3, DT[, {ans=mean(val)+lastGrp; lastGrp<<-min(val); .(ans, .GRP)}, by=grp],
data.table(grp=3:1, ans=c(1,6,12), GRP=1:3) )
test(1726.4, lastGrp, 7L)
rm(lastGrp)
# better := verbose messages, #1808
DT = data.table(a = 1:10)
test(1727.1, DT[a < 5, a := 5L, verbose=TRUE], output="Assigning to 4 row subset of 10 rows")
test(1727.2, DT[a < 5, a := 5L, verbose=TRUE], output="No rows match i.*Assigning to 0 row subset of 10 rows")
test(1727.3, DT[0, d:=1, verbose=TRUE], data.table(a=c(rep(5L,5L),6:10), d=NA_real_),
output = "Assigning to 0 row subset of 10 rows.*Added 1 new column initialized with all-NA")
test(1727.4, DT[.(a=11L), on="a", c("f","g"):=.(1L,"dummy"), verbose=TRUE],
data.table(a=c(rep(5L,5L),6:10), d=NA_real_, f=NA_integer_, g=NA_character_),
output = "Assigning to 0 row subset of 10 rows.*Added 2 new columns initialized with all-NA")
# Add a test for the working, no-problem case of na.last=NA with a subgroup of size 2 containing 1 NA,
# and 2 cases of na.last=NA with size 2 and 1 NA that randomly failed due to use of uninitialized memory
DT = data.table(x=INT(2,2,2,1,1), y=INT(1,NA,3,2,NA))
test(1728.01, DT[order(x,y,na.last=TRUE)], data.table(x=INT(1,1,2,2,2), y=INT(2,NA,1,3,NA)))
test(1728.02, DT[order(x,y,na.last=FALSE)], data.table(x=INT(1,1,2,2,2), y=INT(NA,2,NA,1,3)))
test(1728.03, DT[order(x,y,na.last=NA)], data.table(x=INT(1,2,2), y=INT(2,1,3)))
# 1 row
DT = data.table(x=NA_integer_, y=1)
test(1728.04, DT[order(x,y,na.last=TRUE)], DT)
test(1728.05, DT[order(x,y,na.last=FALSE)], DT)
test(1728.06, DT[order(x,y,na.last=NA)], DT[0])
# 2 rows with 1 NA
DT = data.table(x=as.integer(c(NA,1)), y=2:3)
test(1728.07, DT[order(x,y,na.last=TRUE)], DT[c(2,1)])
test(1728.08, DT[order(x,y,na.last=FALSE)], DT)
test(1728.09, DT[order(x,y,na.last=NA)], DT[2]) # was randomly wrong
test(1728.10, DT[order(x,na.last=TRUE)], DT[c(2,1)])
test(1728.11, DT[order(x,na.last=FALSE)], DT)
test(1728.12, DT[order(x,na.last=NA)], DT[2]) # was randomly wrong
# fwrite gave wrong output and crashed on 9.9999999999999982236431605, #1847
if (test_longdouble) { #3258
old = options(datatable.verbose=FALSE) # capture.output() exact tests must not be polluted with verbosity
test(1729.01, fwrite(data.table(V1=c(1), V2=c(9.9999999999999982236431605997495353221893310546875))),
output="V1,V2\n1,10")
test(1729.02, fwrite(data.table(V2=c(9.9999999999999982236431605997495353221893310546875), V1=c(1))),
output="V2,V1\n10,1")
DT = data.table(V1=c(9999999999.99, 0.00000000000000099, 0.0000000000000000000009, 0.9, 9.0, 9.1, 99.9,
0.000000000000000000000999999999999999999999999,
99999999999999999999999999999.999999))
ans = "V1\n9999999999.99\n9.9e-16\n9e-22\n0.9\n9\n9.1\n99.9\n1e-21\n1e+29"
test(1729.03, fwrite(DT), output=ans)
test(1729.04, write.csv(DT,row.names=FALSE,quote=FALSE), output=ans)
# same decimal/scientific rule (shortest format) as write.csv
DT = data.table(V1=c(-00000.00006, -123456789.123456789,
seq.int(-1000,1000,17),
seq(-1000,1000,pi*87),
-1.2345678912345 * 10^(c((-30):30)),
+1.2345678912345 * 10^(c((-30):30)),
-1.2345 * 10^((-20):20),
+1.2345 * 10^((-20):20),
-1.7 * 10^((-20):20),
+1.7 * 10^((-20):20),
-7 * 10^((-20):20),
+7 * 10^((-20):20),
0, NA, NaN, Inf, -Inf,
5.123456789e-290, -5.123456789e-290,
5.123456789e-307, -5.123456789e-307,
5.123456789e+307, -5.123456789e+307))
test(1729.05, nrow(DT), 507L)
x = capture.output(fwrite(DT,na="NA"))[-1] # -1 to remove the column name V1
y = capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))[-1]
# One mismatch that seems to be an accuracy issue in base R's write.csv
# tmp = cbind(row=1:length(x), `fwrite`=x, `write.csv`=y)
# tmp[x!=y,]
# row fwrite write.csv
# 177 "-1234567891234500000" "-1234567891234499840"
# 238 "1234567891234500000" "1234567891234499840"
# looking in surrounding rows for the first one shows the switch point :
# tmp[175:179,]
# row fwrite write.csv
# 175 "-12345678912345000" "-12345678912345000" # ok
# 176 "-123456789123450000" "-123456789123450000" # ok
# 177 "-1234567891234500000" "-1234567891234499840" # e+18 last before switch to scientific
# 178 "-1.2345678912345e+19" "-1.2345678912345e+19" # ok
# 179 "-1.2345678912345e+20" "-1.2345678912345e+20" # ok
test(1729.06, x[c(177,238)], c("-1234567891234500000","1234567891234500000"))
x = x[-c(177,238)]
y = y[-c(177,238)]
test(1729.07, length(x), 505L)
test(1729.08, x, y)
if (!identical(x,y)) print(data.table(row=1:length(x), `fwrite`=x, `write.csv`=y)[x!=y])
DT = data.table(c(5.123456789e+300, -5.123456789e+300,
1e-305,1e+305, 1.2e-305,1.2e+305, 1.23e-305,1.23e+305))
ans = c("V1","5.123456789e+300","-5.123456789e+300",
"1e-305","1e+305","1.2e-305","1.2e+305","1.23e-305","1.23e+305")
# explicitly check against ans rather than just comparing fwrite to write.csv so that :
# i) we can easily see intended results right here in future without needing to run
# ii) we don't get a false pass if fwrite and write.csv agree but are both wrong because of
# a problem with the test mechanism itself or something else strange or unexpected
# Exactly the same binary representation on both linux and windows (so any differences in
# output are not because the value itself is stored differently) :
test(1729.09, binary(DT[[1]]),
c("0 11111100101 111010011010000100010111101110000100 11110100 00000100",
"1 11111100101 111010011010000100010111101110000100 11110100 00000100",
"0 00000001001 110000010110110001011100010100100101 00110101 01110101",
"0 11111110100 001000111010010100010110111010000010 11011001 10111010",
"0 00000001010 000011011010011101101010100101111100 10111001 10101101",
"0 11111110100 010111011111100101001110101100000011 01101011 10101100",
"0 00000001010 000101000110010100110011101010000110 00111110 01010001",
"0 11111110100 011001101011100100100011110110110000 01001110 01011101"))
test(1729.10, fwrite(DT,na=""), output=ans)
test(1729.11, write.csv(DT,row.names=FALSE,quote=FALSE), output=ans)
DT = data.table(unlist(.Machine[c("double.eps","double.neg.eps","double.xmin","double.xmax")]))
# double.eps double.neg.eps double.xmin double.xmax
# 2.220446e-16 1.110223e-16 2.225074e-308 1.797693e+308
test(1729.12, typeof(DT[[1L]]), "double")
test(1729.13, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE)))
options(old) # restore the previous datatable.verbose value, for example for the CRAN_Release test with verbose on
}
if (test_bit64) {
test(1730.1, typeof(-2147483647L), "integer")
test(1730.2, as.integer(-2147483648), NA_integer_, warning="coercion")
test(1730.3, as.integer("-2147483647"), -2147483647L)
test(1730.4, as.integer("-2147483648"), NA_integer_, warning="coercion")
test(1730.5, as.integer64("-2147483648"), as.integer64(-2147483648))
# Currently bit64 truncates to the extremes in character coercion. Don't test that in case bit64 changes in future.
# as.integer64("-9223372036854775808") == NA
# as.integer64("-9223372036854775999") == NA
# as.integer64("+9223372036854775808") == 9223372036854775807
# as.integer64("+9223372036854775999") == 9223372036854775807
DT = data.table( as.integer64(c(
"-9223372036854775807", # integer64 min 2^63-1
"+9223372036854775807", # integer64 max
"-9223372036854775806","+9223372036854775806", # 1 below extreme just to check
"0","-1","1",
"NA",NA,
"-2147483646", # 1 below extreme to check
"-2147483647", # smallest integer in R
"-2147483648", # NA_INTEGER == INT_MIN but valid integer64
"-2147483649",
"+2147483646", # positives as well just in case
"+2147483647",
"+2147483648",
"+2147483649"
)))
ans = c('"V1"',"-9223372036854775807","9223372036854775807","-9223372036854775806","9223372036854775806",
"0","-1","1","__NA__","__NA__",
"-2147483646","-2147483647","-2147483648","-2147483649",
"2147483646","2147483647","2147483648","2147483649")
test(1731.1, class(DT[[1L]]), "integer64")
test(1731.2, fwrite(DT,na="__NA__"), output=ans)
f = tempfile()
test(1731.3, fwrite(DT, f, na="__NA__"), NULL)
test(1731.4, readLines(f), ans)
unlink(f)
ans[1] = "V1" # the field is unquoted under `quote=FALSE`
test(1731.5, write.csv(DT,na="__NA__",row.names=FALSE,quote=FALSE), output=ans)
# write.csv works on integer64 because it calls bit64's as.character method
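# A minimal sketch of that dispatch (not run; assumes bit64 is attached as above):
if (FALSE) {
  x = as.integer64("9223372036854775806")
  as.character(x)   # "9223372036854775806"; bit64's as.character method keeps full precision
  as.numeric(x)     # ~9.223372e+18; coercion to double loses the trailing digits
}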
}
# fwrite(,quote='auto' and qmethod)
DT = data.table(x=c("fo,o", "foo", 'b"ar', NA, "", "NA"),
"ColName,WithComma"=1:6,
'Three\nLine\nColName'=c('bar\n', "noNeedToQuote", 'a\nlong\n"sentence"', "0000", " \n ", ' "\n '))
x = capture.output(fwrite(DT,na="NA",quote=TRUE, qmethod='escape'))
y = capture.output(write.table(DT,row.names=FALSE,quote=TRUE,sep=",",qmethod='escape'))
test(1732.1, x, y)
x = capture.output(fwrite(DT,na="NA",quote=TRUE,qmethod='double'))
y = capture.output(write.table(DT,row.names=FALSE,quote=TRUE,sep=",",qmethod='double'))
test(1732.2, x, y)
x = capture.output(fwrite(DT,na="NA",quote=FALSE))
y = capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))
test(1732.3, x, y)
f = tempfile()
fwrite(DT,f,quote='auto',qmethod='escape')
# write.csv / write.table don't do field-by-field quoting so can't compare to them.
ans = c('x,"ColName,WithComma","Three', 'Line', 'ColName"',
'"fo,o",1,"bar','"',
'foo,2,noNeedToQuote',
'"b\\"ar",3,"a', 'long', "\\\"sentence\\\"\"",
',4,0000',
'"",5," ',' "',
"NA,6,\" \\\"", " \"")
test(1732.4, readLines(f), ans)
fwrite(DT,f,quote='auto',qmethod='double')
ans[7] = '"b""ar",3,"a'
ans[9] = "\"\"sentence\"\"\""
ans[13] = "NA,6,\" \"\""
test(1732.5, readLines(f), ans)
DT = data.table(A=c("foo","ba,r","baz"), B=c("AA","BB","CC"), C=c("DD","E\nE","FF"))
test(1732.6, fwrite(DT, quote='auto'), output='A,B,C\nfoo,AA,DD\n"ba,r",BB,"E\nE"\nbaz,CC,FF')
unlink(f)
DT = data.table(A=c(NA, "NA", "", "monty"), B=c(5, 7, 0, NA))
test(1732.7, fwrite(DT, quote='auto'), output='A,B\n,5\nNA,7\n"",0\nmonty,')
test(1732.8, fwrite(DT, quote='auto', na="NA"), output='"A","B"\nNA,5\n"NA",7\n"",0\n"monty",NA')
# dec=","
test(1733.1, fwrite(data.table(pi),dec=","), error="dec != sep is not TRUE")
test(1733.2, fwrite(data.table(c(1.2,-8.0,pi,67.99),1:4),dec=",",sep=";"),
output="V1;V2\n1,2;1\n-8;2\n3,14159265358979;3\n67,99;4")
# fwrite implied and actual row.names
DT = data.table(foo=1:3,bar=c(1.2,9.8,-6.0))
test(1734.1, capture.output(fwrite(DT,row.names=TRUE,quote=FALSE)),
capture.output(write.csv(DT,quote=FALSE)))
test(1734.2, capture.output(fwrite(DT,row.names=TRUE,quote=TRUE)),
capture.output(write.csv(DT)))
test(1734.3, fwrite(DT,row.names=TRUE,quote='auto'), # same other than 'foo' and 'bar' column names not quoted
output="\"\",foo,bar\n\"1\",1,1.2\n\"2\",2,9.8\n\"3\",3,-6")
DF = as.data.frame(DT)
test(1734.4, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)),
capture.output(write.csv(DF,quote=FALSE)))
test(1734.5, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)),
capture.output(write.csv(DF)))
rownames(DF)[2] = "someName"
rownames(DF)[3] = "another"
test(1734.6, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)),
capture.output(write.csv(DF,quote=FALSE)))
test(1734.7, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)),
capture.output(write.csv(DF)))
# list columns and sep2
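# sep2 is c(prefix, separator, suffix) used to flatten list columns; the default is c("","|","")
# and the middle element must be a single character distinct from sep and dec, as the error tests below check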
set.seed(1)
DT = data.table(A=1:4,
B=list(1:10,15:18,7,9:10),
C=list(letters[19:23],c(1.2,2.3,3.4,pi,-9),c("foo","bar"),c(TRUE,TRUE,FALSE)))
test(1736.01, capture.output(fwrite(DT,logical01=FALSE)), c("A,B,C", "1,1|2|3|4|5|6|7|8|9|10,s|t|u|v|w",
"2,15|16|17|18,1.2|2.3|3.4|3.14159265358979|-9", "3,7,foo|bar", "4,9|10,TRUE|TRUE|FALSE"))
test(1736.02, fwrite(DT, sep2=","), error="length(sep2)")
test(1736.03, fwrite(DT, sep2=c("",",","")), error="sep.*,.*sep2.*,.*must all be different")
test(1736.04, fwrite(DT, sep2=c("","||","")), error="nchar.*sep2.*2")
test(1736.05, capture.output(fwrite(DT, sep='|', sep2=c("c(",",",")"), logical01=FALSE)), c("A|B|C", "1|c(1,2,3,4,5,6,7,8,9,10)|c(s,t,u,v,w)",
"2|c(15,16,17,18)|c(1.2,2.3,3.4,3.14159265358979,-9)", "3|c(7)|c(foo,bar)", "4|c(9,10)|c(TRUE,TRUE,FALSE)"))
test(1736.06, capture.output(fwrite(DT, sep='|', sep2=c("{",",","}"), logicalAsInt=TRUE)),
c("A|B|C", "1|{1,2,3,4,5,6,7,8,9,10}|{s,t,u,v,w}",
"2|{15,16,17,18}|{1.2,2.3,3.4,3.14159265358979,-9}", "3|{7}|{foo,bar}", "4|{9,10}|{1,1,0}"))
DT = data.table(A=c("foo","ba|r","baz"))
test(1736.07, capture.output(fwrite(DT,na="")), c("A","foo","ba|r","baz")) # no list column so no need to quote
test(1736.08, capture.output(fwrite(DT)), c("A","foo","ba|r","baz"))
DT = data.table(A=c("foo","ba|r","baz"), B=list(1:3,1:4,c("fo|o","ba,r","baz"))) # now list column and need to quote
test(1736.09, capture.output(fwrite(DT)), c("A,B", "foo,1|2|3", "\"ba|r\",1|2|3|4", "baz,\"fo|o\"|\"ba,r\"|baz"))
test(1736.10, capture.output(fwrite(DT,quote=TRUE)), c("\"A\",\"B\"", "\"foo\",1|2|3", "\"ba|r\",1|2|3|4", "\"baz\",\"fo|o\"|\"ba,r\"|\"baz\""))
# fwrite accepts any list of same-length vectors as input
test(1737.1, fwrite(list()), NULL, warning="fwrite was passed an empty list of no columns")
test(1737.2, fwrite(list(1.2)), output="1.2")
test(1737.3, fwrite(list(1.2,B="foo")), output=",B\n1.2,foo")
test(1737.4, fwrite(list("A,Name"=1.2,B="fo,o")), output="\"A,Name\",B\n1.2,\"fo,o\"")
test(1737.5, fwrite(list(1.2,B=c("foo","bar"))), error="Column 2's length (2) is not the same as column 1's length (1)")
# fwrite ITime, Date, IDate
DT = data.table(A=as.ITime(c("23:59:58","23:59:59","12:00:00","00:00:01",NA,"00:00:00")))
test(1738.1, capture.output(fwrite(DT)), c("A","23:59:58","23:59:59","12:00:00","00:00:01","","00:00:00"))
test(1738.2, capture.output(fwrite(DT,na="")), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE, na="")))
dts = c("1901-05-17","1907-10-22","1929-10-24","1962-05-28","1987-10-19","2008-09-15",
"1968-12-30","1968-12-31","1969-01-01","1969-01-02")
DT = data.table(A=as.Date(dts), B=as.IDate(dts))
test(1738.3, sapply(DT,typeof), c(A="double",B="integer"))
test(1738.4, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE)))
test(1738.5, as.integer(as.Date(c("0000-03-01","9999-12-31"))), c(-719468L,2932896L))
if (FALSE) {
# Full range takes too long for CRAN.
dts = seq.Date(as.Date("0000-03-01"),as.Date("9999-12-31"),by="day")
dtsCh = as.character(dts) # 36s
dtsCh = gsub(" ","0",sprintf("%10s",dtsCh)) # R does not 0 pad years < 1000
test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31")))
} else {
# test on CRAN a reduced but important range
dts = seq.Date(as.Date("1899-12-31"),as.Date("2100-01-01"),by="day")
dtsCh = as.character(dts)
test(1739.2, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01")))
}
DT = data.table(A=dts, B=as.IDate(dts))
test(1739.3, sapply(DT,typeof), c(A="double",B="integer"))
test(1739.4, typeof(dts), "double")
f = tempfile()
g = tempfile()
fwrite(DT,f)                                # 0.092s (timed on the full range above)
write.csv(DT,g,row.names=FALSE,quote=FALSE) # 65.250s (timed on the full range above)
test(1739.5, readLines(f), c("A,B",paste(dtsCh,dtsCh,sep=",")))
test(1739.6, readLines(f), readLines(g))
unlink(f)
unlink(g)
rm(list=c("dtsCh","dts"))
gc()
# dateTimeAs
DT = data.table(
A = as.Date(d<-c("1907-10-21","1907-10-22","1907-10-22","1969-12-31","1970-01-01","1970-01-01",
"1972-02-29","1999-12-31","2000-02-29","2016-09-12")),
B = as.IDate(d),
C = as.ITime(t<-c("23:59:59","00:00:00","00:00:01", "23:59:58", "00:00:00","00:00:01",
"12:00:00", "01:23:45", "23:59:59","01:30:30")),
D = as.POSIXct(dt<-paste(d,t), tz="UTC"),
E = as.POSIXct(paste0(dt,c(".999",".0",".5",".111112",".123456",".023",".0",".999999",".99",".0009")), tz="UTC"))
test(1740.0, fwrite(DT,dateTimeAs="iso"), error="dateTimeAs must be 'ISO','squash','epoch' or 'write.csv'")
test(1740.1, fwrite(DT,dateTimeAs=c("ISO","squash")), error="dateTimeAs must be a single string")
test(1740.2, capture.output(fwrite(DT,dateTimeAs="ISO")), c(
"A,B,C,D,E",
"1907-10-21,1907-10-21,23:59:59,1907-10-21T23:59:59Z,1907-10-21T23:59:59.999Z",
"1907-10-22,1907-10-22,00:00:00,1907-10-22T00:00:00Z,1907-10-22T00:00:00Z",
"1907-10-22,1907-10-22,00:00:01,1907-10-22T00:00:01Z,1907-10-22T00:00:01.500Z",
"1969-12-31,1969-12-31,23:59:58,1969-12-31T23:59:58Z,1969-12-31T23:59:58.111112Z",
"1970-01-01,1970-01-01,00:00:00,1970-01-01T00:00:00Z,1970-01-01T00:00:00.123456Z",
"1970-01-01,1970-01-01,00:00:01,1970-01-01T00:00:01Z,1970-01-01T00:00:01.023Z",
"1972-02-29,1972-02-29,12:00:00,1972-02-29T12:00:00Z,1972-02-29T12:00:00Z",
"1999-12-31,1999-12-31,01:23:45,1999-12-31T01:23:45Z,1999-12-31T01:23:45.999999Z",
"2000-02-29,2000-02-29,23:59:59,2000-02-29T23:59:59Z,2000-02-29T23:59:59.990Z",
"2016-09-12,2016-09-12,01:30:30,2016-09-12T01:30:30Z,2016-09-12T01:30:30.000900Z"))
test(1740.3, capture.output(fwrite(DT,dateTimeAs="squash")), c(
"A,B,C,D,E",
"19071021,19071021,235959,19071021235959000,19071021235959999",
"19071022,19071022,000000,19071022000000000,19071022000000000",
"19071022,19071022,000001,19071022000001000,19071022000001500",
"19691231,19691231,235958,19691231235958000,19691231235958111",
"19700101,19700101,000000,19700101000000000,19700101000000123",
"19700101,19700101,000001,19700101000001000,19700101000001023",
"19720229,19720229,120000,19720229120000000,19720229120000000",
"19991231,19991231,012345,19991231012345000,19991231012345999",
"20000229,20000229,235959,20000229235959000,20000229235959990",
"20160912,20160912,013030,20160912013030000,20160912013030000"))
test(1740.4, capture.output(fwrite(DT,dateTimeAs="epoch")), c(
"A,B,C,D,E",
"-22718,-22718,86399,-1962748801,-1962748800.001",
"-22717,-22717,0,-1962748800,-1962748800",
"-22717,-22717,1,-1962748799,-1962748798.5",
"-1,-1,86398,-2,-1.888888",
"0,0,0,0,0.123456",
"0,0,1,1,1.023",
"789,789,43200,68212800,68212800",
"10956,10956,5025,946603425,946603425.999999",
"11016,11016,86399,951868799,951868799.99",
"17056,17056,5430,1473643830,1473643830.0009"))
test(1741.1, attr(DT[[4]],"tzone"), "UTC")
test(1741.2, attr(DT[[5]],"tzone"), "UTC")
# Remove tzone attribute to make write.csv write in local time.
# That local time will vary on the boxes this test runs on, so we just compare to
# write.csv rather than fixed strings as above.
setattr(DT[[4]], "tzone", NULL)
setattr(DT[[5]], "tzone", NULL)
# options(digits.secs) controls how many fractional-second digits format(), and hence write.csv, prints for POSIXct
options(digits.secs=0)
test(1741.3, x1<-capture.output(fwrite(DT,dateTimeAs="write.csv")),
capture.output(write.csv(DT,row.names=FALSE,quote=FALSE)))
options(digits.secs=3)
test(1741.4, x2<-capture.output(fwrite(DT,dateTimeAs="write.csv")),
capture.output(write.csv(DT,row.names=FALSE,quote=FALSE)))
options(digits.secs=6)
test(1741.5, x3<-capture.output(fwrite(DT,dateTimeAs="write.csv")),
capture.output(write.csv(DT,row.names=FALSE,quote=FALSE)))
# check that extra digits made it into output
test(1741.6, sum(nchar(x1)) < sum(nchar(x2)) && sum(nchar(x2)) < sum(nchar(x3)))
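# Not-run sketch of the mechanism relied on above: options(digits.secs) determines how many
# fractional-second digits format(), and hence write.csv, prints for POSIXct values.
if (FALSE) {
  tt = as.POSIXct("2016-09-12 01:30:30.25", tz="UTC")
  old = options(digits.secs=0); format(tt)   # "2016-09-12 01:30:30"    fraction dropped
  options(digits.secs=6);       format(tt)   # "2016-09-12 01:30:30.25" fraction shown
  options(old)
}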
# fread should properly handle NA in colClasses argument #1910
test(1743.01, sapply(fread("a,b\n3,a", colClasses=c(NA, "factor")), class), c(a="integer", b="factor"))
test(1743.02, sapply(fread("a,b\n3,a", colClasses=c(NA, NA)), class), c(a="integer", b="character"))
test(1743.03, fread("a,b\n1,a", colClasses=c(NA, TRUE)), error="colClasses is.*logical.*but it has some TRUE or FALSE.*not allowed")
test(1743.04, fread("a,b\n1,a", colClasses=c("character", "factor")), data.table(a="1", b=factor("a")))
# and the length-1 character case; #4237
test(1743.041, fread("a,b\n1,a", colClasses=NA_character_), data.table(a=1L, b="a"))
test(1743.042, fread("a,b\n1,a", colClasses=""), data.table(a=1L, b="a"))
test(1743.043, fread("a\n1", colClasses=NA_character_), data.table(a=1L))
test(1743.044, fread("a\n1", colClasses=""), data.table(a=1L))
# Issue #1634: 'fread doesn't check colClasses to be valid type'
# Currently using BiocGenerics, which doesn't support USE.NAMES
## Date supported character/list colClasses
test(1743.05, sapply(fread("a,b\n2017-01-01,1", colClasses=c("Date", "integer")), class), c(a="Date", b="integer"))
test(1743.06, sapply(fread("a,b\n2017-01-01,1", colClasses=list(Date = 1L, integer = 2L)), class), c(a="Date", b="integer"))
test(1743.07, sapply(fread("a,b\n2017-01-01,2017-01-02", colClasses=list(Date = 1:2)), class), c(a="Date", b="Date"))
test(1743.08, sapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), c(a="Date", b="integer", c="complex"))
test(1743.09, sapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), c(a="Date", b="integer", c="complex"))
test(1743.10, sapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", NA)), class), c(a="Date",b="integer",c="complex",d="integer"))
test(1743.11, sapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", "raw")), class), c(a="Date",b="integer",c="complex",d="raw"))
test(1743.121, sapply(fread("a,b\n2015-01-01,2015-01-01", colClasses=c(NA,"IDate")), inherits, what="IDate"), c(a=TRUE, b=TRUE))
test(1743.122, fread("a,b\n2015-01-01,2015-01-01", colClasses=c("POSIXct","Date")), data.table(a=as.POSIXct("2015-01-01"), b=as.Date("2015-01-01")))
test(1743.123, fread("a,b\n1+3i,2015-01-01", colClasses=c(NA,"IDate")), data.table(a="1+3i", b=as.IDate("2015-01-01")))
## Attempting to impose an incompatible colClasses is a warning (not an error)
## and does not change the values in the column
test(1743.13, sapply(fread("a,b\n09/05/98,2015-01-01", colClasses = "Date"), class), y=c(a="character", b="Date"), warning="standard unambiguous format")
## Just invalid
test(1743.14, sapply(fread("a,b\n2017-01-01,1", colClasses=c("foo", "integer")), class), c(a="character", b="integer"), warning="[nN]o method .*for .*foo")
test(1743.15, sapply(fread("a,b\n2017-01-01,1", colClasses=c("foo", "integer")), class), c(a="character", b="integer"), warning="the column has been left as type .*character")
test(1743.16, sapply(fread("a,b\n2017-01-01,2", colClasses=list(foo=1)), class), c(a="character", b="integer"), warning="the column has been left as type .*character")
## When colClasses is a list, it should still work if factor is a name of multiple elements.
test(1743.17, sapply(fread("a,b\nc,d", colClasses = list(factor = 1L, factor = 2L)), class), c(a="factor", b="factor"))
test(1743.18, sapply(fread("a,b,c,d\nw,x,y,z", colClasses = list(factor = 1L, factor = 2L, factor = "c")), class), c(a="factor", b="factor", c ="factor", d="character"))
## When colClasses is used with select, integers in colClasses refer to the original column order
test(1743.191, sapply(fread("a,b,c\nd,e,f", colClasses = list(Date = 1L), select = 2:3), class), y = c(b="character", c="character"))
test(1743.192, sapply(fread("a,b,c,d\n2,2,0f,x", colClasses = list(raw = c(1L, 3L), Date = 4), select = 3:2), class), y = c(c="raw", b="integer"))
test(1743.193, sapply(fread("a,b,c,d\n2,2,0f,x", colClasses = list(raw = c(1L, 3L), Date = 4), drop = c(1, 4)), class), y = c(b="integer", c="raw"))
test(1743.194, sapply(fread("a,b,c,d\n2,2,0f,x", colClasses = list(raw = c("a", "c"), Date = "d"), drop = c(1L, 4L)), class), y = c(b="integer", c="raw"))
test(1743.195, sapply(fread("a,b,c,d\n2,2,0f,x", colClasses = list(raw = c("a", "c"), Date = "d"), select = c(2L, 3L)), class), y = c(b="integer", c="raw"))
test(1743.196, sapply(fread("a,b,c,d\n2,0+1i,2,x", colClasses = list(raw = c("a", "c"), complex = "b", Date = "d"), select = c(2L, 3L)), class), y = c(b="complex", c="raw"))
# colClasses in select; #1426
test(1743.197, fread("A,B,C,D\nA,B,X,4", select=c(1,4,3,2), colClasses=c("factor","factor","character","integer")), ans<-data.table(A=factor("A"), D=4L, C="X", B=factor("B")))
test(1743.198, fread("A,B,C,D\nA,B,X,4", select=list(c(1,4,3,2), c("factor","integer","character","factor"))), error="select= is type list but has no names")
test(1743.199, fread("A,B,C,D\nA,B,X,4", select=c(A="factor", D="integer", C="character", B="factor")), ans)
test(1743.200, fread("A,B,C,D\nA,B,X,4", select=list(factor="A", integer=4, character="C", factor=2)), ans) # all 4 columns but in different order by list form of select
test(1743.201, fread("A,B,C,D\nA,B,X,4", select=list(factor="A"), colClasses="character"), error="select= is type list.*but colClasses= has been provided as well. Please remove colClasses.")
test(1743.202, fread("A,B,C,D\nA,B,X,4", select=c(A="factor"), colClasses="character"), error="select= is a named vector.*but colClasses= has been provided as well. Please remove colClasses=.")
test(1743.203, fread("A,B,C,D\nA,B,X,4", select=list(character="D", factor="B")), data.table(D="4", B=factor("B")))
test(1743.204, fread("A,B,C,D\nA,B,X,4", select=list(character=4, character=2)), data.table(D="4", B="B"))
## factors
test(1743.211, sapply(fread("a,b,c\n2,2,f", colClasses = list(factor = 1L), select = 2:3), class), y = c(b="integer", c="character"))
test(1743.212, sapply(fread("a,b,c\n2,2,f", colClasses = list(factor = c(1L, 3L)), select = 2:3), class), y = c(b="integer", c="factor"))
## concordance with #1445 - respect select's order
test(1743.213, sapply(fread("a,b,c\n2,2,f", colClasses = list(factor = c(1L, 3L)), select = 3:2), class), y = c(c="factor", b="integer"))
test(1743.214, sapply(fread("a,b,c,d\n2,2,f,x", colClasses = list(factor = c(1L, 3L), Date = 4), select = 3:2), class), y = c(c="factor", b="integer"))
test(1743.215, sapply(fread("a,b,c\n2,2,f", colClasses = c("integer", "integer", "factor"), select=2:3), class), y = c(b="integer", c="factor"))
test(1743.216, sapply(fread("a,b\n1,x", colClasses = list(factor = "b"), select = "b"), class), c(b = "factor"))
test(1743.217, sapply(fread("a,b,c,d,e,f\na,b,c,d,e,f", colClasses = list(factor = c(1, 2, 4), factor = 3), select = c(5, 6, 2, 4)), class), y = c(e = "character", f = "character", b = "factor", d = "factor"))
test(1743.218, sapply(fread("a,b,c,d,e,f\na,b,c,d,e,f", colClasses = list(factor = c(1, 2, 4), factor = 3), select = c(5, 4, 2, 3)), class), y = c(e = "character", d = "factor", b = "factor", c = "factor"))
test(1743.22, fread("a,b,c\n1999/01/01,2,f", colClasses=list(Date=1L), drop="a"), data.table(b=2L, c="f"))
test(1743.231, fread("a,b,c\n2,1,4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c="4i"), warning="NAs introduced by coercion.*left as type 'character'")
test(1743.232, fread("a,b,c\n2,1,3+4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c=3+4i))
test(1743.241, fread("a,b,c\n2,2,f", colClasses = list(character="c", integer="b"), drop="a"), data.table(b=2L, c="f"))
test(1743.242, fread("a,b,c\n2,2,f", colClasses = c("integer", "integer", "factor"), drop="a"), data.table(b=2L, c=factor("f")))
## POSIXct
tt = Sys.getenv("TZ", unset=NA)
TZnotUTC = !identical(tt,"") && !is_utc(tt)
if (TZnotUTC) {
# from v1.13.0 these tests work when running under non-UTC because they compare to as.POSIXct, which reads these unmarked datetimes in local time
# the new tests 2150.* cover more cases
test(1743.25, fread("a,b,c\n2015-06-01 11:00:00,1,ae", colClasses=c("POSIXct","integer","character")), data.table(a=as.POSIXct("2015-06-01 11:00:00"),b=1L,c="ae"))
test(1743.26, fread("a,b,c,d,e,f,g,h\n1,k,2015-06-01 11:00:00,a,1.5,M,9,0", colClasses=list(POSIXct="c", character="b"), drop=c("a","b"), logical01=TRUE),
ans<-data.table(c=as.POSIXct("2015-06-01 11:00:00"), d="a", e=1.5, f="M", g=9L, h=FALSE))
test(1743.27, fread("a,b,c,d,e,f,g,h\n1,k,2015-06-01 11:00:00,a,1.5,M,9,0", colClasses=list(POSIXct="c", character=2), drop=c("a","b"), logical01=TRUE),
ans)
}
## raw same behaviour as read.csv
test(1743.28, sapply(fread("a,b\n05,05", colClasses = c("raw", "integer")), class), sapply(read.csv(text ="a,b\n05,05", colClasses = c("raw", "integer")), class))
test(1743.29, sapply(fread("a,b\n05,05", colClasses = list(raw = 1L)), class), sapply(read.csv(text = "a,b\n05,05", colClasses = c("raw", "integer")), class))
## colClasses Examples
data1743 = "A,B,C,D\n1,3,5,7\n2,4,6,8\n"
test(1743.301, fread(data1743, colClasses=c("B"="NULL","C"="NULL")), ans<-data.table(A=1:2, D=7:8))
test(1743.302, fread(data1743, colClasses=list(NULL=c("B","C"))), ans)
test(1743.303, fread(data1743, drop=c("B","C")), ans)
test(1743.3041, fread(data1743, drop=2:3), ans)
test(1743.3042, fread(data1743, drop=c(2,NA,3)), ans, warning="drop[2] is NA")
test(1743.3043, fread(data1743, colClasses=list(NULL=c(2,NA,3))), ans, warning="colClasses[[1]][2] is NA")
test(1743.305, fread(data1743, colClasses=c("integer", "NULL", "NULL", "integer")), ans)
test(1743.306, fread(data1743, colClasses=c("integer", "NULL", "NULL", "integer"), drop=4), data.table(A=1:2))
test(1743.3071, fread(data1743, colClasses=list(NULL=c("C","D"), NULL=1:2)), data.table(NULL))
test(1743.3072, fread(data1743, colClasses=list(NULL=c("C","D"), NULL=1)), data.table(B=3:4))
test(1743.308, fread(data1743, colClasses=list(NULL=c("C","D")), drop=1:2), data.table(NULL))
test(1743.311, fread(data1743, colClasses="NULL"), ans<-data.table(A=1:2, B=3:4, C=5:6, D=7:8), warning="colClasses.*quoted.*interpreted as colClasses.*NULL")
test(1743.312, fread(data1743, colClasses=character()), ans)
test(1743.32, fread("A,B\na,0+1i", colClasses="complex"), data.table(A="a", B=1i),
warning="Column 'A' was requested to be 'complex'.*NAs introduced by coercion.*column has been left as.*character")
test(1743.33, fread(data1743, colClasses=list("character"=4, "numeric"=c(2,NA,1))), data.table(A=c(1,2), B=c(3,4), C=5:6, D=c("7","8")), warning="colClasses[[2]][2] is NA")
test(1743.34, fread(data1743, select=list("character"=4, "numeric"=c(2,NA,1))), data.table(D=c("7","8"), B=c(3,4), A=c(1,2)), warning="colClasses[[2]][2] is NA")
old = options(warn=2)
test(1743.35, fread(data1743, select=list("character"=4, "numeric"=c(2,NA,1))), error="colClasses[[2]][2] is NA")
options(old)
# stringsAsFactors = double; #2025
fwrite(data.table(V1 = sample(letters, size=26*10, replace=TRUE),
V2 = paste0("A", c(1:259, 1)),
V3 = sample.int(26, n=260, replace=TRUE)), f<-tempfile(fileext = ".csv"))
test(1743.411, class(fread(f)[[1]]), "character")
test(1743.412, class(fread(f, stringsAsFactors=0)[[1]]), "character")
test(1743.413, class(fread(f, stringsAsFactors=0.10, verbose=TRUE)[[1]]), "character", output="stringsAsFactors=0.1 converted 0 column")
test(1743.414, class(fread(f, stringsAsFactors=0.11, verbose=TRUE)[[1]]), "factor", output="stringsAsFactors=0.11 converted 1 column")
test(1743.415, class(fread(f, stringsAsFactors=0.12)[[1]]), "factor")
test(1743.416, class(fread(f, stringsAsFactors=1.00)[[1]]), "factor")
unlink(f)
# rolling join stopped working for double with fractions, #1904
DT = data.table(A=c(1999.917,2000.417,2000.917,2001.417,2001.917))
setkey(DT,A)
x = c(2000.167,2000.417,2000.667,2000.917,2001.167)
test(1744.1, DT[.(x),roll=FALSE,which=TRUE], INT(NA,2,NA,3,NA))
test(1744.2, DT[.(x),roll=TRUE, which=TRUE], INT(1,2,2,3,3))
test(1744.3, DT[.(x),roll=1/12, which=TRUE], INT(NA,2,NA,3,NA))
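# roll=1/12 rolls the prevailing observation forward by at most 1/12, so the ~0.25 gaps
# above are not bridged and only the exact matches (items 2 and 4 of x) are found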
# 0's at the end of a non-empty subset of empty DT, #1937
test(1745.1, data.table(a=character(0))[c(1,0)], data.table(a=NA_character_))
test(1745.2, data.table(a=numeric(0))[c(1,0)], data.table(a=NA_real_))
test(1745.3, data.table(a=integer(0))[c(1,0)], data.table(a=NA_integer_))
# Long-standing crash when by=.EACHI, nomatch=0, the first item in i has no match,
# AND j has a function call that is passed a key column, #1933.
DT = data.table(A=letters[1:5],B=1:5,key="A")
ids = c("p","q","r","c","s","d")
test(1746.1, DT[ids, A, by=.EACHI, nomatch=0], data.table(A=c("c","d"),A=c("c","d"))) # was always ok
test(1746.2, DT[ids, print(A), by=.EACHI, nomatch=0], # reliable crash in v1.9.6 and v1.9.8
data.table(A=character(0)), output="\"c\".*\"d\"")
test(1746.3, DT[ids, {print(A);A}, by=.EACHI, nomatch=0], # reliable crash in v1.9.6 and v1.9.8
data.table(A=c("c","d"),V1=c("c","d")), output="\"c\".*\"d\"")
# combining on= with by= and keyby=, #1943
freshDT = data.table(x = rep(c("a", "b"), each = 4), y = 1:0, z = c(3L, 6L, 8L, 5L, 4L, 1L, 2L, 7L))
DT = copy(freshDT)
test(1747.01, DT["b", max(z), by = y, on = "x"], ans1<-data.table(y=1:0, V1=c(4L,7L)))
test(1747.02, DT["b", max(z), keyby = y, on = "x"], ans2<-data.table(y=0:1, V1=c(7L,4L), key="y"))
test(1747.03, DT[x=="b", max(z), by = y], ans1)
test(1747.04, DT[x=="b", max(z), keyby = y], ans2)
DT = copy(freshDT) # to clear any auto indexes
test(1747.05, DT[x=="b", max(z), by = y], ans1)
test(1747.06, DT[x=="b", max(z), keyby = y], ans2)
setkey(DT, x)
test(1747.07, DT["b", max(z), by = y], ans1)
test(1747.08, DT["b", max(z), keyby = y], ans2)
DT = copy(freshDT) # and again without the == having run before the setkey
setkey(DT, x)
test(1747.09, DT["b", max(z), by = y], ans1)
test(1747.10, DT["b", max(z), keyby = y], ans2)
DT = as.data.table(mtcars[mtcars$cyl %in% c(6, 8), c("am", "vs", "hp")])
test(1748.1, DT[.(0), max(hp), by = vs, on = "am"], ans1<-data.table(vs=c(1,0), V1=c(123,245)))
test(1748.2, DT[.(0), max(hp), keyby = vs, on = "am"], ans2<-data.table(vs=c(0,1), V1=c(245,123), key="vs"))
DT = as.data.table(mtcars[mtcars$cyl %in% c(6, 8), c("am", "vs", "hp")])
test(1748.3, DT[am==0, max(hp), by=vs], ans1)
test(1748.4, DT[am==0, max(hp), keyby=vs], ans2)
# indices() can return list of vectors, #1589
DT = data.table(A=5:1,B=letters[5:1])
setindex(DT)
setindex(DT, A)
setindex(DT, B)
indices(DT, vectors = TRUE)
test(1749.1, indices(DT), c("A__B","A","B"))
test(1749.2, indices(DT, vectors = TRUE), list(c("A","B"),"A","B"))
# Grouping Sets #1377
n = 24L
set.seed(25)
dt <- data.table(
color = sample(c("green","yellow","red"), n, TRUE),
year = as.Date(sample(paste0(2011:2015,"-01-01"), n, TRUE)),
status = as.factor(sample(c("removed","active","inactive","archived"), n, TRUE)),
amount = sample(1:5, n, TRUE),
value = sample(c(3, 3.5, 2.5, 2), n, TRUE)
)
test(1750.01, # empty input gets grand total only when asked in `sets` with `character()`
groupingsets(dt[0L], j = sum(value), by = c("color","year","status"), sets=list(c("color"))),
data.table(color=character(), year=as.Date(NA)[-1L], status=factor(), V1=numeric())
)
test(1750.02, # empty input gets grand total non-NA, if asked. Was affected by as.factor(NA) in R 2.15.0
groupingsets(dt[0L], j = sum(value), by = c("color","year","status"), sets=list(c("color"), character())),
data.table(color=NA_character_, year=as.Date(NA), status=as.factor(NA), V1=0)
)
test(1750.03, # empty input non-NA grand total, also retain classes and aggregation level
groupingsets(dt[0L], j = lapply(.SD, sum), by = c("color","year","status"), sets=list(c("color"), character()), id=TRUE),
data.table(grouping=7L, color=NA_character_, year=as.Date(NA), status=as.factor(NA), amount=0L, value=0)
)
test(1750.04, # `sets=list()` produces 0 rows; for the grand total use `sets=list(character())` - tested above
nrow(groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("year","status"), .SDcols=c("amount","value"), sets=list(), id=TRUE)),
0L
)
test(1750.05, # `by` must have unique column names
groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("year","status","year"), .SDcols=c("amount","value"), sets=list("year"), id=TRUE),
error = "Argument 'by' must have unique column names"
)
test(1750.06, # 0 ncol `x`
groupingsets(data.table(), j = c(list(cnt=.N), lapply(.SD, sum)), by = c("year","status"), .SDcols=c("amount","value"), sets=list(c("year")), id=TRUE),
error = "Argument 'x' is a 0-column data.table; no measure to apply grouping over."
)
test(1750.07, # 0 length `by`, must also use `sets=list()`, so 0L rows result
nrow(groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = character(), .SDcols=c("amount","value"), sets=list(), id=TRUE)),
0L
)
test(1750.08, all( # for any single row of the dataset, the aggregate result should be the same at every level of grouping
sapply(seq_len(nrow(dt)), function(i) uniqueN(
groupingsets(dt[i], j = lapply(.SD, sum), by = c("color","year","status"), sets=list(c("color","year","status"), c("year"), c("status"), character())),
by=c("amount","value")
)) == 1L
), TRUE)
# aggregated totals are identical for every grouping id
r = groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=list(c("color","year","status"), c("year"), c("status"), character()), id=TRUE)
test(1750.09, uniqueN(
r[, lapply(.SD, sum), by = "grouping", .SDcols = c("cnt","amount","value")],
by = c("cnt","amount","value")
), 1L)
# groupingsets grouping by 'value' still possible
r = groupingsets(dt, j = sum(amount), by = c("color","year","status","value"), sets=list(c("color","year","status"), c("year"), c("status"), character()))
test(1750.10,
sapply(r, class),
c("color"="character","year"="Date","status"="factor","value"="numeric","V1"="integer")
)
# groupingsets on aggregate using grouping col char type and sum - error
test(1750.11,
groupingsets(dt, j = lapply(.SD, sum), by = c("status","year"), sets=list(character()), .SDcols="color"),
error = "invalid 'type' (character) of argument"
)
# groupingsets on aggregate using grouping col factor type and sum - error
test(1750.12,
groupingsets(dt, j = lapply(.SD, sum), by = c("color","year"), sets=list(character()), .SDcols="status"),
error = "not meaningful for factors"
)
# groupingsets on aggregate using grouping col char type and length, match on all subtotals
r = groupingsets(dt, j = lapply(.SD, length), by = c("status","year"), sets=list(c("year"), c("status","year"), character()), .SDcols="color", id=TRUE)
test(1750.13, uniqueN(
r[, lapply(.SD, sum), by = "grouping", .SDcols = c("color")],
by = c("color")
), 1L)
# groupingsets with a column listed twice: as a measure and as a grouping column
test(1750.14,
groupingsets(dt, j = lapply(.SD, sum), by = c("color","amount"), sets=list(c("color"), c("color","amount")), .SDcols="amount", id=TRUE),
error = "There exists duplicated column names in the results"
)
test(1750.15,
groupingsets(dt, j = .(color = sum(value)), by = c("color","amount"), sets=list(c("color"), c("color","amount")), id=TRUE),
error = "There exists duplicated column names in the results"
)
# a set equal to `character(0)` should return the grand total
sets = list(character())
test(1750.16,
groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=sets, id=TRUE),
dt[, c(list(cnt=.N), lapply(.SD, sum)), .(grouping=rep(7L,n), color=rep(NA_character_,n), year=rep(as.Date(NA),n), status=as.factor(rep(NA_character_,n)))]
)
# duplicate entries in `sets` vector-wise
sets = list("color", c("color","year","status","year","status"), "year", character())
test(1750.17,
groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=sets, id=TRUE),
error = "Character vectors in 'sets' list must not have duplicated column names within a single grouping set."
)
# duplicate entries in `sets` cause double counting - aggregate the not-double-counted rows (`grouping!=5L`) and compare to the double-counted values at `grouping==5L`; double counting is the expected result for this unexpected usage
sets = list("year", c("color","year"), "year", character())
test(1750.18, uniqueN({
r <- groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=sets, id=TRUE)
r[, lapply(.SD, sum), by = .(double_counting = grouping==5L, double_counting = grouping!=5L), .SDcols = c("cnt","amount","value")]
}, by = c("cnt","amount","value")
), 1L, warning = "'sets' contains a duplicate")
# duplicate entries in `sets` but reordered - double counting on `grouping==1L`
sets = list(c("color","year"), "year", c("year","color"), character())
test(1750.19, uniqueN({
r <- groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=sets, id=TRUE)
r[, lapply(.SD, sum), by = .(double_counting = grouping==1L, double_counting = grouping!=1L), .SDcols = c("cnt","amount","value")]
}, by = c("cnt","amount","value")
), 1L, warning = "'sets' contains a duplicate")
# entries in `by` / `sets` that do not exist in the data.table
test(1750.20, exists("notexist"), FALSE) # https://github.com/Rdatatable/data.table/issues/3055#issuecomment-423364960
test(1750.21, groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","notexist"), sets=list(c("color"), character()), id=TRUE), error = "object 'notexist' not found")
test(1750.22, groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=list(c("color"), "stat"), id=TRUE), error = "Columns used in 'sets' but not present in 'by': [stat]")
test(1750.23, groupingsets(dt, j = .(a=sum(notexist)), by = c("color","year","status"), sets=list(c("color"), character()), id=TRUE), error = "object 'notexist' not found")
# update by ref `:=` forbidden
test(1750.24,
groupingsets(dt, j = sum_value := sum(value), by = c("color","year","status"), sets=list(c("color"), character())),
error = "Expression passed to grouping sets function must not update by reference."
)
# rollup
sets = local({
by=c("color","year","status")
lapply(length(by):0, function(i) by[0:i])
})
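# e.g. with by=c("color","year","status") this builds the rollup prefixes:
# list(c("color","year","status"), c("color","year"), "color", character(0))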
test(1750.31,
rollup(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), id=TRUE),
groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=sets, id=TRUE)
)
sets = local({
by=c("year","status")
lapply(length(by):0, function(i) by[0:i])
})
test(1750.32,
rollup(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("year","status"), .SDcols=c("amount","value"), id=TRUE),
groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("year","status"), .SDcols=c("amount","value"), sets=sets, id=TRUE)
)
# cube
test(1750.33,
cube(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), id=TRUE),
groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"),
sets = list(c("color", "year", "status"),
c("color", "year"),
c("color", "status"),
"color",
c("year", "status"),
"year",
"status",
character(0)),
id = TRUE)
)
test(1750.34,
cube(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("year","status"), .SDcols=c("amount","value"), id=TRUE),
groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("year","status"), .SDcols=c("amount","value"),
sets = list(c("year","status"),
"year",
"status",
character(0)),
id = TRUE)
)
# grouping sets with integer64
if (test_bit64) {
set.seed(26)
dt[, c("id1","id2") := list(as.integer64(sample(sample(n, n/4), n, TRUE)), as.integer64(sample(sample(n, n/2), n, TRUE)))]
# int64 as grouping cols
r = groupingsets(dt, j = lapply(.SD, sum), by = c("color","id1","id2"), sets=list(c("color","id1"), c("color","id1","id2"), "id2", c("id1","id2"), "color", character()), .SDcols=c("amount","value"), id=TRUE)
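# 'grouping' is a bitmask over the 'by' columns (leftmost column = most significant bit) with a 1
# where that column is aggregated away: 7L = 111 = grand total, 3L = 011 = by color,
# 6L = 110 = by id2, 4L = 100 = by id1,id2 -- matching the subsets taken below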
test(1750.41, # grand total
r[grouping==7L, .(color, id1, id2, amount, value)],
dt[, lapply(.SD, sum), .(color=rep(NA_character_, n), id1=as.integer64(rep(NA,n)), id2=as.integer64(rep(NA,n))), .SDcols=c("amount","value")]
)
test(1750.42, # by color
r[grouping==3L, .(color, id1, id2, amount, value)],
dt[, lapply(.SD, sum), .(color, id1=as.integer64(rep(NA,n)), id2=as.integer64(rep(NA,n))), .SDcols=c("amount","value")]
)
test(1750.43, # by id2
r[grouping==6L, .(color, id1, id2, amount, value)],
dt[, lapply(.SD, sum), .(color=rep(NA_character_,n), id1=as.integer64(rep(NA,n)), id2), .SDcols=c("amount","value")]
)
test(1750.44, # by id1, id2
r[grouping==4L, .(color, id1, id2, amount, value)],
dt[, lapply(.SD, sum), .(color=rep(NA_character_,n), id1, id2), .SDcols=c("amount","value")]
)
# int64 as measure cols
r = groupingsets(dt, j = lapply(.SD, sum), by = c("color","status"), sets=list(c("color","status"), "status", character()), .SDcols=c("amount","value","id1","id2"), id=TRUE)
test(1750.45, # grand total. # was affected by as.factor(NA) in R 2.15.0
r[grouping==3L, .(color, status, amount, value, id1, id2)],
dt[, lapply(.SD, sum), .(color=rep(NA_character_,n), status=as.factor(rep(NA_character_,n))), .SDcols=c("amount","value","id1","id2")]
)
test(1750.46, # by status
r[grouping==2L, .(color, status, amount, value, id1, id2)],
dt[, lapply(.SD, sum), .(color=rep(NA_character_,n), status), .SDcols=c("amount","value","id1","id2")]
)
# int64 as grouping and measure cols
r = groupingsets(dt, j = lapply(.SD, sum), by = c("color","id1"), sets=list(c("color","id1"), "id1", character()), .SDcols=c("amount","value","id2"), id=TRUE)
test(1750.47, # grand total
r[grouping==3L, .(color, id1, amount, value, id2)],
dt[, lapply(.SD, sum), .(color=rep(NA_character_,n), id1=as.integer64(rep(NA,n))), .SDcols=c("amount","value","id2")]
)
test(1750.48, # by id1
r[grouping==2L, .(color, id1, amount, value, id2)],
dt[, lapply(.SD, sum), .(color=rep(NA_character_,n), id1), .SDcols=c("amount","value","id2")]
)
}
# end Grouping Sets
# for completeness, added test for NA problem to close #1837.
DT = data.table(x=NA)
test(1751.1, capture.output(fwrite(DT, verbose=FALSE)), c("x",""))
test(1751.2, capture.output(fwrite(DT,na="",verbose=FALSE)), c("x",""))
test(1751.3, capture.output(fwrite(DT,na="NA",verbose=FALSE)), c("\"x\"","NA"))
test(1751.4, fread({fwrite(DT, f<-tempfile());f}), DT) # the important thing
unlink(f)
if (test_nanotime) {
old = options(warnPartialMatchArgs=FALSE) # option off temporarily pending https://github.com/eddelbuettel/nanotime/pull/49
DT = data.table(A=nanotime(tt<-c("2016-09-28T15:30:00.000000070Z",
"2016-09-29T23:59:00.000000001Z",
"2016-09-29T23:59:00.000000999Z",
"1970-01-01T00:01:01.000001000Z",
"1970-01-01T00:00:00.000000000Z",
"1969-12-31T23:59:59.999999999Z",
"1969-12-31T23:59:59.000000089Z",
"1969-12-31T12:13:14.000000000Z",
"1969-12-31T12:13:14.999999999Z",
"1969-12-31T12:13:14.000000001Z",
"1967-03-15T00:00:00.300000002Z",
"1967-03-15T23:59:59.300000002Z")))
options(old)
test(1752, capture.output(fwrite(DT, verbose=FALSE))[-1], tt)
}
# check too many fields error from ,\n line ending highlighted in #2044
test(1753.1, fread("X,Y\n1,2\n3,4\n5,6"), data.table(X=INT(1,3,5),Y=INT(2,4,6)))
test(1753.2, fread("X,Y\n1,2\n3,4,\n5,6",logical01=TRUE), ans<-data.table(X=TRUE,Y=2L), warning="Stopped.*line 3. Expected 2 fields but found 3.*discarded.*<<3,4,>>")
test(1753.3, fread("X,Y\n1,2\n3,4,7\n5,6",logical01=TRUE), ans, warning="Stopped.*line 3. Expected 2 fields but found 3.*discarded.*<<3,4,7>>")
# issue #2051 where a quoted field itself contains a '"'. The new quote rule detection handles it.
if (test_R.utils) test(1753.4, fread(testDir("issue_2051.csv.gz"))[2,grep("^Our.*tool$",COLUMN50)], 1L)
# check omp critical around SET_STRING_ELT
# minimal construction big enough for parallelism with 8 or fewer threads. On a machine with more, do setDTthreads(8) first, otherwise
# it'll switch back to single-threaded. In dev I have 8 threads and on CRAN there are 2, which will run this test nicely.
# Without the critical in fread.c, this test will crash R consistently which is correct (tested).
# Needs to be run in a fresh R session, otherwise it will work just because these strings have been seen before and added to
# the global character cache already.
# Created with:
# x = unlist(outer(outer(LETTERS,LETTERS,paste0),LETTERS,paste0))
# dt = as.data.table(matrix(x, ncol=2))
# setnames(dt,paste0("col",1:2))
# fwrite(dt,"allchar.csv")
# fwrite(dt,"allchar.csv",append=TRUE)
if (test_R.utils) test(1754, fread(testDir("allchar.csv.gz"))[c(1,2,17575,17576),col2], c("AAN","BAN","YZZ","ZZZ"))
# unescaped embedded quotes from here: http://stackoverflow.com/questions/42939866/fread-multiple-separators-in-a-string
test(1755, fread(testDir("unescaped.csv"), logical01=TRUE),
data.table(No =c(FALSE,TRUE),
Comment=c('he said:"wonderful."', 'The problem is: reading table, and also "a problem, yes." keep going on.'),
Type =c('A','A')),
warning="resolved improper quoting" )
# test duplicated colClasses
txt = "A,B,C,D\n1,3,5,7\n2,4,6,8\n"
test(1756.1, fread(txt), data.table(A=1:2, B=3:4, C=5:6, D=7:8))
test(1756.2, fread(txt, colClasses=list('numeric'=c(1,3))), ans<-data.table(A=as.double(1:2), B=3:4, C=as.double(5:6), D=7:8))
test(1756.3, fread(txt, colClasses=list('numeric'=c(1,3,1))), ans, warning="Column 1 ('A') appears more than once in colClasses. The second time is colClasses[[1]][3]")
test(1756.4, fread(txt, colClasses=list('numeric'=c(1,3),'character'=2)), ans<-data.table(A=as.double(1:2), B=c("3","4"), C=as.double(5:6), D=7:8))
test(1756.5, fread(txt, colClasses=list('numeric'=c(1,3),'character'=2:3)), ans, warning="Column 3 ('C') appears more than once in colClasses. The second time is colClasses[[2]][2]")
# Windows \r\n line endings when using multiple threads and detecting type within quoted fields, #2087
if (test_R.utils) {
test(1757, fread(testDir("winallquoted.csv.bz2"))[c(1,2,4998,4999)],
data.table(station_id=2L, bikes_available=c(2L,2L,11L,11L), docks_available=c(25L,25L,16L,16L),
time=c("2013/08/29 12:06:01","2013/08/29 12:07:01","2013/09/02 08:48:01","2013/09/02 08:50:01")))
}
test(1758, sapply(fread("A,B\n,"),class), c(A="logical",B="logical"))
# Slowdown when parallel and all columns are character, #2091. This test checks that the data loads ok, but we need a
# performance test environment to make sure the slowdown doesn't come back. Fix was to delay the call to SET_.
if (test_R.utils) test(1759, fread(testDir("alluniquechar.csv.gz"))[c(1,2,499,500)],
data.table(A=c("jptokakysooopwtmlkeimzbgpeinhy","bchguwmynjhecsxpxldyzlemavmwvz",
"avlyclruzkazfqhyxnppaafwcveolb","dkmyfqhltlwzwwxyvshwrzrdmfyqdm"),
B=c("jptokakysooopwtmlkei","bchguwmynjhecsxpxldy","avlyclruzkazfqhyxnpp","dkmyfqhltlwzwwxyvshw"),
C=c("kakysooopwt","uwmynjhecsx","clruzkazfqh","fqhltlwzwwx"),
D=c("pt","ch","vl","km"),
E=c("i","y","p","w"),
F=c("kyso","mynj","ruzk","hltl"),
G=c("ptokakysooopwtmlkeimz","chguwmynjhecsxpxldyzl","vlyclruzkazfqhyxnppaa","kmyfqhltlwzwwxyvshwrz"),
H=c("tokakysooopwtmlkeimzbgpein","hguwmynjhecsxpxldyzlemavmw",
"lyclruzkazfqhyxnppaafwcveo","myfqhltlwzwwxyvshwrzrdmfyq")))
# fread should use multiple threads on single column input.
# tests 2 threads; the very reasonable limit on CRAN
# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently)
if (getDTthreads() == 1L) {
cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. under UBSAN and ASAN)\n")
} else {
N = if (TRUE) 2e6 else 1e9 # offline speed check
fwrite(data.table(A=sample(10,N,replace=TRUE)), f<-tempfile())
test(1760.1, file.info(f)$size > 4*1024*1024)
test(1760.2, fread(f, verbose=TRUE, nThread=2), output="using 2 threads")
unlink(f)
}
# fread single column with superfluous fill=TRUE, #2118
test(1761.1, fread("1\n2\n3", fill=TRUE), data.table(V1=1:3))
test(1761.2, fread("1\n2\n3", fill=FALSE), data.table(V1=1:3))
# no error when j is present but empty (j = {}), #2142
DT = data.table(a = 1:5)
test(1762, DT[ , {}], NULL)
# rbindlist empty items segfault, #2019
x = list(list(a = 1), list(), list(a = 2))
ans = data.table(id=c(1L,3L),a=c(1,2))
for (i in 1:100) test(1763+i/1000, rbindlist(x, idcol="id"), ans)
# as.ITime(character(0)) used to fail, #2032
test(1764.1, format(as.ITime(character(0))), character(0))
# Edge case from #2171
test(1764.2, format(structure(NA_integer_, class = "ITime")), NA_character_)
# IDateTime error when tzone is NULL, #1973
x = as.POSIXct('2017-03-17', tz="UTC")
attr(x, 'tzone') = NULL
test(1765.1, print(IDateTime(x)), output=".*idate.*itime.*1: 2017-03-1[67]",
ignore.warning="timedatectl") # R 3.4's Sys.timezone() raises this warning in docker, #4182
# test test's ignore.warning
test(1765.2, {warning("foo"); 4L}, 4L, ignore.warning="foo")
test(1765.3, {warning("foo"); 4L}, 4L, ignore.warning="Foo", warning="foo")
test(1765.4, {warning("foobar1"); warning("foobar2"); warning("FOO"); 4L}, 4L, ignore.warning="bar", warning="FOO")
test(1765.5, {warning("foobar1"); warning("foobar2"); warning("FOO"); 4L}, 4L, ignore.warning="2", warning=c("foobar1","FOO"))
# print(null.data.table()) should not output NULL as well, #1852
# use capture.output() in this case rather than output= to ensure NULL is not output
test(1766, capture.output(print(data.table(NULL))), "Null data.table (0 rows and 0 cols)")
# Bug on subset of 1-row data.table when expr returns a named logical vector #2152
options(datatable.auto.index=FALSE)
dt = data.table(x=1, y="a")
val = c(foo=1L)
test(1767, dt[x == val], data.table(x=1, y="a"))
options(datatable.auto.index=TRUE)
# fread sampling overlap and reaching end early on small files with varying line lengths, #2157
if (test_R.utils) {
test(1768, fread(file=testDir("issue_2157_sampling_overlap.txt.gz"))[,c("X1","X2","X7","X8")],
output="X1 X2.*1: ABCD021917 NA.*678.0000.*2:.*1314: ABCD032617.*732.9818")
test(1769, fread(file=testDir("issue_2157_sampling_reached_eof_early.txt.bz2"))[,c("X1","X2","X10","X11")],
output="X1 X2.*1:.*6.00.*2: 2005-08-15.*1228: 2017-05-10 0 -112186.00 500")
}
# Test unbounded strstr() (in particular when fileSize is exactly 64k on Windows), #2201
ff = file(f<-tempfile(), open="wb")
line255 = paste(rep("123,456,789,0AB",16),collapse=",")
test(1770.1, nchar(line255), 255L)
for (i in 1:256) cat(line255, "\x0A", sep="", file=ff) # use \n line ending even on Windows
close(ff)
test(1770.2, file.info(f)$size, 65536)
test(1770.3, fread(f)[256,V64], "0AB")
test(1770.4, fread(f, skip="spam"), error="not found in input")
unlink(f)
# Now without final newline but still exactly 65536 in size :
ff = file(f<-tempfile(), open="wb")
for (i in 1:255) cat(line255, "\x0A", sep="", file=ff)
cat(line255, "C", sep="", file=ff)
close(ff)
test(1770.5, file.info(f)$size, 65536)
test(1770.6, fread(f)[256,V64], "0ABC")
test(1770.7, fread(f, skip="spam"), error="not found in input")
unlink(f)
# CJ retains attributes and classes, #2029, PR#2150
l <- list(a = as.POSIXct(c("2016-01-01", "2017-01-01"), tz = "UTC"),
b = as.POSIXct(c("2016-01-01", "2017-01-01")),
c = as.Date("2015-01-01"), ## according to comment about CJ loosing date class
d = factor(c("a", "b", "c"), ordered = TRUE), ## according to comment about bug with ordered factors
e = factor(c("a", "b", "c"), ordered = FALSE),
f = c(1,2),
g = c("a", "b"),
h = c(TRUE, FALSE))
setattr(l$g, "test", "testval")## add hand-made attribute
test(1771.1, lapply(l, attributes), lapply(do.call(CJ, l), attributes))
test(1771.2, lapply(l, class), lapply(do.call(CJ, l), class))
l <- list(a = factor(c("a", "b", "c"), ordered = TRUE),
b = as.POSIXct(c("2016-01-01", "2017-01-01")),
c = as.Date("2015-01-01"),
d = factor(c("a", "b", "c"), ordered = TRUE),
e = as.POSIXct(c("2016-01-01", "2017-01-01"), tz = "UTC"),
f = c(1,2),
g = c("a", "b"),
h = c(TRUE, FALSE))
test(1771.3, lapply(l, attributes), lapply(do.call(CJ, l), attributes))
test(1771.4, lapply(l, class), lapply(do.call(CJ, l), class))
l <- list(a = factor(c("a", "b", "c"), ordered = TRUE),
b = as.POSIXct(c("2016-01-01", "2017-01-01")),
c = as.Date("2015-01-01"),
d = factor(c("a", "b", "c"), ordered = TRUE),
e = c(TRUE, FALSE),
f = c(1,2),
g = c("a", "b"),
h = as.POSIXct(c("2016-01-01", "2017-01-01"), tz = "UTC"))
test(1771.5, lapply(l, attributes), lapply(do.call(CJ, l), attributes))
test(1771.6, lapply(l, class), lapply(do.call(CJ, l), class))
l <- list(a = NA,
c = c(1,2),
d = as.POSIXct("2016-01-01", tz = "UTC"))
test(1771.7, lapply(l, attributes), lapply(do.call(CJ, l), attributes))
test(1771.8, lapply(l, class), lapply(do.call(CJ, l), class))
# split.data.table should respect non-alphabetic order if passed a factor in by, #2082
DT = data.table(a = factor(c('a', 'b', 'b', 'a'), levels = c('b', 'a')),
b = c(2, 2, 1, 1), c = 1:4)
test(1772.1, split(DT, by = 'a', sorted = TRUE),
list(b = data.table(a = structure(c(1L, 1L), .Label = c("b", "a"), class = "factor"),
b = c(2, 1), c = 2:3),
a = data.table(a = structure(c(2L, 2L), .Label = c("b", "a"), class = "factor"),
b = c(2, 1), c = c(1L, 4L))))
test(1772.2, split(DT, by = c('a', 'b'), sorted = TRUE),
list(b.1 = data.table(a = structure(1L, .Label = c("b", "a"), class = "factor"), b = 1, c = 3L),
b.2 = data.table(a = structure(1L, .Label = c("b", "a"), class = "factor"), b = 2, c = 2L),
a.1 = data.table(a = structure(2L, .Label = c("b", "a"), class = "factor"), b = 1, c = 4L),
a.2 = data.table(a = structure(2L, .Label = c("b", "a"), class = "factor"), b = 2, c = 1L)))
# More helpful error message when using a single symbol name for a logical column to subset, #1844
# Used to be just 'not found'
if (exists("A")) rm(A)
if (exists("B")) rm(B)
if (exists("NOTEXIST")) rm(NOTEXIST)
if (exists("MyCol")) rm(MyCol)
DT <- data.table(A = c(FALSE, TRUE), B = 2:1, C=c(2,3), MyCol=c(2,2))
test(1773.01, DT[A], error = "A is not found in calling scope but it is a column of type logical.*==TRUE.*When the first argument")
test(1773.02, DT[B], error = "B is not found in calling scope but it is a column of type integer.*DT\\[\\(col\\)\\].*When the first argument") # 697
test(1773.03, DT[C], error = "i has evaluated to type closure. Expecting logical, integer or double") # C picks up stats::C in calling scope
test(1773.04, DT[MyCol], error="MyCol is not found in calling scope but it is a column of type double.*DT\\[\\(col\\)\\].*When the first argument")
test(1773.05, DT[NOTEXIST], error = "NOTEXIST is not found in calling scope and it is not a column name either. When the first argument")
test(1773.06, DT[(A)], DT[2])
test(1773.07, DT[A==TRUE], DT[2])
test(1773.08, DT[(B)], data.table(A=c(TRUE,FALSE), B=1:2, C=c(3,2), MyCol=2))
test(1773.09, DT[(MyCol)], data.table(A=c(TRUE,TRUE), B=INT(1,1), C=c(3,3), MyCol=2))
test(1773.10, DT[(C)], data.table(A=c(TRUE,NA), B=c(1L,NA), C=c(3,NA), MyCol=c(2,NA)))
# New as.data.table.array method in v1.10.5
set.seed(1L)
ar.dimnames = list(color = sort(c("green","yellow","red")),
year = as.character(2011:2015),
status = sort(c("active","inactive","archived","removed")))
ar.dim = sapply(ar.dimnames, length)
ar = array(sample(c(rep(NA, 4), 4:7/2), prod(ar.dim), TRUE),
           unname(ar.dim), # an array with length(dim) < 3 is created as a matrix in R, so it dispatches to as.data.table.matrix rather than here
ar.dimnames)
dt = as.data.table(ar, na.rm=FALSE)
dimcols = head(names(dt), -1L)
test(1774.01, TRUE, all(
nrow(dt) == 60L,
prod(sapply(ar.dimnames, length)) == dt[, prod(sapply(.SD, uniqueN)), .SDcols = dimcols],
dt[is.na(value), .N] == 30L,
dt[, .N==1L, c(dimcols)]$V1
))
dt = as.data.table(ar)
dimcols = head(names(dt), -1L)
test(1774.02, TRUE, all(
nrow(dt) == 30L,
prod(sapply(ar.dimnames, length)) == dt[, prod(sapply(.SD, uniqueN)), .SDcols = dimcols],
dt[is.na(value), .N] == 0L,
dt[, .N==1L, c(dimcols)]$V1
))
# 4D unnamed
x = array(1:81, dim=rep(3L,4))
dt = as.data.table(x, na.rm=FALSE)
test(1774.03, all(
identical(dim(dt), c(81L,5L)),
identical(names(dt), c(paste0("V",1:4),"value")),
all(dt[J(1L)][1L, value] == 1L, dt[J(2L)][1L, value] == 2L, dt[J(3L)][.N, value] == 81L) # this also tests if dt is keyed
))
# 4D named dim values but not dims
x = array(1:81, dim=rep(3L, 4L), dimnames=rep(list(letters[1:3]), 4L))
dt = as.data.table(x, na.rm=FALSE)
test(1774.04, all(
identical(dim(dt), c(81L,5L)),
identical(names(dt), c(paste0("V",1:4),"value")),
all(dt[J("a")][1L, value] == 1L, dt[J("b")][1L, value] == 2L, dt[J("c")][.N, value] == 81L)
))
# 4D named dim values and dims
x = array(1:81, dim=rep(3L, 4L), dimnames=setNames(rep(list(letters[1:3]), 4L), letters[1:4]))
dt = as.data.table(x, na.rm=FALSE)
test(1774.05, all(
identical(dim(dt), c(81L,5L)),
identical(names(dt), c(letters[1:4],"value")),
all(dt[J("a")][1L, value] == 1L, dt[J("b")][1L, value] == 2L, dt[J("c")][.N, value] == 81L)
))
# third dim of length 1L so really 2D
x = array(1:4, dim=c(2L,2L,1L), dimnames=list(a=letters[1:2], b=letters[1:2], c="a"))
dt = as.data.table(x, na.rm=FALSE)
test(1774.06, all(
identical(dim(dt), c(4L,4L)),
identical(names(dt), c("a","b","c","value")),
all(dt[J("a")][, value] == c(1L,3L), dt[J("b")][, value] == c(2L,4L))
))
# second and third dim of length 1L so really 1D
x = array(1:2, dim=c(2L,1L,1L), dimnames=list(a=letters[1:2], b="a", c="a"))
dt = as.data.table(x, na.rm=FALSE)
test(1774.07, all(
identical(dim(dt), c(2L,4L)),
identical(names(dt), c("a","b","c","value")),
all(dt[J("a")][, value] == 1L, dt[J("b")][, value] == 2L)
))
# 3x3x3 na.rm=FALSE / sorted=TRUE
set.seed(2)
x = rnorm(27)
x[sample(length(x), length(x)/2)] = NA
dim(x) = c(3L,3L,3L)
dt = as.data.table(x, na.rm=FALSE)
test(1774.08, all(
identical(dim(dt), c(27L,4L)),
identical(names(dt), c(paste0("V",1:3),"value")),
dt[is.na(value), .N] > 0L,
length(key(dt)) > 0L
))
# na.rm=TRUE / sorted=TRUE
dt = as.data.table(x)
test(1774.09, all(
identical(dim(dt), c(14L,4L)),
identical(names(dt), c(paste0("V",1:3),"value")),
dt[is.na(value), .N] == 0L,
length(key(dt)) > 0L
))
# na.rm=TRUE / sorted=FALSE
dt = as.data.table(x, sorted=FALSE)
test(1774.10, all(
identical(dim(dt), c(14L,4L)),
identical(names(dt), c(paste0("V",1:3),"value")),
dt[is.na(value), .N] == 0L,
is.unsorted(dt[[1]]),
is.null(key(dt))
))
# na.rm=FALSE / sorted=FALSE
dt = as.data.table(x, na.rm=FALSE, sorted=FALSE)
test(1774.11, all(
identical(dim(dt), c(27L,4L)),
identical(names(dt), c(paste0("V",1:3),"value")),
is.unsorted(dt[[1]]),
is.null(key(dt))
))
# expects error on value.name overlapping with column names in result (dimension names)
x = array(1:2, dim=c(2L,1L,1L), dimnames=list(a=letters[1:2], b="a", c="a"))
test(1774.12, as.data.table(x, value.name="a"), error = "Argument 'value.name' should not overlap with column names in result")
x = array(1:2, dim=c(2L,1L,1L), dimnames=list(a=letters[1:2], b="a", value="a"))
test(1774.13, as.data.table(x), error = "Argument 'value.name' should not overlap with column names in result")
## unsupported usage of as.data.table.array
test(1774.14, as.data.table.array(as.matrix(x)), error="method should only be called for arrays with 3+")
test(1774.15, as.data.table(x, value.name=NA), error="'value.name' must be scalar")
test(1774.16, as.data.table(x, sorted='a'), error="'sorted' must be scalar")
test(1774.17, as.data.table(x, na.rm='a'), error="'na.rm' must be scalar")
# verify print.keys works
DT1 <- data.table(a = 1:3, key = "a")
test(1775.1, capture.output(print(DT1, print.keys = TRUE)),
c("Key: <a>", " a", "1: 1", "2: 2", "3: 3"))
DT2 <- data.table(a = 1:3, b = 4:6)
setindexv(DT2, c("b","a"))
test(1775.2, capture.output(print(DT2, print.keys = TRUE)),
c("Index: <b__a>", " a b", "1: 1 4", "2: 2 5", "3: 3 6"))
setindexv(DT2, "b")
test(1775.3, capture.output(print(DT2, print.keys = TRUE)),
c("Indices: <b__a>, <b>", " a b", "1: 1 4", "2: 2 5", "3: 3 6"))
setkey(DT2, a)
setindexv(DT2, c("b","a"))
test(1775.4, capture.output(print(DT2, print.keys = TRUE)),
c("Key: <a>", "Index: <b__a>", " a b", "1: 1 4", "2: 2 5", "3: 3 6"))
# dev regression #2285
cat("A B C\n1 2 3\n4 5 6", file=f<-tempfile())
test(1776.1, fread(f), data.table(A=c(1L,4L), B=c(2L,5L), C=c(3L,6L)))
unlink(f)
cat("A,B,C\n1,2,3\n4,5,", file=f<-tempfile())
test(1776.2, fread(f), data.table(A=c(1L,4L), B=c(2L,5L), C=c(3L,NA)))
unlink(f)
txt = '"b","bc8d5",\n"c",,"2f685"\n"d",,\n,"cdfb9",\n'
cat(txt, file=f<-tempfile())
test(1776.3, fread(f), fread(txt))
unlink(f)
# column name detection when some columns are empty, #2370
test(1777.01, fread(",A,B\n1,3,5\n2,4,6\n"), data.table(V1=1:2, A=3:4, B=5:6))
test(1777.02, fread("A,,B\n1,3,5\n2,4,6\n"), data.table(A=1:2, V2=3:4, B=5:6))
test(1777.03, fread("A,B,\n1,3,5\n2,4,6\n"), data.table(A=1:2, B=3:4, V3=5:6))
test(1777.04, fread(",A,\n1,3,5\n2,4,6\n"), data.table(V1=1:2, A=3:4, V3=5:6))
test(1777.05, fread("A,,\n1,3,5\n2,4,6\n"), data.table(A=1:2, V2=3:4, V3=5:6))
test(1777.06, fread(",,A\n1,3,5\n2,4,6\n"), data.table(V1=1:2, V2=3:4, A=5:6))
test(1777.07, fread(",9,A\n1,3,5\n2,4,6\n"), data.table(V1=1:2, "9"=3:4, A=5:6))
test(1777.08, fread(",A,9\n1,3,5\n2,4,6\n"), data.table(V1=1:2, A=3:4, "9"=5:6))
test(1777.09, fread(",7,9\n1,3,5\n2,4,6\n"), data.table(V1=c(NA,1:2), V2=c(7L,3:4), V3=c(9L,5:6)))
# we skip test numbers .10, .20 etc because they print as .1 and .2
test(1777.10, fread(",,\n1,3,5\n2,4,6\n"), data.table(V1=c(NA,1:2), V2=c(NA,3:4), V3=c(NA,5:6)))
test(1777.11, fread(",A,B\n1,3,5\n2,4,6\n", logical01=FALSE), data.table(V1=1:2, A=3:4, B=5:6)) # logical01 is included to catch a prior bug despite there being no logical columns
test(1777.12, fread(",A,B\n1,3,5\n2,4,6\n", header=TRUE), data.table(V1=1:2, A=3:4, B=5:6))
test(1777.13, fread(",A,B\n1,3,5\n2,4,6\n", header=FALSE), data.table(V1=c(NA,1:2), V2=c("A",3:4), V3=c("B",5:6)))
test(1777.14, fread("A,B,C\n", verbose=TRUE), data.table(A=logical(),B=logical(),C=logical()),
output="because there are no number fields in the first and only row")
test(1777.15, fread("A,B,3\n", verbose=TRUE), data.table(V1="A",V2="B",V3=3L),
output="because there are number fields in the first and only row")
test(1777.16, fread("A,B,3\nC,D,4\n", verbose=TRUE), data.table(V1=c("A","C"),V2=c("B","D"),V3=3:4),
output="because there are some number columns and those columns do not have a string field at the top of them")
test(1777.17, fread("A,B,C\nD,E,F\n", verbose=TRUE), data.table(A="D", B="E", C="F"),
output="because all columns are type string and a better guess is not possible")
test(1777.18, fread("A,B,C\nC,D,4\n", verbose=TRUE), data.table(A="C",B="D",C=4L),
output="due to column 3 containing a string on row 1 and a lower type.*in the rest of the.*sample rows")
# unquoted fields containing \r, #2371
test(1778.1, fread("A,B,C\n0,,\n1,hello\rworld,2\n3,test,4\n", verbose=TRUE),
DT <- data.table(A=c(0L,1L,3L), B=c("","hello\rworld","test"), C=c(NA,2L,4L)),
output="has been found.*common and ideal")
fwrite(DT, f<-tempfile())
test(1778.2, readLines(f), c("A,B,C", "0,\"\",", "1,\"hello", "world\",2", "3,test,4"))
# fwrite quotes the field containing \r ........... ^^ ............ ^^
# and that reading back in gets us back to DT faithfully
test(1778.3, fread(f), DT)
tt = setDT(read.csv(f, stringsAsFactors=FALSE))
tt[2, B:=gsub("\n","\r",B)] # base R changes the \r to a \n, so restore that
test(1778.4, tt, DT)
unlink(f)
# #1392 IDate ITime new methods for faster conversion
# conversion in-out match for UTC
same = list(l = as.POSIXlt("2015-10-12 13:19:35", tz = "UTC"))
same$p = as.POSIXct(same$l)
same$d = as.Date(same$p)
same$n = as.numeric(same$d)
same$i = as.integer(same$d)
ld = sapply(same, as.IDate)
test(1779.01, uniqueN(ld)==1L)
lt = sapply(same[1:2], as.ITime) # exclude date
test(1779.02, uniqueN(lt)==1L)
# some random 1e5 timestamps: old defaults vs new methods, UTC
intpx = function(x) as.integer(as.POSIXct(x, origin = "1970-01-01", tz = "UTC"))
set.seed(1)
i = sample(intpx("2015-10-12")-intpx("2014-10-12"), 1e5, TRUE) + intpx("2014-10-12")
p = as.POSIXct(i, origin = "1970-01-01", tz = "UTC")
test(1779.03, identical(as.ITime.default(p), as.ITime(p)))
test(1779.04, identical(as.IDate.default(p), as.IDate(p)))
# test for non-UTC
p = as.POSIXct(i, origin = "1970-01-01", tz = "Asia/Hong_Kong")
test(1779.05, identical(as.ITime.default(p), as.ITime(p)))
test(1779.06, identical(as.IDate.default(p), as.IDate(p)))
p = as.POSIXct(i, origin = "1970-01-01", tz = "America/New_York")
test(1779.07, identical(as.ITime.default(p), as.ITime(p)))
test(1779.08, identical(as.IDate.default(p), as.IDate(p)))
# R 3.0.1 had the following bug fix in R News:
# " as.POSIXct.numeric was coercing origin using the tz argument and not "GMT" as documented (PR#14973) "
# So tz="UTC" is required on the next line for test 1779.09 to pass on R 3.0.0 (the current stated dependency).
# Test 1779.09 would be fine without the tz="UTC" from R 3.0.1 onwards.
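# A commented sketch (not a numbered test since the behaviour is R-version dependent) of what that bug fix changed:
#   as.POSIXct(0, origin="1970-01-01", tz="Asia/Tokyo")
#   # before R 3.0.1 the origin itself was coerced using tz= rather than "GMT", shifting the result by the UTC offset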
p = as.POSIXct(i, origin = "1970-01-01", tz="UTC")
test(1779.09, identical(as.ITime.default(p), as.ITime(p)))
test(1779.10, identical(as.IDate.default(p), as.IDate(p)))
test(1779.11, as.IDate("20170929", "%Y%m%d"), as.IDate("20170929", format="%Y%m%d")) # 2453
test(1779.12, as.IDate(1), as.IDate("1970-01-02")) # 2446
test(1779.13, as.IDate(1L), as.IDate("1970-01-02"))
##########################
test(1800.1, fread("A\n6e55693457e549ecfce0\n"), data.table(A=c("6e55693457e549ecfce0")))
test(1800.2, fread("A\n1e55555555\n-1e+234056\n2e-59745"), data.table(A=c("1e55555555","-1e+234056","2e-59745")))
#
# Tests thanks to Pasha copied verbatim from his PR#2200
#
# Test files with "round" sizes (different powers of 2, from 512B to 64KB)
for (mul in c(16, 128, 512, 1024, 2048)) {
ff = file(f<-tempfile(), open="wb")
cat(paste(rep("1234,5678,9012,3456,7890,abcd,4\x0A", mul), collapse=""), file=ff)
close(ff)
DT = data.table(V1=rep(1234L, mul), V2=5678L, V3=9012L, V4=3456L, V5=7890L, V6="abcd", V7=4L)
test(1801 + log2(mul)/100 + 0.001, file.info(f)$size, mul*32)
test(1801 + log2(mul)/100 + 0.002, fread(f), DT)
}
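# Sketch of the size arithmetic asserted above (not a numbered test): each data line is the 31-character
# string "1234,5678,9012,3456,7890,abcd,4" plus its \x0A terminator, i.e. 32 bytes, hence file size mul*32.
stopifnot(nchar("1234,5678,9012,3456,7890,abcd,4") + 1L == 32L)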
# Test without the newline
ff = file(f<-tempfile(), open="wb")
cat(paste("1", paste(rep("1234,5678,9012,3456,7890,zyxw,4", 128), collapse="\x0A"), sep=""), file=ff)
close(ff)
test(1803.1, file.info(f)$size, 4096)
DT = data.table(V1=rep(1234L, 128), V2=5678L, V3=9012L, V4=3456L, V5=7890L, V6="zyxw", V7=4L)
DT[1, 1] = 11234L
test(1803.2, fread(f), DT)
test(1804.1, fread("A,B\n", na.strings=" NA"), error="NAstring .* has whitespace at the beginning or end")
test(1804.2, fread("A,B\n", na.strings="NA\t"), error="NAstring .* has whitespace at the beginning or end")
test(1805, fread("A\n", verbose=TRUE), data.table(A=logical()), output="text input (not a filename)")
test(1806, fread('"A,B,C\n1,2,3\n4,5,6'), data.table('"A'=c(1L,4L), B=c(2L,5L), C=c(3L,6L)), warning="resolved improper quoting")
test(1807, fread('A,B,"C\nD",E'), data.table(A=logical(), B=logical(), "C\nD"=logical(), E=logical()))
test(1808.1, fread("A,B\r1,2\r3,4"), data.table(A=c(1L,3L),B=c(2L,4L)))
test(1808.2, fread("A,B\r1,2\r3,4\r"), data.table(A=c(1L,3L),B=c(2L,4L)))
cat("A,B\r1,2\r3,4",file=f<-tempfile())
test(1808.3, fread(f), data.table(A=c(1L,3L),B=c(2L,4L)))
unlink(f)
test(1808.4, fread("A,B\r1,3\r\r\r2,4\r", logical01=TRUE), data.table(A=TRUE, B=3L), warning="Discarded single-line footer: <<2,4>>")
test(1808.5, fread("A,B\r4,3\r\r \r2,4\r"), data.table(A=4L, B=3L), warning="Discarded single-line footer: <<2,4>>")
test(1808.6, fread("A,B\r1,3\r\r \r2,4\r", blank.lines.skip=TRUE), data.table(A=1:2, B=3:4))
test(1808.7, fread("A,B\r1,3\r\r \r2,4\r", fill=TRUE), data.table(A=c(1L,NA,NA,2L), B=c(3L,NA,NA,4L)))
test(1808.8, fread("A,B\r1,3\r\r \r2,\r", blank.lines.skip=TRUE, fill=TRUE), data.table(A=1:2, B=c(3L,NA)))
test(1809, fread("A,B\n\r1,2\n\r3,4\n\r5,6"), data.table(A=c(1L,3L,5L), B=c(2L,4L,6L)))
cat("A,B\n1,q\n2,w\n3,xyz", file=f<-tempfile()); test(1810, fread(f,verbose=TRUE), data.table(A=c(1L,2L,3L), B=c("q","w","xyz")), output="File ends abruptly with 'z'.*cow page"); unlink(f)
test(1811, fread("A,B\n1,2\n3,4", skip="boo"), error="skip='boo' not found in input")
test(1812, fread("A,B\n1,2\n3,4\n", skip="4", verbose=TRUE), data.table(V1=3L, V2=4L), output="Found skip='4' on line 3")
test(1813, fread("A,B\n1,2\n3,4", skip=10L), error="skip=10 but the input only has 3 lines")
test(1814, fread("A,B\n1,2\n3,4\n \n\t", skip=3L), error="skip has been set after the last non-whitespace")
DT = data.table(A=seq(1, 1000000), B="x", C=TRUE)
fwrite(DT, f<-tempfile())
test(1815, fread(f, nrows=5), DT[1:5]) #2243
test(1816.1, fread("A,E\n1,2\n5,7\n4,6\n\x1A\x1A", verbose=TRUE),
data.table(A=c(1L, 5L, 4L), E=c(2L, 7L, 6L)),
output="Last byte.*0x1A.*Ctrl.Z.*removed") #1612
ff = file(f <- tempfile())
open(ff, "wb")
cat("A,B\n5,2", file=ff)
writeBin(as.raw(c(0x00, 0x00, 0x00)), ff)
close(ff)
test(1816.2, fread(f, verbose=TRUE),
data.table(A=c(5L), B=c(2L)),
output="Last byte.*0x00.*NUL.*removed") #1895
unlink(f)
if (test_R.utils) {
test(1817, fread(testDir("bad.txt.bz2"))[c(1,.N),c(1,3)], #2238, fileSize multiple of 4096
data.table("####################"=c("#############","#########"),
"###############################################"=c(0,0))) # not sure why this all-0 column is detectected as numeric rather than integer
test(1818, fread(testDir("session_aborted_fatal_error.txt.bz2"))[c(1,.N),c(1,2,250,251)], data.table(V1=c("ACSSF","ACSSF"),V2=c("2010m1","2010m1"),V250=-1L,V251=-1L))
}
# expansion of uses of as.ITime.character, PR#1796
test(1819, as.ITime("2015-09-29 08:22:00"), structure(30120L, class = "ITime"))
# Issue 2287: the % sign in the error/warning message should not be interpreted as a format string!
test(1820.1, fread("name,id\nfoo,2\nbar%\n"), data.table(name="foo", id=2L), warning="Discarded single-line footer: <<bar%>>")
test(1820.2, fread("name,id\nfoo,2\nbar%d"), data.table(name="foo", id=2L), warning="Discarded single-line footer: <<bar%d>>")
test(1820.3, fread("name,id\nfoo,2\nbar%s"), data.table(name="foo", id=2L), warning="Discarded single-line footer: <<bar%s>>")
# new argument for print.data.table: col.names
# issue #1482 / PR #1483
DT = data.table(a = 1:21, b = 22:42)
test(1821.1, sum(grepl("a.*b", capture.output(print(DT, col.names = "auto", nrows=30)))), 2L)
test(1821.2, sum(grepl("a.*b", capture.output(print(DT, col.names = "top")))), 1L)
x = capture.output(print(DT, col.names = "none", nrows=30))
test(1821.3, sum(grepl("a.*b", x)), 0L)
test(1821.4, length(x), nrow(DT))
test(1821.5, print(DT, col.names = "asdf"), error = "Valid options")
test(1821.6, capture.output(print(DT[1:5], col.names = "none", class = TRUE)),
c("1: 1 22", "2: 2 23", "3: 3 24", "4: 4 25", "5: 5 26"),
warning = "Column classes.*suppressed")
suppressWarnings(
x <- capture.output(print(DT, col.names = "none", class = TRUE))
)
test(1821.7, sum(grepl("<", x)), 0L)
# Issue 2299: snprintf(%zd / %zu) on Windows is not working
# The following example forces a wrong column count outside of the sampled rows
src = paste(c("A,B",
paste(rep("1,2", 100), collapse="\n"),
"999",
paste(rep("3,4", 10000), collapse="\n"),
""),
collapse="\n")
test(1822, fread(src), data.table(A=rep(1L,100L), B=2L), warning="Stopped early on line 102. Expected 2 fields but found 1.*discarded.*<<999>>")
# NB: The first sample jump uses the first 100 rows and just misses the 999. Since the data is large enough, the other jumps capture the type bump from 1 (bool) to 3 (int).
# Issue 2326: .SD mistakenly includes column being set when get() appears in j
DT <- data.table(x = seq(1, 10), y = seq(10, 1))
DT[, z := ncol(.SD) + y, .SDcols = "x"]
test(1823.1, DT$z, seq(11, 2))
DT[, z := ncol(.SD) + get("y"), .SDcols = "x"]
test(1823.2, DT$z, seq(11, 2))
DT[, z := ncol(.SD) + get("y"), .SDcols = c("x", "z")]
test(1823.3, DT$z, seq(12, 3))
DT[, x := {.SD; x + 1}, .SDcols = "y"]
test(1823.4, DT$x, as.double(seq(2, 11)))
DT[, z := ncol(.SD) + x, .SDcols = "y"]
test(1823.5, DT$z, as.double(seq(3, 12)))
DT[, z := ncol(.SD) + x, .SDcols = c("x", "y")]
test(1823.6, DT$z, as.double(seq(4, 13)))
# Issue 2250
test(1824, fread("A,B\n2,384325987234905827340958734572934\n"), data.table(A=2L, B="384325987234905827340958734572934"))
# Issue 2251
test(1825.01, fread('A,B\n"1","2"', colClasses = "integer"), data.table(A=1L, B=2L))
test(1825.02, fread('A,B\n"1","2"', colClasses = "character"), data.table(A="1", B="2"))
test(1825.03, fread('A,B\n"1","2"', colClasses = "numeric"), data.table(A=1.0, B=2.0))
test(1825.04, fread("A,B\n1,TRUE", colClasses=c("integer", "logical")), data.table(A=1L, B=TRUE)) # 2766
str <- "x1,x2,x3,x4,x5\n1,2,1.5,T,cc\n3,4,2.5,F,ff" # 2922
test(1825.05, fread(str), ans0<-data.table(x1=INT(1,3), x2=INT(2,4), x3=c(1.5,2.5), x4=c("T","F"), x5=c("cc","ff")))
test(1825.06, fread(str, colClasses=c("integer","numeric","numeric","integer","character")),
ans<-data.table(x1=INT(1,3), x2=c(2,4), x3=c(1.5,2.5), x4=c("T","F"), x5=c("cc","ff")),
warning="Attempt to override column 4 <<x4>> of inherent type 'string' down to 'int32' ignored. Only overrides to a higher type are currently supported.")
test(1825.07, fread(str, colClasses=c("integer","numeric","numeric","logical","character")), ans, warning="Attempt.*column 4.*'string'.*'bool8' ignored.")
test(1825.08, fread(str, colClasses=c("integer", "numeric", "numeric", "character", "character")), ans)
test(1825.09, fread(str, colClasses=c("integer", "NULL", "numeric", "character", "character")), data.table(x1=INT(1,3), x3=c(1.5,2.5), x4=c("T","F"), x5=c("cc","ff")))
test(1825.10, fread(str, colClasses="NULL"), ans0, warning='colClasses="NULL" (quoted) is interpreted as colClasses=NULL (the default) as opposed to dropping every column')
test(1825.11, fread("a,b,c\n1,2,3.0\n2,3,4.5", colClasses = c("integer", "integer", "integer")), # 2863
data.table(a=1:2, b=2:3, c=c(3,4.5)),
warning="Attempt to override column 3 <<c>> of inherent type 'float64' down to 'int32'")
test(1825.12, fread("a,b,c\n1,2,3.0\n2,3,4.5", colClasses=list(NULL="b")), data.table(a=1:2, c=c(3,4.5)))
test(1825.13, fread(str, colClasses=list(integer=1:2, NULL=3:5)), data.table(x1=INT(1,3), x2=INT(2,4)))
test(1825.14, fread(str, colClasses=list(numeric=2, NULL=3:5), drop=1), data.table(x2=c(2,4)))
test(1825.15, fread(str, colClasses=(cl<-list(numeric=2, NULL=3:5)), drop=cl$`NULL`), data.table(x1=INT(1,3), x2=c(2,4))) # cover commit f0bd6e3
# NULL didn't work in 1.11.0-1.11.8 so some usage exists where drop= is used to respecify the NULLs. The warning could be reintroduced in future.
# https://github.com/Rdatatable/data.table/issues/3233#issuecomment-453674647
test(1825.16, fread(str, colClasses=c("integer","integer","NULL","character","NULL"), drop=3), data.table(x1=INT(1,3), x2=INT(2,4), x4=c("T","F")))
test(1825.17, fread(str, colClasses=c("integer","numeric","NULL","character","NULL"), drop=3:4), data.table(x1=INT(1,3), x2=c(2,4)))
test(1825.18, fread(str, drop=6), data.table(x1=INT(1,3), x2=INT(2,4), x3=c(1.5,2.5), x4=c("T","F"), x5=c("cc","ff")), warning="drop[1] is 6 which is out of range [1,ncol=5]")
# extra tests from #3143
DT = data.table(a = c(1.0, 2.0, 3.0, 4.0, 5.1), b = c("1", "2", "E", "4", "5"))
fwrite(DT, f<-tempfile())
test(1825.19, fread(f, colClasses = c(b = "integer")), DT, warning="Attempt to override.*ignored")
DT = data.table(a = c(1.0, 2.0, 3.0, 4.0, 5.0), b = c("1", "2", "E", "4", "5"))
fwrite(DT, f)
test(1825.20, fread(f, colClasses = c(b = "integer")), DT[, a:=as.integer(a)], warning="Attempt to override column 2.*ignored") # 'a' is read back as integer because fwrite wrote it without decimals, not because of colClasses
DT = data.table(b = c("1", "2", "E", "4", "5"))
fwrite(DT, f)
test(1825.21, fread(f, colClasses = c(b = "integer")), DT, warning="Attempt to override.*ignored")
DT = data.table(a = c(1.0, 2.0, 3.0, 4.0, 5.0), b = c("1", "2", "E", "4", "5"))
fwrite(DT, f)
test(1825.22, fread(f, colClasses = c(a = "numeric", b = "integer")), DT, warning="Attempt to override column 2.*ignored")
unlink(f)
# issue 2351
set.seed(1)
DT = data.table(id=paste0("id",1:1e5), v=sample(100,1e5,replace=TRUE))
fwrite(DT, file=f<-tempfile(), eol="\r")
test(1826.1, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id99999","id100000"), v=c(27L,38L,10L,13L)))
cat("id888,42", file=f, append=TRUE) # without final \r after last line
test(1826.2, fread(f)[c(1,2,.N-1,.N)], data.table(id=c("id1","id2","id100000","id888"), v=c(27L,38L,13L,42L)))
unlink(f)
# Issue 2222
test(1827.1, fread("A,B\n1987,1\n1987,3\n", na.strings=c("1987", "NA")), data.table(A=c(NA,NA),B=c(1L,3L)))
test(1827.2, fread("A,B\n1987,1\n4,3\n", na.strings=c("1987", "NA")), data.table(A=c(NA,4L),B=c(1L,3L)))
test(1827.3, fread("A,B\n1987,1\n1987,3\n", na.strings="1987"), data.table(A=c(NA,NA),B=c(1L,3L)))
test(1827.4, fread("A,B\n1987,1\n1,3\n", na.strings="198"), data.table(A=c(1987L,1L),B=c(1L,3L)))
# Issue 2246: reading from a table where the number of rows is not estimated correctly up-front
DT1 = data.table(A=rep(123L, 100L), B=456L)
DT2 = data.table(A=rep(1L, 200L), B=2L)
DT = rbind(DT1, DT2, DT1, DT2, DT1, DT2, DT1, DT2, DT1, DT2, DT1, DT2, DT1, DT2, DT1, DT2, DT1, DT2, DT1, DT1)
fwrite(DT, f<-tempfile())
test(1828, fread(f), DT)
# Reading hexadecimal floating point numbers
test(1829.1, fread("A\n0x1.0p0\n-0x1.0p1\n0X1.0P3\n0x1.4p3\n0x1.9p6\nNaN\nInfinity\n-Infinity"),
data.table(A=c(1, -2, 8, 10, 100, NaN, Inf, -Inf)))
test(1829.2, fread("A\n0x1.e04b81cad165ap-1\n0x1.fb47e352e9a63p-5\n0x1.fa0fd778c351ap-1\n0x1.7c0a21cf2b982p-7\n"),
data.table(A=c(0.93807607, 0.06192393, 0.98840211, 0.01159789)))
test(1829.3, fread("A\n0x0.FFFFFFFFFFFFp-1022\n0x0.0000000000001p-1022\n"), # largest/smallest subnormal numbers
data.table(A=c(2.2250738585072e-308, 4.940656458412e-324)))
test(1829.4, fread("A\n0x1.FFFFFFFFFFFFp+1023\n0x1.0000000000000p-1022\n"), # largest/smallest normal numbers
data.table(A=c(1.7976931348623e+308, 2.225073858507e-308)))
test(1829.5, fread("A,B,C,D,E,F\n0x2.0p1,0x1.333,0x1.aaaaaaaaaaaaaaaP1,0x1.ABCDEFGp1,0x1.0p-1023,0x0.1p1"),
data.table(A="0x2.0p1", B="0x1.333", C="0x1.aaaaaaaaaaaaaaaP1", D="0x1.ABCDEFGp1", E="0x1.0p-1023", F="0x0.1p1"))
# Reading "advanced" floating point literals (i.e. various NaNs / Infs)
test(1830.1, identical(
fread("A\n+Inf\nINF\n-inf\n-Infinity\n1.3e2"),
data.table(A=c(Inf, Inf, -Inf, -Inf, 130))))
test(1830.2, identical(
fread("B\n.2\nnan\nNaN\n-NAN\nqNaN\n+NaN%\nsNaN\nNaNQ\nNaNS\n-.999e-1"),
data.table(B=c(0.2, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, -0.0999))))
test(1830.3, !identical(
fread("B\n.2\nnan\nNaN\n-NAN\nqNaN\n+NaN%\nsNaN\nNaNQ\nNaNS\n-.999e-1"),
data.table(B=c(0.2, NaN, NaN, NaN, NA, NaN, NaN, NaN, NaN, -0.0999))))
test(1830.4, identical(
fread("C\n1.0\nNaN3490\n-qNaN9\n+sNaN99999\nNaN000000\nNaN000"),
data.table(C=c(1, NaN, NaN, NaN, NaN, NaN))))
test(1830.5, identical(
fread("D\n1.\n1.#SNAN\n1.#QNAN\n1.#IND\n1.#INF\n"),
data.table(D=c(1, NaN, NaN, NaN, Inf))))
test(1830.6, identical(
fread("E\n0e0\n#DIV/0!\n#VALUE!\n#NULL!\n#NAME?\n#NUM!\n#REF!\n#N/A\n1e0\n"),
data.table(E=c(0, NaN, NaN, NA, NA, NA, NA, NA, 1))))
if (test_longdouble) { #3258
test(1830.7, identical(
fread("F\n1.1\n+1.333333333333333\n5.9e300\n45609E11\n-00890.e-003\n"),
data.table(F=c(1.1, 1.333333333333333, 5.9e300, 45609e11, -890e-3))))
test(1830.8, identical(
fread("G\n0.000000000000000000000000000000000000000000000000000000000000449548\n"),
data.table(G=c(4.49548e-61))))
}
# Test that integers just above 128 or 256 characters in length parse as strings, not as integers/floats
# This guards against potential overflows in the count of digits
src1 = paste0(rep("1234567890", 13), collapse="") # length = 130, slightly above 128
src2 = paste0(rep("12345678900987654321", 13), collapse="") # length = 260, slightly above 256
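# Quick sanity check of the constructed lengths (not a numbered test): 13 repeats of a 10-char block and of a 20-char block
stopifnot(nchar(src1) == 130L, nchar(src2) == 260L)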
test(1831.1, fread(paste0("A\n", src1)), data.table(A=src1))
test(1831.2, fread(paste0("A\n", src2)), data.table(A=src2))
test(1831.3, fread(paste0("A\n", src2, ".33")), data.table(A=1.2345678900987655e+259))
test(1831.4, fread(paste0("A\n", "1.", src2)), data.table(A=1.1234567890098766))
DT = as.data.table(matrix(5L, nrow=10, ncol=10))
test(1832.1, fwrite(DT, f<-tempfile(), verbose=TRUE), output="Column writers")
DT = as.data.table(matrix(5L, nrow=10, ncol=60))
# Using capture.output directly to look for the "..." because test(,output=) intercepts [] for convenience elsewhere
test(1832.2, any(grepl("^Column writers.* [.][.][.] ", capture.output(fwrite(DT, f, verbose=TRUE)))))
unlink(f)
# ensure explicitly setting select to default value doesn't error, #2007
test(1833, fread('V1,V2\n5,6', select = NULL),
data.table(V1 = 5L, V2 = 6L))
# fread file for which nextGoodLine in sampling struggles, #2404
# Every line in the first 100 has a quoted field containing sep, so the quote rule was being
# bumped to 3 (now fixed not to) which gave rise to nextGoodLine struggling later.
# The file is not finished with \n either, but that's not an issue.
if (test_R.utils) {
test(1834.1, dim(DT<-fread(testDir("grr.csv.gz"), header=FALSE)), INT(2839, 12))
test(1834.2, DT[c(1,2,.N-1,.N), c(1,2,11,12)],
data.table(V1="AAAAAAAAAA",
V2=c("AAAAAAAA","AAAAAAAAAA","AAAAAAAAAA","AAAAAAAAAA"),
V11=c("AAAAAAAAAAAAAAAA","","AAAAAAAAAA","AAAA"),
V12=c("AAAAAAAAAAAAA","","AAAAAAA","AAA")))
}
# Create a file to test a sample jump being skipped due to format error. It will fail later in the read step because
# this is a real error. We have not yet constructed an error for which nextGoodLine looks good but in fact is not.
# Would need a very complicated construction of embedded new lines in quoted fields, to test that.
# This test size with default buffMB results in 2 threads being used. 2 is important to pass on CRAN.
DT = as.data.table(CO2)
f = tempfile()
for (i in 0:1000) {
start = nrow(CO2)*i
fwrite(DT[,Plant:=start:(start+nrow(CO2)-1)], f, append=TRUE, col.names=FALSE)
if (i==502) write("-999,Bad,Line,0.0,0.0,extra\n", f, append=TRUE)
}
test(1835, fread(f, verbose=TRUE),
output = "A line with too-many.*jump 50.*jump landed awkwardly.*skipped",
warning = "Stopped.*line 42253. Expected 5 fields but found 6.*discarded.*<<-999,Bad,Line,0.0,0.0,extra>>")
unlink(f)
test(1836, fread('1,2,"3,a"\n4,5,"6,b"'), data.table(V1=c(1L,4L), V2=c(2L,5L), V3=c("3,a","6,b"))) # 2196
test(1837.1, fread('v1,v2,v3,v4,v5\n1,2,3,4,5', select=-1), error="negative but should be in the range.*Consider drop= for column exclusion") #2423 and #2782
test(1837.2, fread('v1,v2,v3,v4,v5\n1,2,3,4,5', select=0), error="select = 0.*has no meaning")
test(1837.3, fread('v1,v2,v3,v4,v5\n1,2,3,4,5', select=6), error="too large for this table, which only has")
test(1838, fread("default payment next month\n0.5524\n0.2483\n0.1157\n"), data.table("default payment next month"=c(0.5524,0.2483,0.1157))) #2322
# better writing and reading of NA in single column input, #2106
DT = data.table(a=c(4,NA,2,3.14,999,NA))
fwrite(DT, f<-tempfile(), na="")
test(1839.1, fread(f), data.table(a=c(4,NA,2,3.14,999,NA)))
test(1839.2, fread(f, blank.lines.skip=TRUE), data.table(a=c(4,2,3.14,999)))
test(1839.3, fread(f, fill=TRUE), data.table(a=c(4,NA,2,3.14,999,NA)))
test(1839.4, fread(f, fill=TRUE, blank.lines.skip=TRUE), data.table(a=c(4,2,3.14,999)))
fwrite(DT, f, na="NA") # base R does not do this though, it writes ,, for NAs in numeric columns (as does fwrite)
test(1839.5, fread(f, na.strings=""), data.table(a=c("4","NA","2","3.14","999","NA")))
test(1839.6, fread(f, na.strings="NA"), DT) # TODO: auto-handle (unusual, even as written by R) "NA" in numeric columns
unlink(f)
lines = c("DECLARATION OF INDEPENDENCE",
"We hold these truths to be self-evident, that all men are created equal,",
"that they are endowed by their Creator with certain unalienable Rights,",
"that among these are Life, Liberty and the pursuit of Happiness.",
"",
"That to secure these rights, Governments are instituted among Men,",
"deriving their just powers from the consent of the governed.")
txt = paste(lines, collapse="\n")
test(1839.7, fread(txt, sep=""), data.table("DECLARATION OF INDEPENDENCE"=lines[-1])) # fread should eventually be able to auto-detect sep=""
# readLines behaviour, #1616
txt = 'a,b\n ab,cd,ce\n abcdef\n hjkli \n' # now auto detected as ncol 1 anyway
test(1840.1, fread(txt), data.table("a,b" = c("ab,cd,ce","abcdef","hjkli")))
write('a,b\n ab,cd,ce\nabc,def \n hj,kli ', f<-tempfile()) # write to file to generate \r\n line ending on Windows, test 1840.6 below
test(1840.2, fread(f), data.table("ab"=c("abc","hj"), "cd"=c("def","kli"), "ce"=NA), warning="Detected 3 column names but the data has 2.*Filling.*automatically")
test(1840.3, fread(f, sep=NA), error="!is.na(sep) is not TRUE")
test(1840.4, fread(f, sep=NA_character_), error="!is.na(sep) is not TRUE")
test(1840.5, fread(f, sep=""), ans<-data.table("a,b"=c("ab,cd,ce","abc,def","hj,kli")))
test(1840.6, fread(f, sep="\n"), ans)
test(1840.7, fread(f, sep=NULL), ans)
test(1840.8, fread(f, sep=NULL, strip.white=FALSE), data.table("a,b"=c(" ab,cd,ce","abc,def "," hj,kli ")))
unlink(f)
test(1841, fread("A\n0.58E-2141\n"), data.table(A="0.58E-2141")) # no ERANGE warning and so no leak, #918
DT = data.table() # 2452
test(1842, setnames(DT, character(0)), DT)
test(1843, is.sorted((0+0i)^(-3:3)), error = "type 'complex' is not yet supported") # to cover error message in forder.c
# forder consistent coverage, #2454
# test loop 1253.13* covers these cases sometimes depending on its random time-based seed
# make a minimal example where there's a group size of 2 in the 2nd column (type double) with an NA too and na.last=NA
# covers the branch in forder.c:dsort line 1070 starting: if (nalast == 0 && n == 2) {
DT = data.table(c("a","a","a","b","b"),c(2,1,3,NA,2))
test(1844.1, forder(DT,V1,V2,na.last=NA), INT(2,1,3,0,5))
DT = data.table(c("a","a","a","b","b"),c(2,1,3,2,NA))
test(1844.2, forder(DT,V1,V2,na.last=NA), INT(2,1,3,0,4)) # prior to v1.12.0 this was 2,1,3,4,0. As long as it's the same with the 0's removed, we think it's ok
# now with two NAs in that 2-group covers forder.c:forder line 1269 starting: else if (nalast == 0 && tmp==-2) {
DT = data.table(c("a","a","a","b","b"),c(2,1,3,NA,NA))
test(1844.3, forder(DT,V1,V2,na.last=NA), INT(2,1,3,0,0))
DT = data.table(as.raw(0:6), 7:1)
test(1844.4, forder(DT,V1,V2), error="Column 1 passed to [f]order is type 'raw', not yet supported")
test(1844.5, forder(DT,V2,V1), error="Column 2 passed to [f]order is type 'raw', not yet supported")
DT = data.table(as.raw(0:6), c(5L,5L,1L,2L,2L,2L,2L))
test(1844.6, forder(DT,V2,V1), error="Column 2 passed to [f]order is type 'raw', not yet supported")
# fix for non-equi joins issue #1991. Thanks to Henrik for the nice minimal example.
d1 <- data.table(x = c(rep(c("b", "a", "c"), each = 3), c("a", "b")), y = c(rep(c(1, 3, 6), 3), 6, 6), id = 1:11)
d2 <- data.table(id = 1:2, val = c(4, 2))
d3 <- data.table(x = rep(c("a", "b", "c"), each = 3), y = c(6, 1, 3), id = 1:9)
test(1845.1, d1[d2, id, on = .(y >= val)], INT(3,6,9,10,11,2,3,5,6,8,9,10,11))
test(1845.2, d3[d2, id, on = .(y >= val)], INT(1,4,7,1,3,4,6,7,9))
# fix for non-equi join issue #1986, when d2 is 0-row data.table, thanks @ebs238 for the nice minimal example
d1 <- data.table(a=1L, b=2L)
d2 <- d1[0L]
test(1846.1, d2[d1, on=.(a>a, b<b)], d1)
test(1846.2, d2[d1, on=.(a>a)], data.table(a=1L, b=NA_integer_, i.b=2L))
# Fix for non-equi join issue #2360: issue in recreating new indices following non-equi binary merge
d <- data.table(
index = as.Date(c("2017-08-25", "2017-08-28", "2017-08-29", "2017-08-30", "2017-08-31", "2017-09-01",
"2017-09-05", "2017-09-06", "2017-09-07", "2017-09-08", "2017-09-11", "2017-09-12",
"2017-09-13")),
High = c(52.85, 51.81, 51.86, 52.29, 52.59, 52.77, 51.9, 50.77, 50.69, 50.45, 50.67, 51.07, 51.105),
FwdHi = c(51.81, 51.86, 52.29, 51.9, 50.77, 50.69, 50.45, 50.45, 50.45, 50.67, NA, NA, NA))
setkey(d, index)[, id := .I] # adding rowid for easily writing tests
# [1L] is to ensure non-matching rows are also included; they will be NA in the result
ans1 <- d[d, id[which.min(x.index)[1L]], on = .(index > index), by = .EACHI]$V1
ans2 <- d[d, id[which.min(x.index)[1L]], on = .(FwdHi > High), by = .EACHI]$V1
ans3 <- d[d, id[which.min(x.index)[1L]], on = .(index > index, FwdHi > High), by = .EACHI]$V1
test(1847.1, ans1, INT(2:13,NA))
test(1847.2, ans2, INT(NA, 2:3, NA, NA, NA, 3, rep(1, 6)))
test(1847.3, ans3, INT(NA, 3:4, rep(NA, 10)))
# Adding test for #2275, same fix for #2360 above also takes care of this, i.e., with nqRecreateIndices
rand_strings = function(n) {
M = matrix( sample(11, n*5, replace=TRUE), nrow=n ) # 11 for letters [a-k]; 5 for nchar of each string
apply(M, 1, function(x) paste0(letters[x], collapse=""))
}
set.seed(123) # the random data here doesn't match the data in issue 2275 because they used stringi::stri_rand_strings which has a different RNG
n = 100000
DT1 = data.table(RANDOM_STRING = rand_strings(n),
DATE = sample(seq(as.Date('2016-01-01'), as.Date('2016-12-31'), by="day"), n, replace=TRUE))
DT2 = data.table(RANDOM_STRING = rand_strings(n),
START_DATE = sample(seq(as.Date('2015-01-01'), as.Date('2017-12-31'), by="day"), n, replace=TRUE))
DT2[, EXPIRY_DATE := START_DATE + floor(runif(1000, 200,300))]
DT1[, DT1_ID := .I][, DATE := as.Date(DATE)]
cols = c("START_DATE", "EXPIRY_DATE")
DT2[, DT2_ID := .I][, (cols) := lapply(.SD, as.Date), .SDcols=cols]
ans1 = DT2[DT1, on=.(RANDOM_STRING, START_DATE <= DATE, EXPIRY_DATE >= DATE), .N, by=.EACHI ]$N > 0L
tmp = DT1[DT2, on=.(RANDOM_STRING, DATE >= START_DATE, DATE <= EXPIRY_DATE), which=TRUE, nomatch=0L]
ans2 = DT1[, DT1_ID %in% tmp]
test(1848.1, ans1, ans2)
# Fix for #4388; related to #2275 fix
x <- data.table(id = "a", t = as.ITime(c(31140L, 31920L, 31860L, 31680L, 31200L, 31380L, 31020L, 31260L, 31320L, 31560L, 31080L, 31800L, 31500L, 31440L, 31740L, 31620L)), s = c(37.19, 37.10, 37.10, 37.10, 37.1, 24.81, 61.99, 37.1, 37.1, 37.38, 49.56, 73.89, 37.38, 24.81, 37.01, 37.38), val = c(40L, 53L, 52L, 49L, 41L, 44L, 38L, 42L, 43L, 47L, 39L, 51L, 46L, 45L, 50L, 48L))
y <- data.table(id = c("a", "b"), t1 = as.ITime(c(31020L, 42240L)), t2 = as.ITime(c(31920L, 43140L)), s1 = c(0L, 0L),
s2 = c(200, 200))
# testing that it doesn't segfault
test(1848.2, x[y, on=.(id, s >= s1, s <= s2, t >= t1, t <= t2), .(val), by=.EACHI, nomatch=0L, allow.cartesian=TRUE]$val, x$val)
# when the last field is quoted, contains sep, and select= is used too, #2464
test(1849.1, fread('Date,Description,Amount,Balance\n20150725,abcd,"$3,004","$5,006"', select=c("Date", "Description", "Amount")),
data.table(Date=20150725L,Description="abcd",Amount="$3,004"))
test(1849.2, fread('Date,Description,Amount,Balance\n20150725,abcd,"$3,004","$5,006"', select=c("Date", "Description", "Balance")),
data.table(Date=20150725L,Description="abcd",Balance="$5,006"))
test(1849.3, fread('Date,Description,Amount,Balance\n20150725,abcd,"$3,004","$5,006"\n', select=c("Date", "Description", "Amount")),
data.table(Date=20150725L,Description="abcd",Amount="$3,004"))
test(1849.4, fread('Date,Description,Amount,Balance\n20150725,abcd,"$3,004","$5,006"\n', select=c("Date", "Description", "Balance")),
data.table(Date=20150725L,Description="abcd",Balance="$5,006"))
cat('Date,Description,Amount,Balance\n20150725,abcd,"$3,004","$5,006"', file=f<-tempfile())
test(1849.5, fread(f,verbose=TRUE),
data.table(Date=20150725L,Description="abcd",Amount="$3,004", Balance="$5,006"),
output="File ends abruptly with '\"'")
test(1849.6, fread(f, select=c("Date", "Description", "Amount")),
data.table(Date=20150725L,Description="abcd",Amount="$3,004"))
test(1849.7, fread(f, select=c("Date", "Description", "Balance")),
data.table(Date=20150725L,Description="abcd",Balance="$5,006"))
cat('\n', file=f, append=TRUE)
test(1849.8, fread(f, select=c("Date", "Description", "Amount")),
data.table(Date=20150725L,Description="abcd",Amount="$3,004"))
test(1849.9, fread(f, select=c("Date", "Description", "Balance")),
data.table(Date=20150725L,Description="abcd",Balance="$5,006"))
unlink(f)
# segfault when rbindlist is asked to create a DT with more than 2bn rows
DT = data.table(1:1e6)
L = vector("list", 2148)
for (i in seq_along(L)) L[[i]] = DT # many references to the same DT to avoid actually using large RAM for this test
test(1850, rbindlist(L), error="Total rows in the list is 2148000000 which is larger than the maximum number of rows, currently 2147483647")
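# Arithmetic sketch behind the error above (not a numbered test): 2148 copies of a 1e6-row table is 2,148,000,000 rows, just over .Machine$integer.max
stopifnot(2148 * 1e6 == 2148000000, 2148 * 1e6 > .Machine$integer.max)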
rm(list=c("L","DT"))
gc()
# by=.EACHI missings to list columns, #2300
dt = data.table(a=factor(1:5, levels=1:10), b=as.list(letters[1:5]))
dt2 = data.table(a=as.factor(1:10))
test(1851.1, dt[dt2, .SD, by=.EACHI, on="a", .SDcols="b"],
data.table(a=as.factor(1:10), b={tmp=vector("list",10); tmp[1:5]=as.list(letters[1:5]); tmp}))
test(1851.2, data.table(a = 1:2, b = list("yo", NULL))[.(1:3), on=.(a), x.b, by = .EACHI],
data.table(a = 1:3, x.b = list("yo", NULL, NULL)))
# test that indices are only used on exact name match, #2465
DT1 <- data.table(colname=c("test1","test2","test2","test3"), colname_with_suffix=c("other","test","includes test within","other"))
DT2 <- data.table(lookup=c("test1","test2","test3"), lookup_result=c(1,2,3))
DT1[colname_with_suffix == "not found", ] # automatically creates index on colname_with_suffix
target <- data.table(colname = c("test1", "test2", "test2", "test3"), colname_with_suffix = c("other", "test", "includes test within", "other"), lookup_result = c(1,2,2,3))
target[colname_with_suffix == "not found", ]
test(1852, DT1[DT2, lookup_result := i.lookup_result, on=c("colname"="lookup")], target)
# test that joins don't change row order when there is an index with additional columns present, #2559
dt <- data.table(x = c(1,1), y = c(2,1))
setindex(dt, x, y)
test(1852.1, dt[J(x=1), on = "x==x"], setindex(dt, NULL))
# NA column names and missing new argument to setnames, #2475
DT = setNames(data.frame(a = 1, b = 2, c = 3, d = 4), c(NA, "b", "c", NA))
setnames(DT, c('a', 'b', 'c', 'd'))
test(1853, names(DT), c('a', 'b', 'c', 'd'))
# CJ bug with multiple empty vectors (#2511)
test(1854.1, data.frame(CJ(x = integer(0))), setattr(expand.grid(x = integer(0)), "out.attrs", NULL))
test(1854.2, data.frame(CJ(x = integer(0), y = character(0))), setattr(expand.grid(x = integer(0), y = character(0)), "out.attrs", NULL))
test(1854.3, data.frame(CJ(x = integer(0), y = c("a", "b"))), setattr(expand.grid(x = integer(0), y = c("a", "b")), "out.attrs", NULL))
test(1854.4, data.frame(CJ(x = integer(0), y = character(0), z = logical(0))), setattr(expand.grid(x = integer(0), y = character(0), z = logical(0)), "out.attrs", NULL))
test(1854.5, data.frame(CJ(x = character(0), y = NA_real_)), setattr(expand.grid(x = character(0), y = NA_real_), "out.attrs", NULL))
if (test_bit64) {
test(1854.6, data.frame(CJ(x = integer64(0), y = as.integer64(2))), setattr(expand.grid(x = integer64(0), y = as.integer64(2)), "out.attrs", NULL))
}
# Blank lines are valid CSV format for single-column datasets and should read smoothly by default, #2516.
# Only for ncol>1 are blank lines not valid (the correct number of commas should be present) and so a warning (at least) should occur for those.
# This means trailing newlines are now significant for single-column data (so that fread(fwrite(DT))==DT), but not significant when ncol>1.
test(1855.1, fread("A\n"), data.table(A=logical()))
test(1855.2, fread("A\n\n"), data.table(A=NA))
test(1855.3, fread("A\n\n\n"), data.table(A=c(NA,NA)))
test(1855.4, fread("A\n1\n2\n\n\n3\n"), data.table(A=c(1L,2L,NA,NA,3L)))
test(1855.5, fread("A\n1\n2\n\n\n3\n\n"), data.table(A=c(1L,2L,NA,NA,3L,NA)))
test(1856.1, fread("A,B\n"), ans<-data.table(A=logical(), B=logical()))
test(1856.2, fread("A,B\n\n"), ans)
test(1856.3, fread("A,B\n\n\n"), ans)
test(1856.4, fread("A,B\n3,4\n\n\n"), data.table(A=3L, B=4L))
test(1856.5, fread("A,B\n3,4\n,\n\n\n"), data.table(A=c(3L,NA), B=c(4L,NA)))
test(1856.6, fread("A,B\n3,4\n\n5,6\n"), data.table(A=3L, B=4L), warning="Discarded single-line footer: <<5,6>>")
if (test_R.utils) test(1856.7, fread(testDir("test0.txt.bz2"))[c(1,997,998,999)], data.table(x0=c(656609L, NA, -2368L, 955199L))) # issue 2515
DTs = list( # passed fread(fwrite(DT))==DT before fix?
data.table(A=logical(0)), # yes
data.table(A=NA), # no
data.table(A=c(NA,NA)), # no
data.table(A=c(1L,2L,NA,NA,3L)), # no
data.table(A=c(1L,2L,NA,NA,3L,NA)), # no
data.table(A=logical(0), B=logical(0)), # yes
data.table(A=3L, B=4L), # yes
data.table(A=c(3L,NA), B=c(4L,NA)), # yes
data.table(A=c(3L,NA,5L), B=c(4L,NA,6L)), # yes
data.table(A=c(3L,NA,5L,NA), B=c(4L,NA,6L,NA)) # yes
)
f = tempfile()
for (i in seq_along(DTs)) {
fwrite(DTs[[i]], file=f)
test(1857.0 + i/100, fread(f), DTs[[i]])
}
unlink(f)
# quoted fields followed by whitespace, #2520
test(1858, fread('B,C\n"12" ,15\n"13" ,18\n"14" ,3'), data.table(B=c(12L, 13L, 14L), C=c(15L, 18L, 3L)))
test(1859, fread("A\n", nrows=0), data.table(A=logical())) # 2512
test(1860, fread("A,B\n "), data.table(A=logical(), B=logical())) # 2543
# That unique(DT) returns DT when there are no dups, #2013
# #3383 fix for unique.data.table returning copy(x)
DT = data.table(A=c(1L,1L,2L), B=c(3L,4L,4L))
test(1861, address(unique(DT)) != address(DT), TRUE)
# New warning for deprecated old behaviour option
setkey(DT,A)
test(1862.1, unique(DT), DT)
test(1862.2, unique(DT,by=key(DT)), data.table(A=1:2, B=3:4, key="A"))
# fix for -ve indices issue in gmedian (2046) and gvar (2111)
DT = data.table(x=c(1,1,1),y=c(3,0,0), key="x")
test(1863.1, DT[, median(y), by=x], data.table(x=1, V1=0, key="x"))
DT = data.table(col1 = c(1,1,1, 2,2,2), col2 = c(2,2,2,1,1,1), ID = c(rep(1,3), rep(2,3)), key="ID")
test(1863.2, DT[, lapply(.SD, var), by=ID], data.table(ID=c(1,2), col1=0, col2=0, key="ID"))
# Fix the bug when keys contain non-UTF-8 strings #2566 #2462 #1826
utf8_strings = c("\u00e7ile", "fa\u00e7ile", "El. pa\u00c5\u00a1tas", "\u00a1tas", "\u00de")
latin1_strings = iconv(utf8_strings, from = "UTF-8", to = "latin1")
mixed_strings = c(utf8_strings, latin1_strings)
DT1 = data.table(x = mixed_strings, y = c(latin1_strings, utf8_strings), z = 1:10)
DT2 = copy(DT1)
setkey(DT1, x)
setkey(DT2, y)
# the ans is generated by `sort(c(utf8_strings, utf8_strings), method = "radix")`
# but we should not use radix sort in the test because it was introduced after R 3.3.0
ans = c("El. pa\u00c5\u00a1tas", "El. pa\u00c5\u00a1tas", "fa\u00e7ile", "fa\u00e7ile",
"\u00a1tas", "\u00a1tas", "\u00de", "\u00de", "\u00e7ile", "\u00e7ile")
test(1864.1, DT1$x, ans)
test(1864.2, DT2$y, ans)
ans = c(1L, 6L, 2L, 7L, 3L, 8L, 4L, 9L, 5L, 10L)
test(1864.3, DT1[c(utf8_strings, latin1_strings), z], c(ans, ans))
test(1864.4, DT2[c(utf8_strings, latin1_strings), z], c(ans, ans))
# memory exception under asan if there's an extra comma out-of-sample, #2523
data = rep("a,b,c,d,e,f,g", 2100)
data[111] = "a,b,c,d,e,f,g,"
cat(data, file=(f<-tempfile()), sep="\n")
test(1865, fread(f, header=FALSE),
data.table(V1=rep("a",110),V2="b",V3="c",V4="d",V5="e",V6="f",V7="g"),
warning="Stopped early on line 111. Expected 7.*found 8.*discarded.*<<a,b,c,d,e,f,g,>>")
unlink(f)
# "Natural" provision of value.name in measure.vars list, #1547 and #2551
DT = data.table(
meas1_jan = 0.45, meas1_feb = 0.38, meas1_mar = 0.62,
meas2_jan = 0.42, meas2_feb = 0.48, meas2_mar = 0.46,
meas3_jan = 0.54, meas3_feb = 0.47
)
DTout = data.table(
variable = factor(1:3),
jan = c(0.45, 0.42, 0.54),
feb = c(0.38, 0.48, 0.47),
mar = c(0.62, 0.46, NA)
)
test(1866.1, melt(DT, measure.vars = patterns(jan="_jan", feb="_feb", mar="_mar")), DTout)
mvlist = list(
jan = sprintf('meas%d_jan', 1:3),
feb = sprintf('meas%d_feb', 1:3),
mar = sprintf('meas%d_mar', 1:2)
)
test(1866.2, melt(DT, measure.vars = mvlist), DTout)
test(1866.3, melt(DT, measure.vars = mvlist, value.name = c('a', 'b', 'c')),
DTout, warning = 'value.name.*given precedence')
names(mvlist) = NULL
names(mvlist)[1L] = 'jan' # NA names
test(1866.4, melt(DT, measure.vars = mvlist), error = 'Please provide a name')
names(mvlist) = NULL
names(mvlist) = c('jan', '', '') #partially-missing names
test(1866.5, melt(DT, measure.vars = mvlist), error = 'Please provide a name')
# previously untested behavior used in ?patterns
DT = data.table(x1=1:5, x2=6:10, y1=letters[1:5], y2=letters[6:10])
DTout = data.table(
variable = factor(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L)),
value1 = 1:10,
value2 = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j")
)
test(1866.6, melt(DT, measure.vars = patterns("^x", "^y", cols=names(DT))), DTout)
# auto fill too few column names (#1625) and auto fill=TRUE when too many column names
test(1867.01, fread("A,B\n1,3,5,7\n2,4,6,8\n"), data.table(A=1:2, B=3:4, V3=5:6, V4=7:8),
warning="Detected 2 column names but.*4.*Added 2 extra default column names at the end[.]")
test(1867.02, fread("A,B,C,D,E\n1,3,5,7\n2,4,6,8\n"), data.table(A=1:2, B=3:4, C=5:6, D=7:8, E=NA),
warning="Detected 5.*but.*4.*Filling rows automatically.")
if (test_R.utils) {
test(1867.03, fread(testDir("fillheader.csv.bz2"))[c(1,.N), c(1,29,30)], data.table("V1"="Ashburton District", EASTING=c(5154177L,5144032L), NORTHING=NA),
warning="Detected 29.*but.*30.*Added 1 extra default column name.*guessed to be row names or an index.*valid file")
# in this unusual case, every data row has a trailing comma but the column names do not. So the guess is wrong; a deliberate choice currently.
}
test(1867.04, fread("A,B\nCol1,Col2,Col3\n1,3,5\n2,4,6\n"), data.table(Col1=1:2, Col2=3:4, Col3=5:6))
test(1867.05, fread("A\nCol1,Col2\n1,3,5\n2,4,6\n"), data.table(V1=1:2, Col1=3:4, Col2=5:6), warning="Added 1 extra default column name.*guessed to be row names or an index")
test(1867.06, fread("Some header\ninfo\nCol1,Col2,Col3\n1,3,5\n2,4,6\n"), data.table(Col1=1:2, Col2=3:4, Col3=5:6))
test(1867.07, fread("Some header\ninfo\n\nCol1,Col2\n1,3,5\n2,4,6\n"), data.table(V1=1:2, Col1=3:4, Col2=5:6), warning="Added 1 extra")
test(1867.08, fread("A,B,C,D,E\n1,3,5\n2,4,6\n"), data.table(A=1:2, B=3:4, C=5:6, D=NA, E=NA), warning="Detected 5.*but.*3.*Filling rows automatically")
test(1867.09, fread("Heading text\nA,B,C,D,E\n1,3,5\n2,4,6\n"), data.table(A=1:2, B=3:4, C=5:6, D=NA, E=NA), warning="Detected 5.*but.*3.*Filling rows automatically")
test(1867.10, fread("Heading text\n1,3,5\n2,4,6\n"), data.table("Heading text"=1:2, V2=3:4, V3=5:6), warning="Added 2 extra default column names at the end")
test(1867.11, fread("A,B\n\n1,3,5\n2,4,6\n"), data.table(V1=1:2, V2=3:4, V3=5:6))
test(1867.12, fread("A\n1,3\n2,4\n"), data.table(V1=1:2, A=3:4), warning="Added 1 extra default column name")
# test from #763, covers #1818 too
DT = data.table(x=rep(c("a","b","c"),each=3), y=c(1L,3L,6L), v=10:18)
write.table(DT, file = (f<-tempfile()), sep = "\t")
test(1867.13, fread(f), data.table(V1=1:9, x=DT$x, y=DT$y, v=DT$v), warning="Added 1 extra default column name")
unlink(f)
# test(1867.14, fread(testDir("iterations.txt.bz2")))
# non-equi joins bug fix #2313
dt <- data.table(
patient.id = c(1L, 2L, 1L, 1L, 2L, 2L, 2L),
h.date = as.Date(c("2013/10/15", "2014/10/15", "2015/7/16", "2016/1/7",
"2015/12/20", "2015/12/25", "2016/2/10")))
setorder(dt)
dt[, `:=`(start.date = h.date - 365, end.date = h.date)]
# The line below would error without this fix
ans <- dt[dt, on = .(patient.id, h.date >= start.date, h.date <= end.date),
.(patient.id, i.start.date, i.end.date, g = .GRP, .N, x.h.date),
by=.EACHI]
test(1868, "x.h.date" %in% names(ans), TRUE)
# \r\r\r in single column files, #2542
test(1869.1, fread("A\r1\r\r\r2\r"), data.table(A=c(1L,NA,NA,2L)))
test(1869.2, fread("A\r1\r\r\r2\r\r"), data.table(A=c(1L,NA,NA,2L,NA)))
test(1869.3, fread("A\r1\r\r\r2\r\r\r"), data.table(A=c(1L,NA,NA,2L,NA,NA)))
test(1869.4, fread("A,B\r2,3\r,\r,\r4,5\r\r"), data.table(A=c(2L,NA,NA,4L), B=c(3L,NA,NA,5L)))
test(1869.5, fread("A,B\r2,3\r\r,\r2,4\r\r"), data.table(A=2L, B=3L), warning="Stopped.*line 3. Expected 2 fields but found 0.*First discarded non-empty line: <<,>>") # two line footer because of the comma
if (test_R.utils) {
test(1869.6, fread(testDir("colnames4096.csv.bz2"), verbose=TRUE)[,c(1,2,585,586)],
data.table(Foo000=logical(), Bar001=logical(), Foo584=logical(), B=logical()),
output = "Copying file in RAM.*file is very unusual.*ends abruptly.*multiple of 4096")
test(1869.7, fread(testDir("onecol4096.csv.bz2"), verbose=TRUE)[c(1,2,245,246,249,255:.N),],
data.table(A=c("FooBarBazQux000","FooBarBazQux001","","FooBarBazQux245","","FooBarBazQux254","FooBarBazQux","FooBarBaz12","FooBarBazQux256","","","")),
output = "Copying file in RAM.*file is very unusual.*one single column, ends with 2 or more end-of-line.*and is a multiple of 4096")
}
# better colname detection by comparing potential column names to the whole sample not just the first row of the sample, #2526
test(1870.1, fread("A,100,200\n,300,400\n,500,600"), data.table(A=NA, "100"=c(300L,500L), "200"=c(400L,600L)))
test(1870.2, fread("A,100,\n,,\n,500,600"), data.table(A=NA, "100"=c(NA,500L), V3=c(NA,600L)))
test(1870.3, fread("A,B,\n,,\n,500,3.4"), data.table(A=NA, B=c(NA,500L), V3=c(NA,3.4)))
# nrows= now ignores errors after those nrows as expected, and skip= determines the first row for sure, #1267
txt = "V1, V2, V3\n2,3,4\nV4, V5, V6, V7\n4,5,6,7\n8,9,10,11\n"
test(1871.01, fread(txt), data.table(V1=2L, V2=3L, V3=4L), warning="Stopped early on line 3. Expected 3 fields but found 4.*First discarded.*V4, V5")
test(1871.02, fread(txt, skip=2), ans<-data.table(V4=INT(4,8), V5=INT(5,9), V6=INT(6,10), V7=INT(7,11)))
test(1871.03, fread(txt, skip=2, nrows=1L), ans[1,])
test(1871.04, fread(txt, skip=2, nrows=3L), ans)
test(1871.05, fread(txt, skip=3), ans <- data.table(V1=INT(4,8), V2=INT(5,9), V3=INT(6,10), V4=INT(7,11)))
test(1871.06, fread(txt, skip=3, nrows=1L), ans[1,])
test(1871.07, fread(txt, nrows=1), data.table(V1=2L, V2=3L, V3=4L))
test(1871.08, fread(txt, skip=0), data.table(V1=2L, V2=3L, V3=4L), warning="Stopped early.*line 3. Expected 3 fields but found 4.*discarded.*<<V4, V5, V6, V7>>")
test(1871.09, fread(txt, skip=0, nrows=1), ans<-data.table(V1=2L, V2=3L, V3=4L))
test(1871.10, fread(txt, skip=0, nrows=1, header=TRUE), ans)
test(1871.11, fread(txt, skip=0, nrows=1, header=FALSE), data.table(V1="V1", V2="V2", V3="V3"))
test(1871.12, fread(txt, skip=0, nrows=2, header=FALSE), data.table(V1=c("V1","2"), V2=c("V2","3"), V3=c("V3","4")))
test(1871.13, fread("A\n100\n200", verbose=TRUE), data.table(A=c(100L,200L)), output="All rows were sampled since file is small so we know nrow=2 exactly")
test(1871.14, fread("col1, col2, col3\n1, 2, 3\n3, 5, 6\n7, 8, 9\n\nsome text to ignore", nrows = 3L), data.table(col1=INT(1,3,7), col2=INT(2,5,8), col3=INT(3,6,9))) # from #1671 (no warning expected)
for (i in 100:1) {
lines <- paste(c(rep("2,3,4",i), "2,3"), collapse='\n')
test(1871.2 + (100-i)/1000, fread(lines, nrows=i), data.table(V1=rep.int(2L,i), V2=3L, V3=4L))
}
# miscellaneous missing tests uncovered by CodeCov difference
# in the process of PR #2573
## data.table cannot recycle complicated types
short_s4_col = getClass("MethodDefinition")
test(1872.01, data.table(a = 1:4, short_s4_col), error="attempt to replicate an object of type 'S4'")
## i must be a data.table when on is specified
DT = data.table(a = 1:3)
test(1872.02, DT[c(TRUE, FALSE), on = 'coefficients'], error = "not a data.table, but 'on'")
## missing tests for round.IDate
test_dates = c(
"2017-01-05", "2017-08-04", "2017-06-05", "2017-04-15",
"2017-06-11", "2017-10-04", "2017-04-19", "2017-01-11",
"2017-03-08", "2017-10-10"
)
test_dates = as.IDate(test_dates)
test(1872.03, round(test_dates, 'weeks'),
structure(c(17167L, 17377L, 17321L, 17272L, 17328L,
17440L, 17272L, 17174L, 17230L, 17447L),
class = c("IDate", "Date")))
test(1872.04, round(test_dates, 'months'),
structure(c(17167L, 17379L, 17318L, 17257L, 17318L,
17440L, 17257L, 17167L, 17226L, 17440L),
class = c("IDate", "Date")))
test(1872.05, round(test_dates, 'quarters'),
structure(c(17167L, 17348L, 17257L, 17257L, 17257L,
17440L, 17257L, 17167L, 17167L, 17440L),
class = c("IDate", "Date")))
test(1872.06, round(test_dates, 'years'),
structure(c(17167L, 17167L, 17167L, 17167L, 17167L,
17167L, 17167L, 17167L, 17167L, 17167L),
class = c("IDate", "Date")))
test(1872.07, round(test_dates, 'centuries'),
error = 'should be one of')
## missing a test of mday
test(1872.08, mday(test_dates),
c(5L, 4L, 5L, 15L, 11L, 4L, 19L, 11L, 8L, 10L))
## META TEST of helper function compactprint from test.data.table
DT = data.table(a = 1, b = 2, key = 'a')
DT_out = gsub('\\s+$', '', capture.output(compactprint(DT)))
test(1872.09, DT_out,
c(" a b [Key=a Types=dou,dou Classes=num,num]",
"1: 1 2"))
## Test as-yet unimplemented features of foverlaps
x = data.table(start=c(5,31,22,16), end=c(8,50,25,18), val2 = 7:10)
y = data.table(start=c(10, 20, 30), end=c(15, 35, 45), val1 = 1:3)
setkey(y, start, end)
test(1872.10, foverlaps(x, y, maxgap = 2), error = 'maxgap and minoverlap.*not yet')
test(1872.11, foverlaps(x, y, minoverlap = 2), error = 'maxgap and minoverlap.*not yet')
## tests of verbose output
### foverlaps
test(1872.12, foverlaps(x, y, verbose = TRUE),
output = 'unique.*setkey.*operations.*binary search')
### [.data.table
X = data.table(x=c("c","b"), v=8:7, foo=c(4,2))
DT = data.table(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9)
test(1872.13, DT[X, on=.(x, v>=v), verbose = TRUE],
output = 'Non-equi join operators.*forder took.*group lengths.*done.*non-equi group ids.*done')
# out-of-sample bump from int to quoted field containing comma, #2614
DT = data.table(A=rep(10L, 2200), B="20")
DT[111, B:="3,456"]
fwrite(DT,f<-tempfile())
test(1873, fread(f), DT)
unlink(f)
# Better jump sync and run-on in PR#2627
#
# Reproduces error 'did not finish exactly where jump 1 found ...' in #2561 in master before PR #2627
# the jump point is just before an empty line and the nextGoodLine() wasn't sync'd properly
x = sprintf("ABCDEFGHIJKLMNOPQRST%06d", 1:102184)
x[51094]=""
cat(x, file=f<-tempfile(), sep="\n")
test(1874.1, fread(f,header=FALSE,verbose=TRUE)[c(1,51094,.N),],
data.table(V1=c("ABCDEFGHIJKLMNOPQRST000001","","ABCDEFGHIJKLMNOPQRST102184")),
output="jumps=[0..2)") # ensure jump 1 happened
#
# out-of-sample short lines in the first jump, not near the jump point
x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184)
x[5021:5041] = "small,batch,short,lines" # 4 fields not 5
cat(x, file=f, sep="\n")
test(1874.2, fread(f), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:5020),
warning="Stopped early on line 5021.*<<small,batch,short,lines>>")
test(1874.3, fread(f,fill=TRUE,verbose=TRUE)[c(1,5020,5021,5041,5042,.N),],
data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"),
V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"),
V3=c("KLMN","KLMN","short","short","KLMN","KLMN"),
V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"),
V5=c(1L,5020L,NA,NA,5042L,102184L)),
output="jumps=[0..2)")
#
# jump just before a set of 30 or more too-few lines, to reproduce "No good line could be found" error in #2267
# confirmed fails in master with that error before PR#2627
x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184)
x[51094:51150] = "small,batch,short,lines" # 4 fields not 5
cat(x, file=f, sep="\n")
test(1874.4, fread(f,verbose=TRUE), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:51093),
warning="Stopped early on line 51094.*<<small,batch,short,lines>>",
output="jumps=[0..2)")
test(1874.5, fread(f,fill=TRUE,verbose=TRUE)[c(1,51093,51094,51150,51151,.N),],
data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"),
V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"),
V3=c("KLMN","KLMN","short","short","KLMN","KLMN"),
V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"),
V5=c(1L,51093L,NA,NA,51151L,102184L)),
output="jumps=[0..2)")
#
# jump inside a quoted field containing many new lines, to simulate a dirty jump
# we'll make this jump landing even harder for nextGoodLine() by making the lines resemble the number and types of the true lines, too.
# Rather than needing to make nextGoodLine() better and better (at some point it's impossible), in these rare cases we'll just sweep dirty jumps.
x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184)
x[51093] = "\"A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n\",FGHI,KLMN,PQRS,51093"
cat(x, file=f, sep="\n")
test(1875.6, fread(f,verbose=TRUE)[c(1,51092:51094,.N),][3,V1:=gsub("\r","",V1)], # gsub since R on Windows replaces \n with \r\n
data.table(V1=c("ABCD","ABCD", "A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n", "ABCD","ABCD"),
V2="FGHI", V3="KLMN", V4="PQRS", V5=c(1L,51092:51094,102184L)),
output = "too-few.*sample jump 50.*jump landed awkwardly.*skipped.*Read the data.*jumps=\\[0..2\\).*jumps=\\[1..2\\).*Reading 2 chunks \\(1 swept\\)")
# Aside: although the file (with over 100,000 lines) is big enough for 100 sampling jumps (of which just 1, the middle sample jump, is skipped), it's
# still too small for more than 2 reading chunks to be worth it, which is correct (based on buffMB, not nth)
unlink(f)
test(1876, fread("http://hkhfsk\nhttp://fhdkf\nhttp://kjfhskd\nhttp://hfkjf", header=FALSE), # data not a download, #2531
data.table(V1=c("http://hkhfsk","http://fhdkf","http://kjfhskd","http://hfkjf")))
# segfault with setattr() of "class" attribute to 0-length value, #2386
test(1877.1, attr(setattr(data.table(x = 1:10), "class", NULL), "class"), NULL) # ok before
test(1877.2, attr(setattr(data.table(x = 1:10), "class", character()), "class"), NULL) # now ok (was segfault)
test(1877.3, attr(setattr(data.table(x = 1:10), "test", character()), "test"), character(0)) # ok before
# In dev 1.10.5 these were parsed as floats, #2625. Caught before release to CRAN.
test(1878, fread("A,B,C,D,E\n.,+.,.e,.e+,0e\n"), data.table(A=".", B="+.", C=".e", D=".e+", E="0e"))
# assortment of tests from #2572
## negative indexing should retain key
DT = data.table(a = c(5, 5, 7, 2, 2),
b = 1:5, key = 'a')
test(1879.1, key(DT[-c(2, 3)]), 'a')
test(1879.2, key(DT[-(1:5)]), 'a')
test(1879.3, key(DT[-2, sum(b), by = a]), 'a')
## behavior of out-of-bound subsets
## (mixed +/- already covered in 1043)
test(1879.4, DT[3:6],
data.table(a = c(5, 5, 7, NA),
b = c(1L, 2L, 3L, NA)))
test(1879.5, DT[0:5], DT)
## if fread bumps logical to character,
## the original string representation should be kept
DT = data.table(A=rep("True", 2200), B="FALSE", C='0')
DT[111, LETTERS[1:3] := .("fread", "is", "faithful")]
fwrite(DT, f<-tempfile())
test(1879.6, fread(f, verbose=TRUE, logical01=TRUE), DT,
output="Column 1.*bumped from 'bool8' to 'string'.*\nColumn 2.*bumped from 'bool8' to 'string'.*\nColumn 3.*bumped from 'bool8' to 'string'")
unlink(f)
# Fix duplicated names arising in merge when by.x in names(y), PR#2631, PR#2653
# 1880.1 should fail if there are any duplicate names after a join
# 1880.2 should fail if a warning is not thrown when suffixes leads to duplicate names
# 1880.3 tests no.dups = FALSE, where names should be duplicated after the join
parents = data.table(name=c("Sarah", "Max"), sex=c("F", "M"), age=c(41, 43))
children = data.table(parent=c("Sarah", "Max", "Max"),
name=c("Oliver", "Sebastian", "Michelle"),
sex=c("M", "M", "F"), age=c(5,8,7))
joined = merge(parents, children, by.x="name", by.y="parent")
test(1880.1, length(names(joined)), length(unique(names(joined))))
test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""))), 3L,
warning = "column names.*are duplicated in the result")
joined = suppressWarnings(merge(parents, children, by.x="name", by.y="parent", no.dups=FALSE))
test(1880.3, any(duplicated(names(joined))), TRUE)
# out-of-sample quote rule bump, #2265
DT = data.table(A=rep("abc", 10000), B="def")
DT[110, A:='"a"b']
fwrite(DT, f<-tempfile(), quote=FALSE)
test(1881.1, ans<-fread(f), DT, warning='Found and resolved improper quoting out-of-sample. First healed line 111: <<"a"b,def>>')
test(1881.2, ans[110,A], '"a"b') # double-check the value of interest directly
cat("(10000 rows)\n", file=f, append=TRUE)
test(1881.3, fread(f), DT, warning=c('Discarded single-line footer: <<(10000 rows)>>',
'Found and resolved improper quoting out-of-sample. First healed line 111: <<"a"b,def>>'))
unlink(f)
# CJ should fail with a proper error message, #2636
test(1882.1, .Machine$integer.max, 2147483647L) # same on all platforms and very unlikely to change in R (which is good)
test(1882.2, ceiling(.Machine$integer.max^(1/3)), 1291)
v = seq_len(1291L)
test(1882.3, CJ(v, v, v), error="Cross product of elements provided to CJ() would result in 2151685171 rows which exceeds .Machine$integer.max == 2147483647")
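# Arithmetic sketch behind the error above (not a numbered test): 1291 is the smallest n whose cube exceeds .Machine$integer.max
stopifnot(1291^3 == 2151685171, 1290^3 <= .Machine$integer.max, 1291^3 > .Machine$integer.max)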
# no re-read for particular file, #2509
if (test_R.utils) test(1883, fread(testDir("SA2-by-DJZ.csv.gz"), verbose=TRUE, header=FALSE)[c(1,2,1381,.N),],
data.table(V1=c("Goulburn","","",""), V2=c("110018063","110018064","0&&&&&&&&","0@@@@@@@@"), V3=INT(3499,812,250796,7305367), V4=NA),
warning='Stopped early on line 1394.*First discarded non-empty line: <<"Dataset: 2011 Census of Population and Housing">>',
output="Rereading 0 columns")
# sep=NULL with quoted fields, #2548
test(1884, fread('"A","B"\n', sep=NULL), data.table('"A","B"'=logical()))
# sep=' ' and blank.lines.skip, #2535
test(1885.1, fread(txt<-"a b 2\nc d 3\n\ne f 4\n", blank.lines.skip=TRUE), ans<-data.table(V1=c("a","c","e"), V2=c("b","d","f"), V3=2:4))
test(1885.2, fread(txt, blank.lines.skip=TRUE, fill=TRUE), ans)
test(1885.3, fread(txt, fill=TRUE), ans[c(1,2,NA,3),][3,1:2:=""])
test(1885.4, fread(txt, fill=TRUE, na.strings=""), ans[c(1,2,NA,3),])
# file with no header detected automatically
# (TODO: undoubling double quotes #1109, #1299) but otherwise auto mode is correct
test(1886, fread(testDir("quoted_no_header.csv"))[c(1,.N),list(V1,V6)], data.table(V1=c("John","Joan \"\"the bone\"\", Anne"), V6=INT(8075,123)))
# na.omit with invert & no NAs works, #2660
DT = data.table(a = 1:5)
test(1887.1, na.omit(DT), DT)
test(1887.2, na.omit(DT, invert=TRUE), DT[0L])
DT = fread(",2,3\n1,,3\n1,2,\n") # all rows contain an NA, #2784
test(1887.3, na.omit(DT), DT[0L])
test(1887.4, na.omit(DT, invert=TRUE), DT)
x = runif(1e4)
test(1888, fsort(x), base::sort(x))
test(1888.1, fsort(x, decreasing = TRUE), base::sort(x, decreasing = TRUE),
warning = "New parallel sort has not been implemented for decreasing=TRUE.*one thread")
x <- c(x, NA_real_)
test(1888.2, fsort(x, na.last = TRUE), base::sort(x, na.last = TRUE),
warning = "New parallel sort has not been implemented for vectors containing NA values so far.*Using one thread")
test(1888.3, fsort(x, na.last = FALSE), base::sort(x, na.last = FALSE),
warning = "New parallel sort has not been implemented for vectors containing NA values so far.*Using one thread")
test(1888.4, fsort(x, decreasing = TRUE, na.last = TRUE), base::sort(x, decreasing = TRUE, na.last = TRUE),
warning = "New parallel sort has not been implemented for decreasing=TRUE so far.*Using one thread")
x <- as.integer(x)
test(1888.5, fsort(x), base::sort(x, na.last = FALSE),
warning = "Input is not a vector of type double. New parallel sort has only been done for double vectors so far.*Using one thread")
x = runif(1e6)
test(1888.6, y<-fsort(x,verbose=TRUE), output="nth=.*Top 5 MSB counts")
test(1888.7, !base::is.unsorted(y))
test(1888.8, fsort(x,verbose=1), error="verbose must be TRUE or FALSE")
rm(x)
# doubling of savetl buffer (currently starts with 100)
x = as.character(as.hexmode(1:1000))
for (i in x) assign(i, 1L)
test(1889, chmatch(x,x), 1:1000)
rm(list=x)
gc()
# test DT$<- in a data.table-unaware package
DT = data.table(A=1:5)
test(1890.1, stats::ts.plot(gpars=DT), error="object must have one or more observations")
# Inside ts.plot is a gpars$ylab<- which happens before its error. That dispatches to our $<- which does the alloc.col()
test(1890.2, DT, data.table(A=1:5))
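# A minimal sketch (not a numbered test) of the dispatch exercised above: any DT$col<- assignment,
# even one made inside a data.table-unaware function, goes through data.table's $<- method, which
# re-allocates via alloc.col() so the result stays a valid data.table. 'illDT' is a hypothetical
# name used only for this sketch and removed afterwards.
illDT = data.table(A=1:2)
illDT$B = 3:4
stopifnot(is.data.table(illDT), identical(illDT$B, 3:4))
rm(illDT)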
# na="" default, #2524
test(1891.1, fread('A,B,C\n1,foo,4\n2,,5\n3,bar,6\n', na.strings=""), data.table(A=1:3, B=c("foo",NA,"bar"), C=4:6))
test(1891.2, fread('A,B,C\n1,foo,4\n2,"",5\n3,bar,6\n', na.strings=""), data.table(A=1:3, B=c("foo","","bar"), C=4:6))
test(1891.3, fread("A,B,C\n1,foo,bar\n2", fill=TRUE, na.strings=""), data.table(A=1:2,B=c("foo",NA),C=c("bar",NA)))
test(1891.4, fread("A,B,C\n1,foo,bar\n2", fill=TRUE, na.strings="NA"), data.table(A=1:2,B=c("foo",""),C=c("bar","")))
# preserving "" and NA_character_, #2214
DT = data.table(chr = c(NA, "", "a"), num = c(NA, NA, 2L))
test(1892.1, fread({fwrite(DT,f<-tempfile());f}, na.strings=""), DT); unlink(f)
test(1892.2, capture.output(fwrite(DT, verbose=FALSE)), c("chr,num", ",", "\"\"," , "a,2"))
test(1892.3, fread('A,B\n1,"foo"\n2,\n3,""\n', na.strings="")$B, c("foo", NA, "")) # for issue #2217
# print(DT) should print NA in character columns using <NA> like base R to distinguish from "" and "NA"
DT = data.table(A=1:4, B=c("FOO","",NA,"NA"))
test(1893.1, print(DT), output=txt<-c(" A B", "1: 1 FOO", "2: 2 ", "3: 3 <NA>", "4: 4 NA"))
DF = as.data.frame(DT)
rownames(DF) = paste0(rownames(DF),":")
test(1893.2, print(DF), output=txt)
txt = 'A,B\n109,MT\n7,N\n11,NA\n41,NB\n60,ND\n1,""\n2,\n3,"NA"\n4,NA\n'
test(1893.3, print(fread(txt,na.strings="")), output="A B\n1: 109 MT\n2: 7 N\n3: 11 NA\n4: 41 NB\n5: 60 ND\n6: 1 \n7: 2 <NA>\n8: 3 NA\n9: 4 NA")
# .. prefix on all variables in j=, #2655 in part
DT = data.table(x=1:3, y=4:6, z=7:9)
cols = "z"
test(1894.01, DT[, ..cols], DT[,.(z)])
test(1894.02, DT[, c(..cols, "x")], DT[,.(z,x)])
test(1894.03, DT[, !..cols], DT[,.(x,y)])
..cols = "x"
test(1894.04, DT[, !..cols], DT[,.(x,y)], warning="Both 'cols' and '..cols' exist in calling scope. Please remove the '..cols' variable.*clarity")
rm(..cols)
cols = c("z","x")
test(1894.05, DT[, -..cols], DT[,.(y)])
coef = 10L
test(1894.06, DT[, y*..coef], INT(40,50,60))
test(1894.07, DT[x==2, z*..coef], 80L)
test(1894.08, DT[, sum(y*..coef), by=x%%2L], data.table(x=1:0, V1=INT(100,50)))
if (exists("z")) rm(z)
test(1894.09, DT[, sum(z)*..z], error="Variable 'z' is not found in calling scope. Looking in calling scope because this symbol was prefixed with .. in the j= parameter.")
z = 3L
test(1894.10, DT[, sum(z)*..z], 72L)
setnames(DT, "z", "..z")
test(1894.11, DT[, sum(y)*..z], INT(105,120,135))
rm(z)
test(1894.12, DT[, sum(y)*..z], INT(105,120,135))
setnames(DT, "..z", "z")
test(1894.13, DT[, sum(y)*..z], error="Variable 'z' is not found in calling scope")
..z = 4L
test(1894.14, DT[, sum(y)*..z], 60L) # 'manual' prefix (we have recommended in the past) still works, for now.
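# A minimal sketch (not a numbered test) of what the .. prefix tested above does: the symbol after
# '..' is looked up in the calling scope, so DT[, ..sel] selects the columns named by 'sel',
# equivalent to with=FALSE. 'illDT' and 'sel' are hypothetical names used only here and removed afterwards.
illDT = data.table(p=1:2, q=3:4)
sel = "q"
stopifnot(isTRUE(all.equal(illDT[, ..sel], illDT[, sel, with=FALSE])))
rm(illDT, sel)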
test(1895, getDTthreads(verbose=TRUE), output="num_procs.*R_DATATABLE.*thread_limit.*RestoreAfterFork")
# Non ascii missing protects on ENC2UTF8; issue #2674
utf8_strings = c("\u00e7ile", "fa\u00e7ile", "El. pa\u00c5\u00a1tas", "\u00a1tas", "\u00de")
latin1_strings = iconv(utf8_strings, from = "UTF-8", to = "latin1")
DT = data.table(x = sample(latin1_strings, 1000, replace=TRUE), key = "x")
# The ans below is generated by `sort(utf8_strings, method = "radix")` on R (>= 3.3.0).
# Note that calling `sort(., method = "radix")` should generally be avoided in `data.table`'s tests
# because `data.table` supports any R version >= 3.1.0, whereas method="radix" requires R >= 3.3.0.
ans = c("El. pa\u00c5\u00a1tas", "fa\u00e7ile", "\u00a1tas", "\u00de", "\u00e7ile")
test(1896.1, enc2utf8(unique(DT$x)), ans)
# by, keyby should treat the string with different encoding as the same
mixed_strings = c(utf8_strings, latin1_strings)
DT = data.table(x = mixed_strings)
test(1896.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5))
test(1896.3, DT[, uniqueN(x)], 5L)
DT = data.table(x = mixed_strings, y = c(latin1_strings, utf8_strings), z = 1)
test(1896.4, nrow(DT[, .N, by = .(z, x, y)]), 5L)
test(1896.5, nrow(DT[, .N, by = .(y, x, z)]), 5L)
test(1896.6, nrow(DT[, .N, by = .(y, z, x)]), 5L)
# FR#2695 -- setindexv to accept lists for multiple indices
DT = data.table(a = c(3, 2, 1, 2, 3), b = c(1, 2, 1, 1, 2))
setindexv(DT, list('a', c('a', 'b')))
test(1897.1, indices(DT), c("a", "a__b"))
test(1897.2, attributes(attr(DT, 'index')),
list(`__a` = c(3L, 2L, 4L, 1L, 5L),
`__a__b` = c(3L, 4L, 2L, 1L, 5L)))
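# Note on the expected attribute names above: each index is stored on the 'index' attribute under a
# name formed by prefixing "__" to each participating column name (hence "__a" and "__a__b"), with
# the value being the ordering vector for those columns.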
# tests 1898.{1,2,3} for set2key etc. deprecation were removed along with those functions
# Allow column to be used as rownames when converting to matrix #2702
DT = data.table(id = letters[1:4], X = 1:4, Y = 5:8)
mat <- matrix(1:8, ncol = 2, dimnames=list(letters[1:4], c("X", "Y")))
mat2 <- matrix(c(letters[1:4], 1:8), ncol=3, dimnames=list(NULL, c("id", "X", "Y")))
mat3 <- matrix(c(letters[1:4], 1:8), ncol=3, dimnames=list(1:4, c("id", "X", "Y")))
test(1899.01, as.matrix(DT, 1), mat)
test(1899.02, as.matrix(DT, "id"), mat)
test(1899.03, as.matrix(DT, TRUE), mat)
setkey(DT, id)
test(1899.04, as.matrix(DT, TRUE), mat)
test(1899.05, as.matrix(DT, 1:4), mat3)
# errors
test(1899.06, as.matrix(DT, -1), error="as.integer(rownames)==-1 which is outside the column number range")
test(1899.07, as.matrix(DT, "Z"), error="'Z' is not a column of x")
test(1899.08, as.matrix(DT, c(1,2)), error="length(rownames)==2 but nrow(DT)==4. The rownames argument specifies a single column name or number. Consider rownames.value= instead.")
test(1899.09, as.matrix(DT, complex(1)), error="as.integer(rownames)==0 which is outside the column number range [1,ncol=3].")
# values that pass through (rownames ignored)
test(1899.10, as.matrix(DT, NA), mat2)
test(1899.11, as.matrix(DT, NULL), mat2)
test(1899.12, as.matrix(DT, FALSE), mat2)
# Warnings:
setkey(DT, id, X)
test(1899.13, as.matrix(DT, TRUE), mat, warning="rownames is TRUE but key has multiple columns")
# Check handling of cases where the data.table only has 1 row, raised by Issue #2930:
mat4 <- matrix(c("a", 1, 5), nrow=1, dimnames=list(c("x"), c("id", "X", "Y")))
test(1899.14, as.matrix(DT[1,], 1), mat[1,,drop=FALSE])
test(1899.15, as.matrix(DT[1,], "id"), mat[1,,drop=FALSE])
test(1899.16, as.matrix(DT[1,], rownames.value="x"), mat4)
test(1899.17, as.matrix(DT[1,], rownames.value=c("x", "y")), error="length(rownames.value)==2 but should be nrow(x)==1")
test(1899.18, as.matrix(DT, rownames=TRUE, rownames.value=1:nrow(DT)), error="rownames and rownames.value cannot both be used at the same time")
# index argument for fread, #2633
DT_str = c('a,b\n3,1\n2,2\n1,1\n2,1\n3,2')
test(1900.1, attributes(attr(fread(DT_str, index = 'a'), 'index')),
list(`__a` = c(3L, 2L, 4L, 1L, 5L)))
test(1900.2, attributes(attr(fread(DT_str, index = list('a,b', c('b', 'a'), 'a')), 'index')),
list(`__a__b` = c(3L, 4L, 2L, 1L, 5L),
`__b__a` = c(3L, 4L, 1L, 2L, 5L),
`__a` = c(3L, 2L, 4L, 1L, 5L)))
test(1900.3, fread(DT_str, index = 2L),
error = 'index argument.*character vector')
test(1900.4, fread(DT_str, index = list('a', 1L)),
error = 'index argument.*character vector')
# col.names applied before index
test(1900.5, fread(DT_str, col.names = c('c', 'd'), index = 'a'),
error = 'some columns are not in the data.table')
test(1900.6, attributes(attr(fread(DT_str, index = c('a', 'b')), 'index')),
list(`__a__b` = c(3L, 4L, 2L, 1L, 5L)))
# . within bquote shouldn't be swapped to list, #1912
DT = data.table(x = 1:5, y = 6:10)
test(1901.1, DT[, bquote(z==.(sum(x)))], bquote(z==.(DT[, sum(x)])))
test(1901.2, DT[, .(.(bquote(z==.(sd(x-y)))))], data.table(V1=list(bquote(z==.(DT[, sd(x-y)])))))
# check quote rule detection logic, #2744
src = '"C\\\\D"\nAB\\x20CD\\n\n"\\"one\\", \\\'two\\\', three"\n"\\r\\t\\v\\a\\b\\071\\uABCD"\n'
test(1902, fread(src, verbose=TRUE),
data.table("C\\\\D"=c("AB\\x20CD\\n", "\\\"one\\\", \\'two\\', three", "\\r\\t\\v\\a\\b\\071\\uABCD")),
output="Quote rule picked = 1")
# logical01 caused column names not to be detected when a logical column determines the header, #2735
test(1903.1, fread(",A,B\n1,0,1\n2,0,1\n3,1,1\n", logical01=FALSE), data.table(V1=1:3, A=INT(0,0,1), B=1L))
test(1903.2, fread(",A,B\n1,0,1\n2,0,1\n3,1,1\n", logical01=TRUE), data.table(V1=1:3, A=c(FALSE,FALSE,TRUE), B=TRUE))
# whitespace around NA, and verbose output for small files, #2697
txt = 'A, B, C\n17, 34, 2.3\n3., NA, 1\nNA , 2, NA \n0,0.1,0'
test(1904.1, fread(txt, na.strings="NA", verbose=TRUE),
ans <- data.table(A=c(17,3,NA,0), B=c(34,NA,2,0.1), C=c(2.3,1.0,NA,0.0)),
output = c("Number of sampling jump points = 1 because.*Reading 1 chunks \\(0 swept\\) of 1.000MB \\(each chunk 4 rows\\) using 1 thread.*Rereading 0 columns"))
test(1904.2, fread(txt, na.strings=c("NA", " ")), ans, warning='na.strings\\[2\\]==" " consists only of whitespace, ignoring. Since strip.white=TRUE.*use.*"".*<NA>')
test(1904.3, fread(txt, na.strings=c("NA", "")), ans)
test(1904.4, fread(txt, na.strings=c("NA", "", " ")), ans, warning='na.strings\\[3\\]==" ".*only.*whitespace.*will already be read as <NA>')
test(1904.5, fread(txt, na.strings=c("NA", " "), strip.white=FALSE), error='na.strings\\[2\\]==" ".*only.*whitespace.*But strip.white=FALSE')
setnames(ans, c("A"," B"," C"))
test(1904.6, fread(txt, na.strings="NA", strip.white=FALSE), ans) # whitespace around NA strings is always stripped regardless of strip.white; just column names affected here
# should respect sep= when it is supplied, #2666
test(1905.1, fread("A;B;C\nD\nE;F", sep=";", header=FALSE, fill=TRUE),
data.table(V1=c("A","D","E"), V2=c("B","","F"), V3=c("C","","")))
test(1905.2, fread("A;B;C\nD\nE", sep=";", header=FALSE, fill=TRUE),
data.table(V1=c("A","D","E"), V2=c("B","",""), V3=c("C","","")))
test(1905.3, fread("A;B;C\n;D\nE", sep=";", header=FALSE, fill=TRUE),
data.table(V1=c("A","","E"), V2=c("B","D",""), V3=c("C","","")))
test(1905.4, fread("A;B;C\nD\n;E", sep=";", header=FALSE, fill=TRUE),
data.table(V1=c("A","D",""), V2=c("B","","E"), V3=c("C","","")))
# using rarer field separator without quoting, aside in #2755
# the self-healing rules aren't perfect, which is why they need to warn when used and nudge towards reliable usage
txt = 'A|B|C\n1|"hello", said Joe|14" pizza\n2|""howdy""|easy"\n3|\"as"|"""pie"\n'
test(1906.1, fread(txt),
data.table(A=1:3, B=c('"hello", said Joe', '"howdy"', 'as'), C=c('14" pizza', 'easy"', '""pie')),
warning='Found and resolved improper quoting.*If the fields are not quoted.*separator does not appear within any field.*try quote="" to avoid this warning')
test(1906.2, fread(txt, quote=""),
data.table(A=1:3, B=c('"hello", said Joe', '""howdy""', '"as"'), C=c('14" pizza', 'easy"', '"""pie"')))
# Issue #2395 : text field containing too many newlines
example <- data.table(column1 = 1:3, column2 = c("text", "text\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nmany new lines\n\n\n\n\n\n", "text"))
fwrite(example, file = (f<-tempfile()))
test(1907, fread(f), example)
unlink(f)
# Print topn rows properly when class=TRUE (#2803)
observed <- capture.output(print(data.table(x = seq_len(6L)), topn = 2L, class = TRUE))
expected <- c(" x", " <int>", " 1: 1", " 2: 2", "--- ", " 5: 5", " 6: 6")
test(1908, observed, expected)
# skip= is now consistent as if the file started on that line.
# Found via rev dep checking (package PhenotypeSimulator), #2786. It is still a breaking change that PhenotypeSimulator will need to accommodate.
if (test_R.utils) {
test(1909.1, names(ans<-fread(testDir("genotypes_genome.txt.bz2"), skip="Samples:", sep=" ", colClasses="character")),
c("V1","Samples:"),
warning="Detected 1 column name.*but the data has 2 columns.*Added 1 extra default column name for the first column")
test(1909.2, ans$V1, c("POP1:","POP1:","POP1:"))
test(1909.3, nchar(ans[["Samples:"]]), INT(3287,3287,3287))
test(1909.4, names(ans<-fread(testDir("genotypes_genome.txt.bz2"), skip="POP1:", sep=" ", colClasses="character", header=FALSE)),
c("V1","V2"))
test(1909.5, ans$V1, c("POP1:","POP1:","POP1:"))
test(1909.6, nchar(ans$V2), INT(3287,3287,3287))
}
# possible regression suggested in PR#2843
DT = data.table(x=1:10, y=1:2)
test(1910, DT[, v:=cumsum(x), by="y"], data.table(x=1:10, y=1:2, v=INT(1,2,4,6,9,12,16,20,25,30)))
# testing issue #2829 (assigning to 0 rows)
DT = data.table(COL_INT = 1L, COL_INT_2 = 5L)
test(1911.1,
DT[COL_INT == 0L, c("COL_INT", "NEW_COL"):=list(COL_INT_2, "Test")],
data.table(COL_INT = 1L, COL_INT_2 = 5L, NEW_COL = NA_character_))
test(1911.2,
DT[, COL_INT := integer(0)],
error = "RHS of assignment to existing column 'COL_INT' is zero length but not NULL.*")
# gc race with altrep in R-devel May 2018, #2866 & #2767, PR#2882
# This runs with 2 threads in the test suite on CRAN and AppVeyor etc.
# 2 threads are sufficient to fail before the fix.
N = 20
DF = data.frame(a=rnorm(N),
b=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]),
c=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]))
DT = setDT(DF) # setDT required since data.table() already expanded altrep's
before = sum(gc()[, 2])
fff = function(aref) {
ff = lapply(1:5, function(i) {
DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]]
})
return(rbindlist(ff))
}
for(i in 1:100) {
f = fff("a")
rm("f")
}
gc() # extra gc() (i.e. two including the one on next line) seems to reduce `after`
# from 29.7 to 27.2 (exactly `before`). Keeping the extra gc() as no harm.
after = sum(gc()[, 2])
test(1912.1, after < before + 10) # 10MB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up).
#
before = sum(gc()[, 2])
fff = function(aref) {
DT = setDT(data.frame(a=1:N, b=1:N, c=1:N, d=1:N, e=1:N, f=1:N, g=1:N, h=1:N)) # 1:N creates altrep. A few of them too to tickle (the fixed) race.
lapply(1:5, function(i) {
DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]]
})
}
for(i in 1:100) {
fff("a")
}
gc()
after = sum(gc()[, 2])
test(1912.2, after < before + 10)
# BEGIN port of old testthat tests, #2740. Issue numbers may be from R-forge.
#
# test-data.frame-like.R (merge, subset, transform)
#
# `x` columns are valid, #1299
d1 <- data.table(x=c(1,3,8), y1=rnorm(3), key="x")
d2 <- data.table(x=c(3,8,10), y2=rnorm(3), key="x")
ans1 <- merge(d1, d2, by="x")
ans2 <- cbind(d1[2:3], y2=d2[1:2]$y2)
setkey(ans2, x)
test(1913.01, ans1, ans2)
#
# `xkey` column names are valid in merge, #1299
d1 <- data.table(xkey=c(1,3,8), y1=rnorm(3), key="xkey")
d2 <- data.table(xkey=c(3,8,10), y2=rnorm(3), key="xkey")
ans2 <- cbind(d1[2:3], y2=d2[1:2]$y2)
setkey(ans2, xkey)
test(1913.02, merge(d1, d2, by="xkey"), ans2)
#
# one column merges work, #1241
dt <- data.table(a=rep(1:2,each=3), b=1:6, key="a")
y <- data.table(a=c(0,1), bb=c(10,11), key="a")
test(1913.03, merge(y, dt), data.table(a=1L, bb=11, b=1:3, key="a"))
test(1913.04, merge(y, dt, all=TRUE),
data.table(a=rep(c(0L,1L,2L),c(1,3,3)),
bb=rep(c(10,11,NA_real_),c(1,3,3)),
b=c(NA_integer_,1:6), key="a"))
#
# y with only a key column
y <- data.table(a=c(0,1), key="a")
test(1913.05, merge(y,dt), data.table(a=1L, b=1:3, key="a"))
test(1913.06, merge(y, dt, all=TRUE), data.table(a=rep(c(0L,1L,2L),c(1,3,3)), b=c(NA_integer_,1:6), key="a"))
#
# merging data.tables is almost like merging data.frames
d1 <- data.table(a=sample(letters, 10), b=sample(1:100, 10), key='a')
d2 <- data.table(a=d1$a, b=sample(1:50, 10), c=rnorm(10), key='a')
dtm <- merge(d1, d2, by='a', suffixes=c(".xx", ".yy"))
dtm.df <- as.data.frame(dtm)
dfm <- merge(as.data.frame(d1), as.data.frame(d2), by='a', suffixes=c('.xx', '.yy'))
test(1913.07, unname(dtm.df), unname(dfm))
test(1913.08, colnames(dtm), colnames(dfm))
#
## merge and auto-increment columns in y[x]
## merging tables that have common column names ending in *.1 gets tricky,
## because the y[x] mojo increments the *.1 suffix on the x columns (I think)
## and keeps *.1 on the y columns
x <- data.table(a=letters[1:10], b=1:10, b.1=1:10 * 10, key="a")
y <- data.table(a=letters[1:10], b=letters[11:20], b.1=rnorm(10), key="a")
M <- merge(x, y)
m <- merge(as.data.frame(x), as.data.frame(y), by="a")
test(1913.09, is.data.table(M) && !is.data.table(m))
test(1913.10, all(names(M) %in% union(names(M), names(m))))
test_no = 1913.11
for (name in names(m)) {
test_no = test_no + 0.0001
test(test_no, M[[name]], m[[name]])
}
#
# Original example that smoked out the bug
M <- data.table(a=letters[1:10], b=1:10)
m <- as.data.frame(M)
ms <- lapply(1:3, function(x) data.table(a=letters[1:10], b=1:10 * 10^x))
for (i in 1:3) {
M <- merge(M, ms[[i]], by='a', suffixes=c("", sprintf(".%d", i)))
}
for (i in 1:3) {
m <- merge(m, as.data.frame(ms[[i]]), by='a', suffixes=c("", sprintf(".%d", i)))
}
test(1913.12, is.data.table(M) && !is.data.table(m))
test(1913.13, all(names(M) %in% union(names(M), names(m))))
test_no = 1913.14
for (name in names(m)) {
test_no = test_no + 0.0001
test(test_no, M[[name]], m[[name]])
}
#
# simple subset maintains keys
dt <- data.table(a=sample(c('a', 'b', 'c'), 20, replace=TRUE),
b=sample(c('a', 'b', 'c'), 20, replace=TRUE),
c=sample(20), key='a')
sub <- subset(dt, a == 'b')
test(1913.15, key(dt), key(sub))
#
# subset using 'select' maintains key appropriately
dt <- data.table(a=sample(c('a', 'b', 'c'), 20, replace=TRUE),
b=sample(c('a', 'b', 'c'), 20, replace=TRUE),
c=sample(20), key=c('a', 'b'))
sub.1 <- subset(dt, a == 'a', select=c('c', 'b', 'a'))
test(1913.16, key(sub.1), key(dt)) # reordering columns
sub.2 <- subset(dt, a == 'a', select=c('a', 'c'))
test(1913.17, key(sub.2), 'a') # selected columns are prefix of key
sub.3 <- subset(dt, a == 'a', select=c('b', 'c'))
test(1913.18, is.null(key(sub.3))) # selected columns do not form a key prefix
sub.4 <- subset(dt, a == 'cc')
test(1913.19, nrow(sub.4), 0L)
test(1913.20, is.null(key(sub.4)))
#
# transform maintains keys
dt <- data.table(a=sample(c('a', 'b', 'c'), 20, replace=TRUE),
b=sample(c('a', 'b', 'c'), 20, replace=TRUE),
c=sample(20), key=c('a', 'b'))
t1 <- transform(dt, d=c+4)
test(1913.21, key(t1), key(dt))
test(1913.22, t1$d, dt$c + 4) # transform was successful
t2 <- transform(dt, d=c+4, a=sample(c('x', 'y', 'z'), 20, replace=TRUE))
test(1913.23, is.null(key(t2))) # transforming a key column nukes the key
## This is probably not necessary, but let's just check that transforming
## a key column doesn't twist around the rows in the result.
test_no = 1913.24
for (col in c('b', 'c')) {
test_no = test_no + 0.0001
test(test_no, t2[[col]], dt[[col]]) # mutating-key-transform maintains other columns
}
#
# tests-S4.R (S4 Compatibility)
#
suppressWarnings(setClass("Data.Table", contains="data.table")) # suppress 'Created a package name, ‘2018-05-26 06:14:43.444’, when none found'
suppressWarnings(setClass("S4Composition", representation(data="data.table")))
# data.table can be a parent class
ids <- sample(letters[1:3], 10, replace=TRUE)
scores <- rnorm(10)
dt <- data.table(id=ids, score=scores)
dt.s4 <- new("Data.Table", data.table(id=ids, score=scores))
test(1914.01, isS4(dt.s4))
test(1914.02, inherits(dt.s4, 'data.table'))
# Test possible regression. shallow() needs to preserve the S4 bit to support S4 classes that contain data.table
test(1914.03, isS4(shallow(dt.s4)))
## pull out data from S4 as.list, and compare to list from dt
dt.s4.list <- dt.s4@.Data
names(dt.s4.list) <- names(dt.s4)
test(1914.04, dt.s4.list, as.list(dt)) # Underlying data not identical
# simple S4 conversion-isms work
df = data.frame(a=sample(letters, 10), b=1:10)
dt = as.data.table(df)
test(1914.05, identical(as(df, 'data.table'), dt))
test(1914.06, identical(as(dt, 'data.frame'), df))
# data.table can be used in an S4 slot
dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10))
dt.comp <- new("S4Composition", data=dt)
test(1914.07, dt.comp@data, dt)
# S4 methods dispatch properly on data.table slots
dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10))
dt.comp <- new("S4Composition", data=dt)
setGeneric("dtGet", function(x, what) standardGeneric("dtGet"))
setMethod("dtGet", c(x="S4Composition", what="missing"), function(x, what){x@data})
setMethod("dtGet", c(x="S4Composition", what="ANY"), function(x, what) {x@data[[what]]})
test(1914.08, dtGet(dt.comp), dt) # 'what' missing dispatches to the what="missing" method
test(1914.09, identical(dtGet(dt.comp, 1), dt[[1]]))
test(1914.10, identical(dtGet(dt.comp, 'b'), dt$b))
removeClass("Data.Table") # so that test 1914.2 passes on the second run of cc() in dev
removeClass("S4Composition")
# END port of old testthat tests
str = "Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
5.1,3.5,1.4,0.2,setosa
4.9,3,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5,3.6,1.4,0.2,setosa"
test(1915, fread(str, select = c(5, 1, 3)),
data.table(Species = c("setosa", "setosa", "setosa", "setosa", "setosa"),
Sepal.Length = c(5.1, 4.9, 4.7, 4.6, 5),
Petal.Length = c(1.4, 1.4, 1.3, 1.5, 1.4)))
# unit tests for #2169 which was already fixed in ec05f8b/src/fwrite.c#L59
test(1916.1, fwrite(data.table(foo=c(NA, TRUE, FALSE)), logical01=TRUE, na='EMPTY', verbose=FALSE),
output=c("\"foo\"","EMPTY","1","0")) # this might need to be changed once #2964 is solved
test(1916.2, fwrite(data.table(foo=c(NA, TRUE, FALSE)), logical01=TRUE, verbose=FALSE),
output=c("foo","","1","0"))
# #732 i. and x. prefixes work during join and by=.EACHI
dta <- data.table(idx=1:3, vala=4:6, key="idx")
dtb <- data.table(idx=c(1,4), valb=c(10,11), key="idx")
test(1917.1, dta[dtb, list(x.idx, sum(valb)), by=.EACHI], data.table(idx = c(1L, 4L), x.idx = c(1L, NA), V2 = c(10, 11)))
test(1917.2, dta[dtb, list(sum(x.vala), sum(i.valb)), by=.EACHI], data.table(idx = c(1L, 4L), V1 = c(4L, NA_integer_), V2 = c(10, 11)))
# min/max work for _ordered_ factors, #1947
lev = letters[1:5]
DT = CJ(V2=letters[6:8], V1=factor(lev))[-c(4:6, 10:12)]
setkey(DT, NULL)
test(1918.1, DT[, min(V1)], error='not meaningful for factors')
test(1918.2, DT[, max(V1)], error='not meaningful for factors')
## confirming base functionality works
DT[ , V1:=as.ordered(V1)]
test(1918.3, DT[, min(V1)], structure(1L, .Label = lev, class = c("ordered", "factor")))
test(1918.4, DT[, max(V1)], structure(5L, .Label = lev, class = c("ordered", "factor")))
## make sure GForce is activated
options(datatable.optimize = Inf)
test(1918.5, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor"))))
test(1918.6, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor"))))
# as.ITime.character bug for NA handling #2940
test(1919, as.ITime(c('xxx', '10:43')), structure(c(NA, 38580L), class = "ITime"))
# wrong bmerge result if character gets coerced to factor, i is keyed, and level order in i is different from x, #2881
iris = data.table(iris)
iris$grp = rep(c('A','B'), 75L)
iris[, Species1 := factor(Species, levels=c('setosa','versicolor','virginica'), labels=c('setosa','versicolor','Virginica'))]
iSorted = data.table(Species1 = c('setosa','Virginica'), grp='B', key=c("grp","Species1"))
i = setkey(copy(iSorted),NULL)
test(1920, iris[iSorted, on = c("grp==grp", 'Species1==Species1')],
iris[i, on = c("grp==grp", 'Species1==Species1')])
# origin= ignored by as.IDate.numeric(), #2880
test(1921.1, as.IDate(1000, origin = "1930-01-01"), as.IDate("1932-09-27"))
test(1921.2, as.IDate(1000), as.IDate("1972-09-27"))
# fread returns data.table(NULL) with warning on empty file, #2898
f = tempfile()
file.create(f)
test(1922.1, fread(f), data.table(NULL), warning = 'File.*size 0')
test(1922.2, fread(file = f), data.table(NULL), warning = 'File.*size 0')
# trigger download for last instance of warning
test(1922.3, fread(paste0('file://', f)), data.table(NULL), warning = 'File.*size 0')
test(1922.4, fread(f, data.table = FALSE), data.frame(NULL), warning = 'File.*size 0')
test(1922.5, fread(file = f, data.table = FALSE), data.frame(NULL), warning = 'File.*size 0')
test(1922.6, fread(paste0('file://', f), data.table = FALSE), data.frame(NULL), warning = 'File.*size 0')
unlink(f)
# fwrite creates a file or does nothing, as appropriate, also #2898
DT = data.table(NULL)
f = tempfile()
test(1922.7, fwrite(DT, f), NULL, warning = 'no columns; creating an empty file')
## above test created a file; now test behavior when file exists
test(1922.8, fwrite(DT, f), NULL, warning = 'no columns; doing nothing.*file.remove')
## slightly different behavior if append = TRUE
test(1922.9, fwrite(DT, f, append = TRUE), NULL, warning = 'doing nothing.$')
# create index even if key present by setting attribute, #2883
DT = data.table(1:5, 1:5)
setkey(DT)
setindex(DT)
test(1923.1, indices(DT, vectors=TRUE), list(c("V1","V2")))
setindex(DT, NULL)
setindex(DT, V1)
test(1923.2, indices(DT, vectors=TRUE), list(c("V1")))
# Column typo checks, #2887
DT = data.table(varname = 1)
test(1924.1, DT[var_name==1], error='not found\\. Perhaps you intended.*varname')
test(1924.2, DT[variable==1], error='Object.*not found among')
test(1924.3, DT[varname+'a'], error='non-numeric argument')
DT[, VAR_NAME:=2]
test(1924.4, DT[var_name==1], error="Object 'var_name' not found. Perhaps you intended varname, VAR_NAME")
DT = setDT(lapply(integer(50), function(...) numeric(1L)))
test(1924.5, DT[V==0], error='Perhaps you intended.*V1.*V5 or 45 more')
# test suite of as.ITime methods (subsumes #2870)
s = c('1970-01-01 00:00:00.1234', '2005-10-12 09:45:32.84')
x = as.POSIXlt(s, tz = 'UTC')
test(1925.01, as.ITime(x), structure(c(0L, 35132L), class="ITime"))
test(1925.02, as.ITime(x, ms='nearest'), structure(c(0L, 35133L), class="ITime"))
test(1925.03, as.ITime(x, ms='ceil'), structure(c(1L, 35133L), class="ITime"))
test(1925.04, as.ITime(x, ms='foo'), error='Valid options for ms')
test(1925.05, as.ITime(s), as.ITime(x))
n = as.numeric(x)
test(1925.06, as.ITime(n), as.ITime(x))
test(1925.07, as.ITime(n, ms='nearest'), as.ITime(x, ms='nearest'))
test(1925.08, as.ITime(n, ms='ceil'), as.ITime(x, ms='ceil'))
test(1925.09, as.ITime(as.POSIXct(x)), as.ITime(x))
x = structure(c(12.345, 67.890)/86400, class='times')
test(1925.10, as.ITime(x), structure(c(12L, 67L), class="ITime"))
test(1925.11, as.ITime(x, ms='nearest'), structure(c(12L, 68L), class="ITime"))
test(1925.12, as.ITime(x, ms='ceil'), structure(c(13L, 68L), class="ITime"))
test(1936.1, fread("A,B\n1,3\n2,4", autostart=1), data.table(A=1:2, B=3:4), warning="autostart.*deprecated.*Consider skip")
if (.Platform$OS.type == "unix") test(1936.2, is.data.table(fread("ls .")))
# add helpful error to %between%
DT = data.table(A=1:10, B=3)
test(1937.1, DT[A %between% c(B,B+1)], error='RHS has length().*Perhaps you meant')
test(1937.2, DT[A %between% B], error='length 2. The first')
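# A minimal sketch (not a numbered test) of the usage the errors above nudge towards: per-row bounds
# are supplied as a two-element list (or via the vectorised between()). Here B is the constant 3, so
# rows with A in [3,4] are selected.
stopifnot(identical(DT[A %between% list(B, B+1), A], 3:4))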
# fwrite'ing a list to a file works (it broke in dev 1.11.5 and was caught before release), PR#3017
test(1938.1, fwrite(list(1:3)), NULL, output="1\n2\n3") # never broke
test(1938.2, fwrite(list(1:3), file=f<-tempfile()), NULL) # adding file= was what broke in dev, but only when x is a list and not a data.table|frame
test(1939.3, readLines(f), as.character(1:3))
unlink(f)
test(1940, data.table(A=1:3, .SD=4:6), error=".SD.*has special meaning")
# fintersect column name internal conflict of 'y', #3034
test(1941.1, fintersect(data.table(y=1), data.table(y=2)), data.table(y=numeric()))
test(1941.2, fintersect(data.table(y=1), data.table(y=1)), data.table(y=1))
# by= and keyby= using uniqlist on key, keyby= using index and passing it to uniqlist
DT = data.table(id=c("D","A","C","A","C"), v=1:5)
setkey(DT,id)
test(1942.01, DT[,sum(v),by=id,verbose=TRUE], data.table(id=c("A","C","D"), V1=INT(6,8,1), key="id"), output="Finding groups using uniqlist on key")
DT = data.table(id1=c("D","A","C","A","C"), id2=INT(3,9,2,9,3), id3=c(3.4, 9.1, 3.4, 3.3, 9.1), v=1:5)
setindex(DT,id1)
options(datatable.use.index=TRUE)
test(1942.02, DT[,sum(v),keyby=id1,verbose=TRUE], data.table(id1=c("D","A","C"), V1=INT(1,6,8), key="id1"), output="Finding groups using uniqlist on index 'id1'")
setindex(DT,id2)
test(1942.03, DT[,sum(v),keyby=id2,verbose=TRUE], data.table(id2=INT(3,9,2), V1=INT(6,6,3), key="id2"), output="Finding groups using uniqlist on index 'id2")
setindex(DT,id3)
oldnr = getNumericRounding() # TODO: return old
setNumericRounding(0)
test(1942.04, DT[,sum(v),keyby=id3,verbose=TRUE], data.table(id3=c(3.4, 9.1, 3.3), V1=INT(4,7,4), key="id3"), output="Finding groups using uniqlist on index 'id3'")
setNumericRounding(1)
test(1942.05, DT[,sum(v),keyby=id3,verbose=TRUE], data.table(id3=c(3.4, 9.1, 3.3), V1=INT(4,7,4), key="id3"), output="Finding groups using uniqlist on index 'id3'")
setNumericRounding(oldnr)
setindex(DT, NULL)
test(1942.06, indices(DT), NULL)
setindex(DT,id1,id2)
test(1942.07, DT[,sum(v),keyby=id1,verbose=TRUE], data.table(id1=c("D","A","C"), V1=INT(1,6,8), key="id1"), output="Finding groups using uniqlist on index 'id1__id2'")
test(1942.08, DT[,sum(v),keyby=.(id1,id2),verbose=TRUE], data.table(id1=c("A","C","C","D"), id2=INT(9,2,3,3), V1=INT(6,3,5,1), key="id1,id2"), output="Finding groups using uniqlist on index 'id1__id2'")
test(1942.09, DT[,sum(v),keyby=.(id2,id1),verbose=TRUE], data.table(id2=INT(2,3,3,9), id1=c("C","C","D","A"), V1=INT(3,5,1,6), key="id2,id1"), output="Finding groups using forderv")
options(datatable.use.index=FALSE)
test(1942.10, DT[,sum(v),keyby=id1,verbose=TRUE], data.table(id1=c("D","A","C"), V1=INT(1,6,8), key="id1"), output="Finding groups using forderv")
options(datatable.use.index=TRUE)
# test coverage in uniqlist.c of Realloc when exceeding the initial 1000 uniques, and use double to cover numeric tolerance (both in the one-column case and the >1 column branch)
set.seed(2)
DT = data.table(real=sample((1:1500)/1000, 10000, replace=TRUE), id=sample(letters, 1000, replace=TRUE), value=1:10000)
setkey(DT,id,real)
test(1942.11, DT[, .(list(value)), keyby=.(id,real), verbose=TRUE][c(1,6,8744,.N)],
data.table(id=c("a","a","z","z"), real=c(0.004,0.037,1.486,1.497), V1=list(9441L, c(3375L,5983L), c(4901L,5260L,7668L), 4181L), key="id,real"),
output="Finding groups using uniqlist on key")
setindex(DT,real)
test(1942.12, DT[, sum(value), keyby=real, verbose=TRUE][c(1,500,1498,.N)], data.table(real=c(0.001, 0.501, 1.499, 1.5), V1=INT(31036,37564,14792,38606), key="real"),
output="Finding groups using uniqlist on index 'real'")
# merge warning: longer object length is not a multiple of shorter object length, #3061
DT1 <- data.table(
id = c("A", "A", "A", "A", "A"),
date = c(19900321L, 19901231L, 19901231L, 19901231L, 19901231L),
period = c(19891231L, 19891231L, 19900331L, 19900630L, 19900930L),
year = c(1989L, 1989L, 1990L, 1990L, 1990L),
key = c("id", "date", "period"))
DT2 <- data.table(
id = c("A", "A", "A", "A", "A"),
date = c(19900321L, 19901231L, 19901231L, 19901231L, 19901231L),
period = c(19891231L, 19891231L, 19900331L, 19900630L, 19900930L),
year = c(1989L, 1989L, 1990L, 1990L, 1990L),
key = c("id", "date", "period"))
test(1943.1, (ans<-DT1[DT2])[,1:4], DT1) # ok before
test(1943.2, DT1[DT2, on=c("id","date","period")], ans) # ok before
test(1943.3, DT1[DT2, on=c("id","date","period","year")], ans[,1:4]) # no warning (longer object length is not a multiple)
DT1 = data.table(id=c("A","A","A"), date=1:3, val=7:9, key="id,date")
DT2 = data.table(id=c("A","A","A"), date=1:3, date2=3:1, key="id,date")
test(1943.4, DT1[DT2, on=c("id",date="date2")],
data.table(id="A", date=3:1, val=9:7, i.date=1:3)) # was invalidly keyed by id,date in 1.11.6
# keyby index where subset is present, #3062
DT = data.table(group=rep(c("A","B"),times=c(2,3)), flag=c(1,0,1,0,1), x=1:5)
test(1944.1, DT[flag == 1 & group == "B", sum(x)], 8L) # creates index
test(1944.2, indices(DT), "group__flag")
test(1944.3, DT[flag == 1, sum(x), keyby = group], # should not use index because i subset is present; index squashing not yet implemented
data.table(group=c("A","B"), V1=INT(1,8), key="group"))
set.seed(123)
N = 10
DT = data.table(group = rbinom(N, 5, 0.5), x = 1:N, flag = rbinom(N, 1, 0.9))
test(1944.4, DT[flag == 1 & group == 1, x], 6L)
test(1944.5, indices(DT), "group__flag")
test(1944.6, DT[flag == 1, sum(x), keyby = group], data.table(group=1:4, V1=INT(6,3,18,17), key="group"))
# assigning an int greater than length(levels) caused corruption of the int, #2984
DT <- data.table(a = factor(c("A", "Z")), b = 1:4)
c <- 3L
test(1945.1, DT[1, a:=c], error=err<-"Assigning factor numbers to column 1 named 'a'. But 3 is outside the level range [1,2]")
test(1945.2, c, 3L)
test(1945.3, DT[2,1] <- c, error=err)
test(1945.4, c, 3L)
# subset a data.table containing an altrep derived from [<-, [[<- etc, #3051
DT = data.table(A=1:6, B=1:3)
DT[["foo"]] = 7:12
test(1946, unique(DT, by="B"), data.table(A=1:3, B=1:3, foo=7:9))
# corruption when deleting a missing column and providing i too, #3089
DT = data.table(A=1:5)
test(1947.1, DT[A<0, c('A','B'):=.(NULL, A)], error="When deleting columns, i should not be provided")
test(1947.2, DT, data.table(A=1:5))
## tests for backticks and spaces in column names of on=, #2931
DT <- data.table(id = 1:3, `counts(a>=0)` = 1:3, sameName = 1:3)
i <- data.table(idi = 1:3, ` weirdName>=` = 1:3, sameName = 1:3)
## test white spaces around operator
test(1948.01, DT[i, on = "id >= idi"], DT[i, on = "id>=idi"])
test(1948.02, DT[i, on = "id>= idi"], DT[i, on = "id>=idi"])
test(1948.03, DT[i, on = "id >=idi"], DT[i, on = "id>=idi"])
## test column names containing operators
test(1948.04, setnames(DT[i, on = "id>=` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")),
DT[i, on = "id>=idi"])
test(1948.05, setnames(DT[i, on = "id>=` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")),
DT[i, on = "id>=idi"])
test(1948.06, setnames(DT[i, on = "id >= ` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")),
DT[i, on = "id>=idi"])
test(1948.07, setnames(DT[i, on = "`counts(a>=0)`==` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")),
DT[i, on = "id==idi"])
## mixed example
test(1948.08, DT[i, on = c( id = "idi", "sameName", "`counts(a>=0)`==` weirdName>=`")], DT[i, on = "id==idi", c("id", "counts(a>=0)", "sameName")])
## testing 'eval' in on clause
test(1948.09, DT[i, on = eval(eval("id<=idi"))], DT[i, on = "id<=idi"])
## testing for errors
test(1948.10, DT[i, on = ""], error = "'on' contains no column name: . Each 'on' clause must contain one or two column names.")
test(1948.11, DT[i, on = "id>=idi>=1"], error = "Found more than one operator in one 'on' statement: id>=idi>=1. Please specify a single operator.")
test(1948.12, DT[i, on = "`id``idi`<=id"], error = "'on' contains more than 2 column names: `id``idi`<=id. Each 'on' clause must contain one or two column names.")
test(1948.13, DT[i, on = "id != idi"], error = "Invalid operators !=. Only allowed operators are ==<=<>=>.")
test(1948.14, DT[i, on = 1L], error = "'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.")
# helpful error when on= is provided but not i, rather than silently ignoring on=
DT = data.table(A=1:3)
test(1949.1, DT[,,on=A], error="object 'A' not found") # tests .1 to .4 amended after #3621
test(1949.2, DT[,1,on=A], error="object 'A' not found")
test(1949.3, DT[on=A], error="object 'A' not found")
test(1949.4, DT[,on=A], error="object 'A' not found")
test(1949.5, DT[1,,with=FALSE], error="j must be provided when with=FALSE")
test(1949.6, DT[], output="A.*1.*2.*3") # no error
test(1949.7, DT[,], output="A.*1.*2.*3") # no error, #3163
if (test_bit64) {
# explicit coverage of 2-column real case in uniqlist. Keeps coming up in codecov checks in PRs that don't touch uniqlist.c
DT = data.table(id=c("A","A","B","B","B"), v=as.integer64(c(1,2,3,3,4)))
test(1950, uniqlist(DT), INT(1,2,3,5))
}
# allow nomatch=NULL to work same as nomatch=0L, #857
d1 = data.table(a=1:3, b=2:4)
d2 = data.table(a=2:4, b=3:5)
test(1951.1, d1[d2, on="a", nomatch=NULL], d1[d2, on="a", nomatch=0L])
test(1951.2, d1[d2, on="b", nomatch=NULL], d1[d2, on="b", nomatch=0L])
test(1951.3, d1[d2, on=c("a","b"), nomatch=NULL], d1[d2, on=c("a","b"), nomatch=0L])
test(1951.4, d1[d2, nomatch=3], error="nomatch= must be either NA or NULL .or 0 for backwards compatibility")
# coverage of which= checks
test(1952.1, d1[a==2, which=3], error="which= must be a logical vector length 1. Either FALSE, TRUE or NA.")
test(1952.2, d1[a==2, 2, which=TRUE], error="which==TRUE.*but j is also supplied")
# #3106 -- melt patterns don't match any columns (and more coverage tests)
DT = data.table(id = 1:3, a1 = rnorm(3), a2 = rnorm(3))
test(1953.1, melt(DT, id.vars = 'id', measure.vars = patterns(a = 'a', b = 'b')),
error = 'Pattern not found')
test(1953.2, melt(DT, id.vars = 'id', measure.vars = patterns(a = 'a', b = 'b', c = 'c')),
error = 'Patterns not found')
test(1953.3, melt(DT, id.vars = 'id', measure.vars = patterns(1L)),
error = 'Input patterns must be of type character')
setDF(DT)
test(1953.4, melt.data.table(DT, id.vars = 'id', measure.vars = 'a'),
error = "must be a data.table")
# appearance order of two low-cardinality columns that were squashed in PR#3124
DT = data.table(A=INT(1,3,2,3,2), B=1:5) # respect groups in 1st column (3's and 2's)
test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L))
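# Note on the retGrp=TRUE attributes checked above: 'starts' holds the first position of each group
# within the returned order (appearance order here because sort=FALSE) and 'maxgrpn' the size of the
# largest group; every (A,B) pair is unique, hence starts=1:5 and maxgrpn=1.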
# skip values that are not present in old, #3030
DT <- data.table(a=1, b=2, d=3)
old <- c("a", "b", "c", "d")
new <- c("A", "B", "C", "D")
test(1955.1, setnames(DT, old, new, skip_absent=TRUE), data.table(A=1, B=2, D=3))
test(1955.2, setnames(DT, old, new, skip_absent=0), error="is not") # must be TRUE or FALSE
test(1955.3, setnames(DT, "missing", "dummy", skip_absent=TRUE), DT) # all missing
test(1955.4, setnames(DT, c("D","missing","A"), c("dd","ignored","aa"), skip_absent=TRUE), data.table(aa=1, B=2, dd=3)) # different order with a missing
test(1955.5, setnames(DT, "B", "bb", skip_absent=TRUE), data.table(aa=1, bb=2, dd=3)) # none missing so skip_absent not needed
test(1955.6, setnames(DT, c("miss1","bb","miss2","dd"), c("A","B","C","D")), error="Items of 'old' not found in column names: [miss1, miss2]. Consider skip_absent=TRUE")
test(1955.7, setnames(DT, c("miss1","bb","miss2","dd"), c("A","B","C","D"), skip_absent=TRUE), data.table(aa=1, B=2, D=3))
# #3116 - Better error messages for missing/unmatched .SDcols
DT = data.table(a = 1:5)
test(1956.1, DT[, .SD, .SDcols = NA_character_], error = 'missing at the following')
test(1956.2, DT[, .SD, .SDcols = NA], error = 'missing at the following')
test(1956.3, DT[, .SD, .SDcols = NA_real_], error = 'missing at the following')
test(1956.4, DT[, .SD, .SDcols = 2L], error = 'out of bounds.*1.*1.*at')
test(1956.5, DT[, .SD, .SDcols = 'b'], error = 'not column names')
test(1956.6, DT[, .SD, .SDcols = 3i], error = '.SDcols should be column numbers or names')
# added brackify to utils for #3116
test(1957.1, brackify(1:3), '[1, 2, 3]')
test(1957.2, brackify(1:11), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ...]")
test(1957.3, fread("A,B\na,b\nc,d\n", stringsAsFactors=TRUE, verbose=TRUE), data.table(A=factor(c("a","c")), B=factor(c("b","d"))),
output="stringsAsFactors=TRUE converted 2 column(s): [A, B]")
# misc. coverage tests in fread
test(1958.1, fread('\U0001f64d', encoding = 'UTF-16'), error = "Argument 'encoding' must be")
test(1958.2, fread('a,b\n1,2', nrows = NA_real_), data.table(a = 1L, b = 2L))
test(1958.3, fread('a,b\n1,2', nrows = -1), data.table(a = 1L, b = 2L))
test(1958.4, fread('a,b\n1,2', key = 1), error = 'must be a character vector naming columns')
test(1958.5, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0), data.table(A=logical(), B=logical(), C=logical())) #2747
test(1958.6, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0, sep=','), data.table(A=logical(), B=logical(), C=logical()))
test(1958.7, fread('A,B,C,D\n"a,b",4,5,6\n"c,d",6,7\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(6,NA))) # 2547
test(1958.8, fread('A,B,C,D\n"a,b",4,5\n"c,d",6,7,8\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(NA,8)))
# Skip should work with all types of newlines #3006
eols = c("\n", "\r\n", "\r", "\n\r")
for (i in 1:4) {
eol = eols[i]
src = paste(c("A", "B", "...", ",,,,,", "c1,c2,c3", "1,2,3"), collapse=eol)
test(1959 + (i*0.1), fread(text=src, skip=4), data.table(c1=1L, c2=2L, c3=3L))
}
test(1959.5, fread("A\n\nB\n\nC\n1\n", skip=2), data.table(B=c("", "C", "1")))
test(1959.6, fread("A,B\r\r\nX,Y\r\r\nB,C\r\r\n1,2", skip=4), data.table(B=1L, C=2L))
# empty set with constant j, #3173
DT = data.table(
color = c("yellow", "red", "green", "red", "green", "red",
"yellow", "yellow", "green", "green", "green", "yellow",
"red", "yellow", "red", "green", "yellow", "red", "yellow",
"red", "green", "yellow", "green", "green"),
year = structure(c(15340, 15340, 14975, 15706, 15706, 15340,
16436, 15340, 15340, 14975, 16436, 15706,
16436, 15340, 14975, 14975, 16071, 15340,
15706, 16071, 15706, 15340, 16436, 16071), class = "Date"),
status = structure(c(4L, 3L, 4L, 3L, 2L, 1L, 3L, 4L, 4L, 3L, 4L, 4L,
4L, 4L, 1L, 3L, 3L, 2L, 1L, 2L, 3L, 4L, 2L, 4L),
.Label = c("active", "archived", "inactive", "removed"),
class = "factor"),
amount = c(1L, 4L, 2L, 3L, 1L, 5L, 1L, 1L, 4L, 2L, 3L, 1L,
5L, 4L, 2L, 2L, 4L, 3L, 3L, 2L, 4L, 4L, 1L, 2L),
value = c(2.5, 2, 3, 3, 2.5, 3.5, 2.5, 3.5, 3, 2.5, 3.5, 2.5, 2,
2.5, 3, 3, 3, 3, 3, 3, 2, 2.5, 3, 3)
)
ans = groupingsets(DT[ , .(amount, value)], j = 5, by = character(0L), sets = list(character()), id=TRUE)
# this test covers #3267 as well -- the internal logic here used to rely on an edge case of strtoi
# having the same result across platforms; that edge case is now side-stepped internally
test(1960, ans, data.table(grouping=0L, V1=5))
# order of cube results consistent to postgresql docs, #3179
set.seed(1)
d <- data.table(a = rep(1:2, each = 8),
b = rep(1:2, each = 4),
c = rep(1:2, each = 2),
val = sample(0:1, 16, replace = TRUE))
cb = cube(d, j= sum(val), by = c("a", "b", "c"), id=TRUE)
gs = groupingsets(d, j = sum(val), by = c("a", "b", "c"),
sets = list(c("a", "b", "c"),
c("a", "b" ),
c("a", "c"),
c("a" ),
c( "b", "c"),
c( "b" ),
c( "c"),
character()), id=TRUE)
test(1961, cb, gs)
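# In other words (not a numbered test): cube() over by=c("a","b","c") expands to all 2^3 = 8 grouping
# sets enumerated explicitly in the groupingsets() call above, in the same order as PostgreSQL's CUBE.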
# coverage tests
# tests 1962.001 and 1962.002 tested the now-removed option datatable.old.unique.by.key; see the NEWS items over the past 4 years
DT = data.table(x = c(1, 1, 3, 2), key = 'x')
test(1962.003, duplicated(DT, fromLast = NA),
error = 'must be TRUE or FALSE')
test(1962.004, duplicated(DT, by = -1L),
error = 'specify non existing column*.*-1')
test(1962.005, duplicated(DT, by = 'y'),
error = 'specify non existing column*.*y')
test(1962.0061, duplicated(data.table(NULL)), logical(0L))
test(1962.0062, duplicated(data.table(a = 1L), by = character()), logical())
test(1962.007, unique(DT, incomparables = TRUE),
error = 'not used (yet)')
test(1962.008, unique(DT, fromLast = TRUE),
data.table(x = c(1, 2, 3), key = 'x'))
test(1962.0091, anyDuplicated(DT, by = NULL), 2L)
test(1962.0092, uniqueN(DT, by = NULL), 3L)
## uniqlist.R
test(1962.010, uniqlist(1:5),
error = 'not type list')
test(1962.011, uniqlist(list()), list(0L))
## merge.R
DT1 = data.table(a = 1:3, V = 'a')
DT2 = data.table(a = 2:4, V = 'b')
test(1962.012, merge(DT1, DT2, sort = 1+3i),
error = 'should be logical TRUE/FALSE')
test(1962.013, merge(DT1, DT2, no.dups = 1+3i),
error = 'should be logical TRUE/FALSE')
setDF(DT2)
test(1962.014, merge(DT1, DT2),
data.table(a = integer(0), V = character(0)))
setkey(DT1, a)
test(1962.015, merge(DT1, DT2),
data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a'))
test(1962.016, merge(DT1, DT2, by.x = 'a', by.y = c('a', 'V')),
error = 'must be of same length')
test(1962.017, merge(DT1, DT2, by = 'V', by.x = 'a', by.y = 'a'),
data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a'),
warning = 'Supplied both.*argument will be ignored')
test(1962.018, merge(DT1, DT2, by.x = 'z', by.y = 'a'),
error = 'Elements listed in `by.x`')
test(1962.019, merge(DT1, DT2, by.x = 'a', by.y = 'z'),
error = 'Elements listed in `by.y`')
test(1962.020, merge(DT1, DT2, by = character(0L)),
error = 'non-empty vector of column names')
test(1962.021, merge(DT1, DT2, by = 'z'),
error = 'must be valid column names in x and y')
## frank.R
x = c(1, 1, 2, 5, 4, 3, 4, NA, 6)
test(1962.022, frankv(x, na.last = logical(0L)),
error = 'length(na.last) = 0')
test(1962.023, frankv(x, na.last = c(TRUE, FALSE)),
c(1.5, 1.5, 3, 7, 5.5, 4, 5.5, 9, 8),
warning = 'only the first element will be used')
test(1962.024, frankv(x, cols = 'y'),
error = 'x is a single vector')
test(1962.025, frankv(list(x), cols = integer(0L)),
error = "x is a list, 'cols' can not be 0-length")
f = frankv(list(x), ties.method = 'random')
test(1962.026,
length(f) == 9L && identical(f[c(3:4, 6L, 8:9)], c(3L, 7L, 4L, 9L, 8L)) &&
all(f[1:2] %in% c(1L, 2L)) && all(f[c(5L, 7L)] %in% c(5L, 6L)))
f = frankv(list(x), ties.method = 'random', na.last = NA)
test(1962.027,
length(f) == 8L && identical(f[c(3:4, 6L, 8L)], c(3L, 7L, 4L, 8L)) &&
all(f[1:2] %in% c(1L, 2L)) && all(f[c(5L, 7L)] %in% c(5L, 6L)))
test(1962.028, frank(x, cols = 'NULL'),
c(1.5, 1.5, 3, 7, 5.5, 4, 5.5, 9, 8))
test(1962.029, frank(data.table(x = x), +x),
c(1.5, 1.5, 3, 7, 5.5, 4, 5.5, 9, 8))
## setkey.R
DT = data.table(a = 3:1)
test(1962.030, setkey('DT', a),
error = 'x may no longer be the character name')
setDF(DT)
test(1962.031, setkeyv(DT, 'a'),
error = 'x is not a data.table')
setDT(DT)
test(1962.032, setkeyv(DT, 1L),
error = 'cols is not a character vector')
test(1962.033, setkeyv(DT, ''),
error = 'cols is the empty string')
test(1962.034, setkeyv(DT, c('a', '')),
error = 'cols contains some blanks')
setkey(DT, a)
test(1962.035, {setkeyv(DT, character(0L)); key(DT)}, NULL,
warning = 'cols is a character vector of zero length')
test(1962.036, any(grepl('already ordered', capture.output(setkey(DT, a, verbose = TRUE)))))
setnames(DT, '.xi')
setkey(DT, NULL)
test(1962.037, setkey(DT, .xi),
error = "x contains a column called '.xi'")
DT = data.table(a = as.raw(0))
test(1962.038, setkey(DT, a),
error = "Column 'a' is type 'raw'")
test(1962.039, is.sorted(3:1, by = 'x'),
error = 'x is vector but')
DT = data.table(a = 3:1)
test(1962.0401, forderv(DT, sort=FALSE, retGrp=FALSE), error='At least one of retGrp= or sort= must be TRUE')
test(1962.0402, forderv(DT, sort=0), error='sort must be TRUE or FALSE')
test(1962.0403, forderv(DT, retGrp=0), error='retGrp must be TRUE or FALSE')
test(1962.041, forderv(DT, na.last = logical(0L)), error='na.last must be logical TRUE, FALSE or NA of length 1')
test(1962.042, forderv(DT, na.last = c(TRUE, FALSE)), error='na.last must be logical TRUE, FALSE or NA of length 1')
test(1962.043, forderv(DT$a, by = 'a'), error='x is a single vector, non-NULL')
test(1962.044, forderv(DT$a, order = 2L), error='Item 1 of order (ascending/descending) is 2. Must be +1 or -1')
test(1962.045, forderv(DT$a, order = c(1L, -1L)), error='Input is an atomic vector (not a list of columns) but order= is not a length 1 integer')
test(1962.0461, forderv(DT, order = c(1L, -1L)), error="Either order= is not integer or its length (2) is different to by='s length (1)")
test(1962.0462, forderv(DT, order = 2L), error='Item 1 of order (ascending/descending) is 2. Must be +1 or -1')
test(1962.0471, forderv(mean), error="'x' argument must be data.table compatible")
test(1962.0472, forderv(DT, by=mean), error="argument specifying columns must be character or numeric")
test(1962.0473, forderv(NULL), error="DT is an empty list() of 0 columns")
setDF(DT)
test(1962.0481, forder(DT), 3:1)
L = as.list(DT)
test(1962.0482, forder(L), 3:1)
test(1962.0483, forder(), NULL)
setDT(DT)
test(1962.049, forder(DT[ , 0L]), error = 'Attempting to order a 0-column')
test(1962.050, forder(DT, decreasing = NA), error = 'isTRUEorFALSE(decreasing) is not TRUE')
test(1962.051, forder(DT, decreasing = 1.4), error = 'isTRUEorFALSE(decreasing) is not TRUE')
test(1962.052, forder(DT, NULL), 3:1)
test(1962.053, forder(DT), 3:1)
test(1962.054, forder(DT, ), 3:1)
test(1962.055, fsort(as.double(DT$a), internal = TRUE),
error = 'Internal code should not be being called on type double')
l = as.list(DT)
test(1962.056, setorder(l, a), error = 'x must be a data.frame or data.table')
test(1962.057, setorder(DT, NULL), data.table(a = 3:1))
test(1962.058, setorder(DT, +a), data.table(a = 1:3))
DT = data.table(a = 3:1)
test(1962.059, setorderv(DT, NULL), data.table(a = 3:1))
test(1962.060, setorderv(l, 'a'), error = 'x must be a data.frame or data.table')
test(1962.061, setorderv(DT, 5L),
error = 'cols is not a character vector')
test(1962.062, setorderv(DT, character(0L)), data.table(a = 3:1),
warning = 'cols is a character vector of zero length')
test(1962.063, setorderv(DT, c('a', '')), error = 'cols contains some blanks')
DT = data.table(a = c(1, 1, 2), b = c(2, 1, 3))
test(1962.064, setorderv(copy(DT)),
data.table(a = c(1, 1, 2), b = c(1, 2, 3)))
test(1962.065, setorderv(DT, 'c'), error = 'some columns are not in the data.table')
setnames(DT, 1L, '.xi')
test(1962.066, setorderv(DT, 'b'), error = "x contains a column called '.xi'")
test(1962.067, setorderv(data.table(a = as.raw(0)), 'a'),
error = "Column 'a' is type 'raw'")
DT = data.table(
color = c("yellow", "red", "green", "red", "green", "red",
"yellow", "yellow", "green", "green", "green", "yellow",
"red", "yellow", "red", "green", "yellow", "red", "yellow",
"red", "green", "yellow", "green", "green"),
year = structure(c(15340, 15340, 14975, 15706, 15706, 15340,
16436, 15340, 15340, 14975, 16436, 15706,
16436, 15340, 14975, 14975, 16071, 15340,
15706, 16071, 15706, 15340, 16436, 16071), class = "Date"),
status = structure(c(4L, 3L, 4L, 3L, 2L, 1L, 3L, 4L, 4L, 3L, 4L, 4L,
4L, 4L, 1L, 3L, 3L, 2L, 1L, 2L, 3L, 4L, 2L, 4L),
.Label = c("active", "archived", "inactive", "removed"),
class = "factor"),
amount = c(1L, 4L, 2L, 3L, 1L, 5L, 1L, 1L, 4L, 2L, 3L, 1L,
5L, 4L, 2L, 2L, 4L, 3L, 3L, 2L, 4L, 4L, 1L, 2L),
value = c(2.5, 2, 3, 3, 2.5, 3.5, 2.5, 3.5, 3, 2.5, 3.5, 2.5, 2,
2.5, 3, 3, 3, 3, 3, 3, 2, 2.5, 3, 3)
)
setDF(DT)
test(1962.068, rollup(DT), error = 'no applicable method')
test(1962.069, rollup.data.table(DT), error = 'must be a data.table object')
test(1962.070, cube(DT), error = 'no applicable method')
test(1962.071, cube.data.table(DT), error = 'must be a data.table object')
test(1962.072, groupingsets(DT), error = 'no applicable method')
test(1962.073, groupingsets.data.table(DT), error = 'must be a data.table object')
setDT(DT)
test(1962.074, rollup(DT, by = 3L), error = "'by' must be a character vector")
test(1962.075, rollup(DT, by = 'color', id = 3L), error = "'id' must be a logical scalar")
test(1962.076, rollup(DT, by = 3L), error = "'by' must be a character vector")
test(1962.077, rollup(DT, by = 'color', id = 3L), error = "'id' must be a logical scalar")
setnames(DT, 2L, 'color')
test(1962.078, groupingsets(DT, by = 'status', sets = 5L),
error = 'Input data.table must not contain duplicate')
setnames(DT, 2L, 'year')
test(1962.079, groupingsets(DT, by = logical(1L)),
error = "'by' must be a character vector")
test(1962.080, groupingsets(DT, by = 'color', sets = list(5L)),
error = "'sets' must be a list of character vectors")
test(1962.081, groupingsets(DT, by = 'color', sets = list('year', 'status'), id = 3L),
error = "'id' must be a logical scalar")
setnames(DT, 1L, 'grouping')
test(1962.082, groupingsets(DT, by = c('color', 'status'),
sets = list('color', 'status'), id = TRUE),
error = "data.table must not have a column named 'grouping'")
setnames(DT, 1L, 'color')
## fcast.R
DT = data.table('(all)' = 1)
test(1962.083, guess(DT), '(all)')
setnames(DT, 'V')
test(1962.084, guess(DT), 'V',
message = 'Using.*value column.*override')
setDF(DT)
test(1962.085, dcast.data.table(DT), error = 'must be a data.table')
setDT(DT)
test(1962.086, dcast(DT, a ~ a, drop = NA),
error = 'must be logical TRUE/FALSE')
DT = data.table(a = c(1, 1, 2, 2), b = list(1, 2, 3, 4), c = c(4, 4, 2, 2))
test(1962.087, dcast(DT, a ~ b, value.var = 'b'),
error = 'Columns specified in formula can not be of type list')
test(1962.088, dcast(DT[0L, ], a ~ c, value.var = 'b'),
error = 'Can not cast an empty data.table')
test(1962.089, dcast(DT, a ~ c, value.var = 'b'),
data.table(a = c(1, 2), `2` = c(0L, 2L), `4` = c(2L, 0L), key = 'a'),
message = 'Aggregate function missing')
## IDateTime.R
x = as.IDate('2018-08-01')
test(1962.090, `+`(x), x)
test(1962.091, all.equal(x+1.4, structure(17745.4, class = "Date")))
test(1962.092, x + x, error = 'binary + is not defined')
test(1962.093, `-`(x), error = 'unary - is not defined')
test(1962.094, all.equal(x-1.4, structure(17742.6, class = "Date")))
y = as.ITime('543210', format = '%S%M%H')
test(1962.095, y, structure(37974L, class = "ITime"))
test(1962.096, capture.output(print(y)), '[1] "10:32:54"')
test(1962.097, rep(y, 2L), structure(c(37974L, 37974L), class = "ITime"))
test(1962.098, as.POSIXlt(y, date = '2018-12-01', tz = 'UTC'),
structure(list(sec = 54, min = 32L, hour = 10L, mday = 1L, mon = 11L,
year = 118L, wday = 6L, yday = 334L, isdst = 0L),
class = c("POSIXlt", "POSIXt"), tzone = "UTC"))
test(1962.099, as.POSIXct(x, y),
structure(1533119574, tzone = "UTC", class = c("POSIXct", "POSIXt")))
test(1962.100, as.POSIXct(x, tz = ''),
structure(1533081600, tzone = "UTC", class = c("POSIXct", "POSIXt")))
z = .POSIXct(3725, 'UTC')
test(1962.101, second(z), 5L)
test(1962.102, minute(z), 2L)
test(1962.103, hour(z), 1L)
z = as.ITime(z)
test(1962.201, second(z), 5L)
test(1962.202, minute(z), 2L)
test(1962.203, hour(z), 1L)
# positive and negative values for shift, #1708
DT = data.table(x = 1:10, y = 10:1)
test(1963.01, shift(DT$x, -1), c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA))
test(1963.02, shift(DT$x, -1, type = 'lead'),
c(NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L))
test(1963.03, shift(DT$x, -1, fill = 0L),
c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 0L))
test(1963.04, shift(DT$x, -1, give.names = TRUE), # give.names is ignored because we do not return a list
c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA))
test(1963.05, shift(DT$x, -1:1),
list(c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA), 1:10,
c(NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L)))
test(1963.06, shift(DT, -1),
list(c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA),
c(9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, NA)))
test(1963.07, shift(DT, -1:1),
list(c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA), 1:10,
c(NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L),
c(9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, NA), 10:1,
c(NA, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L)))
## some coverage tests for good measure
test(1963.08, shift(DT$x, type = 'some_other_type'), error='should be one of.*lag.*lead')
test(1963.09, shift(as.raw(0:1)), error = 'Unsupported type')
test(1963.10, shift(DT, -1:1, type="shift", give.names = TRUE), # new type="shift" #3223
ans <- list(`x_shift_-1` = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA),
x_shift_0 = 1:10,
x_shift_1 = c(NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L),
`y_shift_-1` = c(9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, NA),
y_shift_0 = 10:1,
y_shift_1 = c(NA, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L)))
names(ans) <- c("x_lead_1", "x_lag_0", "x_lag_1", "y_lead_1", "y_lag_0", "y_lag_1")
test(1963.11, shift(DT, -1:1, type="lag", give.names = TRUE), ans)
test(1963.12, shift(DT, 1:-1, type="lead", give.names = TRUE), ans)
# more detailed tests for negative shift due to #3335
DT = data.table(a=1:5, b=as.double(1:5), c=c(TRUE,FALSE,FALSE,TRUE,TRUE), d=letters[1:5], e=as.list(1:5), f=factor(letters[1:5]))
if (test_bit64) DT[, "g" := as.integer64(1:5)]
test(1963.13, shift(DT, 1L, type="lag"), shift(DT, -1L, type="lead"))
test(1963.14, shift(DT, 3L, type="lag"), shift(DT, -3L, type="lead"))
test(1963.15, shift(DT, -1L, type="lag"), shift(DT, 1L, type="lead"))
test(1963.16, shift(DT, -3L, type="lag"), shift(DT, 3L, type="lead"))
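# Illustration only (not a counted test); a minimal sketch, assuming only the documented shift()
# behaviour exercised by 1963.13-16 above: a negative n with type="lag" is equivalent to the same
# positive n with type="lead", for plain vectors as well as data.tables.
illust_x = 1:5
stopifnot(identical(shift(illust_x, -2L, type="lag"), shift(illust_x, 2L, type="lead")))
rm(illust_x)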
# #3832 consistency rule for renaming lead_0 -> lag_0 doesn't apply unless mixing signs of n
DT <- data.table(a = 1:3, b = 2:4)
test(1963.17, DT[ , shift(.SD, 0:1, give.names = TRUE, type = "lead")],
data.table(a_lead_0 = 1:3, a_lead_1 = c(2L, 3L, NA), b_lead_0 = 2:4, b_lead_1 = c(3L, 4L, NA)))
# 0 column data.table should not have rownames, #3149
M0 = matrix(1:6, nrow=3, ncol=2, dimnames=list(rows=paste0("id",1:3), cols=c("v1","v2")))
M = M0[,integer(0)]
DT = as.data.table(M)
test(1964.1, rownames(DT), character(0))
test(1964.2, colnames(DT), character(0))
M = M0[integer(0),, drop=FALSE]
DT = as.data.table(M)
test(1964.3, rownames(DT), character(0))
test(1964.4, colnames(DT), c("v1","v2"))
test(1965, setDT(list(1, 1:2)), error = 'profile of input lengths') #3121
# fread/fwrite file name in native and utf-8 encoding, #3078
if (.Platform$OS.type=="windows") {
f = tempfile("\u00f6"); cat("3.14", file = f)
fn = enc2native(f); f8 = enc2utf8(f)
test(1966.1, fread(fn), data.table(V1=3.14))
test(1966.2, fread(f8), data.table(V1=3.14))
unlink(c(fn, f8))
DT = data.table("a"); pth = tempdir()
f = "\u00f6.csv"; fp = file.path(pth, f)
fpn = enc2native(fp); fp8 = enc2utf8(fp)
fwrite(DT, fpn)
test(1966.3, list.files(path = pth, pattern = "\\.csv$"), f)
unlink(c(fp, file.path(pth, "\u00c3\u00b6.csv")))
fwrite(DT, fp8)
test(1966.4, list.files(path = pth, pattern = "\\.csv$"), f)
unlink(c(fp, file.path(pth, "\u00c3\u00b6.csv")))
p = file.path(pth, "\u00fc"); dir.create(p); f = tempfile(tmpdir = p)
test(1966.5, fwrite(DT, enc2native(f)), NULL)
unlink(f)
test(1966.6, fwrite(DT, enc2utf8(f)), NULL)
unlink(p, recursive = TRUE)
}
# assorted coverage tests
## foverlaps.R
x = data.table(start=c(5,31,22,16), end=c(8,50,25,18), val2 = 7:10)
y = data.table(start=c(10, 20, 30), end=c(15, 35, 45), val1 = 1:3)
setkey(y, start, end)
setDF(y)
test(1967.01, foverlaps(x, y), error = 'y and x must both be data.tables')
setDT(y)
setkey(y, start, end)
setDF(x)
test(1967.02, foverlaps(x, y), error = 'y and x must both be data.tables')
setDT(x)
test(1967.03, foverlaps(x, y, maxgap = integer(0L)),
error = 'maxgap must be a non-negative integer')
test(1967.04, foverlaps(x, y, maxgap = c(3, 4)),
error = 'maxgap must be a non-negative integer')
test(1967.05, foverlaps(x, y, maxgap = NA),
error = 'maxgap must be a non-negative integer')
test(1967.06, foverlaps(x, y, maxgap = -5),
error = 'maxgap must be a non-negative integer')
test(1967.07, foverlaps(x, y, minoverlap = integer(0L)),
error = 'minoverlap must be a positive integer')
test(1967.08, foverlaps(x, y, minoverlap = c(3, 4)),
error = 'minoverlap must be a positive integer')
test(1967.09, foverlaps(x, y, minoverlap = NA),
error = 'minoverlap must be a positive integer')
test(1967.10, foverlaps(x, y, minoverlap = -5),
error = 'minoverlap must be a positive integer')
test(1967.11, foverlaps(x, y, which = integer(0L)),
error = 'which must be a logical vector')
test(1967.12, foverlaps(x, y, which = c(3, 4)),
error = 'which must be a logical vector')
test(1967.13, foverlaps(x, y, which = NA),
error = 'which must be a logical vector')
test(1967.14, foverlaps(x, y, nomatch = integer(0L)),
error = 'nomatch must either be NA or NULL')
test(1967.15, foverlaps(x, y, nomatch = c(3, 4)),
error = 'nomatch must either be NA or NULL')
test(1967.16, foverlaps(x, y, nomatch = 4L),
error = 'nomatch must either be NA or NULL')
test(1967.18, foverlaps(x, y, by.x = 'start'),
error = "by.x' and 'by.y' should contain at least two")
test(1967.19, foverlaps(x, y, by.y = 'start'),
error = "by.x' and 'by.y' should contain at least two")
test(1967.20, foverlaps(x, y, by.x = c(-1L, 0L)),
error = "Invalid numeric value for 'by.x'")
test(1967.21, foverlaps(x, y, by.x = c(1L, 100L)),
error = "Invalid numeric value for 'by.x'")
test(1967.22, foverlaps(x, y, by.y = c(-1L, 0L)),
error = "Invalid numeric value for 'by.y'")
test(1967.23, foverlaps(x, y, by.y = c(1L, 100L)),
error = "Invalid numeric value for 'by.y'")
test(1967.24, foverlaps(x, y, by.x = c(1 + 3i, 2 - 1i)),
error = 'non-empty vector of column names or numbers is required for by.x')
test(1967.25, foverlaps(x, y, by.y = c(1 + 3i, 2 - 1i)),
error = 'non-empty vector of column names or numbers is required for by.y')
test(1967.26, foverlaps(x, y, by.x = c('start', 'END')),
error = "Elements listed in 'by.x' must be valid names")
test(1967.27, foverlaps(x, y, by.x = c('start', 'start')),
error = 'Duplicate columns are not allowed')
setkey(y, start, start)
test(1967.28, foverlaps(x, y, by.y = c('start', 'start')),
error = 'Duplicate columns are not allowed')
setkey(y, start, end)
test(1967.29, foverlaps(x, y, by.x = c('start', 'end', 'val2')),
error = 'length(by.x) != length(by.y)')
x[ , end := as.character(end)]
test(1967.30, foverlaps(x, y),
error = 'must be integer/numeric type')
x[ , end := as.integer(end)]
test(1967.31, foverlaps(x, y, by.x = c('end', 'start')),
error = 'All entries in column end should be <= corresponding entries')
y[ , end := as.character(end)]
setkey(y, start, end)
test(1967.32, foverlaps(x, y),
error = 'must be integer/numeric type')
y[ , end := as.integer(end)]
setkey(y, end, start)
test(1967.33, foverlaps(x, y, by.x = c('start', 'end'), by.y = c('end', 'start')),
error = 'All entries in column end should be <= corresponding entries')
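# Illustration only (not a counted test); a minimal working foverlaps() call for contrast with the
# argument-validation errors above, assuming only documented behaviour: y keyed on its interval
# columns, default type="any".
illust_x = data.table(start=c(1L,5L), end=c(3L,9L))
illust_y = data.table(start=2L, end=6L, val=1L)
setkey(illust_y, start, end)
stopifnot(nrow(foverlaps(illust_x, illust_y)) == 2L)   # both x intervals overlap [2,6]
rm(illust_x, illust_y)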
## data.table.R
test(1967.34, data.table(1:5, NULL), data.table(V1=1:5))
### testing branches:
### if (length(namesi)==0L) namesi = rep.int("",ncol(xi))
### if (any(tt)) namesi[tt] = paste0("V", which(tt))
### if (novname[i]) vnames[[i]] = namesi
### but, on pause for now pending #3193
### test(1967.35, data.table(1:5, matrix(6:15, nrow = 5L))
test(1967.35, data.table(1:5, integer(0L)), data.table(1:5, NA_integer_), warning="Item 2 has 0 rows but longest item has 5; filled with NA")
test(1967.36, data.table(1:5, key = 5L), error = 'must be character')
x = data.table(a = 1:5)
test(1967.37, x[3, mult = 'none'], error = 'mult argument can only be') # i==3 to get past 'i and j both missing' error
test(1967.38, x[3, roll = c(3, 4)], error = 'roll must be a single')
test(1967.39, x[3, roll = NA], error = 'roll must be a single')
test(1967.40, x[3, roll = 'furthest'], error = 'Only valid character value')
test(1967.41, x[3, rollends = 1 + 3i], error = 'rollends must be a logical')
test(1967.42, x[3, rollends = rep(TRUE, 10L)], error = 'rollends must be length 1 or 2')
test(1967.43, x[ , ..], error = 'symbol .. is invalid')
test(1967.44, x[NULL], data.table(NULL))
test(1967.45, x[ , NULL], NULL)
test(1967.46, x[ , 'b' := 6:10, with = FALSE],
data.table(a = 1:5, b = 6:10), warning = 'with=FALSE ignored')
test(1967.47, x[ , -1L, with = FALSE], data.table(b = 6:10))
test(1967.48, x[ , b, .SDcols = 'a'], 6:10,
warning = "This j doesn't use .SD")
test(1967.49, x[ , list(5) := 6], error = 'LHS of := must be a symbol')
test(1967.50, x[ , 1 + 3i := 6], error = "LHS of := isn't column names")
test(1967.511, x[ , .(5L), by = .EACHI, mult = 'all'], error='logical error. i is not data.table')
test(1967.512, x[1+3i], error='i has evaluated to type complex. Expecting logical, integer or double')
test(1967.521, x[1:2, by=a], x[1:2,], warning="Ignoring by= because j= is not supplied")
test(1967.522, x[, by=a], x, warning=c("Ignoring by= because j= is not supplied","i and j are both missing.*upgraded to error in future"))
test(1967.523, x[by=a], x, warning=c("Ignoring by= because j= is not supplied","i and j are both missing.*upgraded to error in future"))
test(1967.524, x[1:2, keyby=a], x[1:2,], warning="Ignoring keyby= because j= is not supplied")
test(1967.525, x[, keyby=a], x, warning=c("Ignoring keyby= because j= is not supplied","i and j are both missing.*upgraded to error in future"))
test(1967.526, x[keyby=a], x, warning=c("Ignoring keyby= because j= is not supplied","i and j are both missing.*upgraded to error in future"))
test(1967.53, as.matrix(x, rownames = 2:3),
error = 'length(rownames)==2 but')
test(1967.54, as.matrix(x[0L]),
structure(logical(0), .Dim = c(0L, 2L), .Dimnames = list(NULL, c("a", "b"))))
test(1967.55, subset(x, 5L), error = "'subset' must evaluate to logical")
x = as.list(x)
test(1967.56, setnames(x), error = 'x is not a data.table or data.frame')
setDT(x)
names(x) = NULL
test(1967.57, setnames(x), error = 'x has 2 columns but its names are length 0')
names(x) = c('a', 'b')
test(1967.58, names(setnames(x, new = c('b', 'c'))), c('b', 'c'))
test(1967.59, setnames(x, 1:2, c(8L, 9L)), error = "'new' is not a character")
test(1967.60, setnames(x, -1:1, c('hey', 'you')), error = "mixed.*negative")
test(1967.61, setnames(x, 1+3i, 'cplx'), error = "'old' is type complex")
test(1967.62, setnames(x, 1, c('d', 'e')), error = "'old' is length 1 but 'new'")
test(1967.621, setnames(x, 1:2, c("a","a")), data.table(a=1:5, a=6:10))
test(1967.622, setnames(x, 1:2, c("a",NA)), error = "NA in 'new' at positions [2]")
test(1967.63, setcolorder(x, c(1, 1)), error = 'Item 2 of order (1) is either NA, out of range [1,2], or is duplicated. The new order must be a strict permutation of 1:n')
test(1967.64, setcolorder(x, 1+3i), error = 'must be character or numeric')
test(1967.65, setcolorder(x, 300), error = 'specify non existing column*.*300')
test(1967.66, rbindlist(list(x), idcol = FALSE), rbindlist(list(x)))
test(1967.67, rbindlist(list(x), idcol = 1+3i), error = 'idcol must be a logical')
### potentially altering user environment so use a strange name
suppressWarnings(rm(`___data.table_internal_test_1967.68___`))
test(1967.68, setDT(`___data.table_internal_test_1967.68___`), error = 'Cannot find symbol')
### [.data.table verbosity & non-equi-join tests
options(datatable.optimize = 0L)
verbose_output = capture.output(x[order(a), .N, verbose = TRUE])
test(1967.69, !any(grepl('forder.c', verbose_output, fixed = TRUE)))
test(1967.70, any(grepl('[1] 5', verbose_output, fixed = TRUE)))
options('datatable.optimize' = 1L)
test(1967.71, x[order(a), .N, verbose = TRUE], 5L,
output = "forder.c received 5 rows and 1 column")
setkey(x)
test(1967.72, x[x, .N, on = 'a', verbose = TRUE], 5L,
output = "on= matches existing key")
options(datatable.optimize = Inf)
x = data.table(
i1 = c(234L, 250L, 169L, 234L, 147L, 96L, 96L, 369L, 147L, 96L),
i4 = c(79L, 113L, 270L, -121L, 113L, 113L, -121L, 179L, -228L, 113L)
)
y = data.table(
i1 = c(29L, 18L, 33L, 27L, 33L, 23L, 39L, 38L, 29L, 23L),
i4 = c(-26L, 6L, -30L, -26L, -23L, 38L, -40L, -26L, -23L, 24L)
)
x[ , '_nqgrp_' := 5]
test(1967.73, x[y, on = .(i1 <= i1, i4 >= i4)], error = "'_nqgrp_' is reserved")
x[ , '_nqgrp_' := NULL]
test(1967.74, x[y, max(i4), on = .(i1 <= i1, i4 >= i4), verbose = TRUE], 38L,
output = 'Recomputing forder with non-equi.*done')
test(1967.75, x[!y, sum(i4), on = 'i1', by = .EACHI, verbose = TRUE],
data.table(i1 = c(169L, 369L), V1 = c(270L, 179L)),
output = "not-join called with 'by=.EACHI'.*done")
test(1967.76, x[!y, sum(i4), on = 'i1', verbose = TRUE], 510L,
output = 'Inverting irows for notjoin.*sec')
x[ , v := 0]
### hitting by = A:B branch
test(1967.77, x[ , .(v = sum(v)), by = i1:i4], x[-10L])
test(1967.78, x[1:5, sum(v), by = list(i5 = 1:5 %% 2L), verbose = TRUE],
data.table(i5 = 1:0, V1 = c(0, 0)), output = 'i clause present but columns used in by not detected')
# gforce integer overflow coerce to double
DT = data.table(A=1:5, B=-3i, C=2147483647L)
test(1968.2, storage.mode(DT$C), "integer")
test(1968.3, DT[, sum(C), by=A%%2L], data.table(A=c(1L,0L), V1=c(6442450941, 4294967294)),
warning="sum.*integer column.*more than type 'integer' can hold.*coerced to 'numeric'")
DT[3,C:=NA]
test(1968.4, DT[, sum(C), by=A%%2L], data.table(A=c(1L,0L), V1=c(NA, 4294967294)), warning="coerced to 'numeric'")
test(1968.5, DT[, sum(C,na.rm=TRUE), by=A%%2L], data.table(A=c(1L,0L), V1=c(4294967294, 4294967294)), warning="coerced to 'numeric'")
DT[4,C:=NA]
test(1968.6, DT[, sum(C,na.rm=TRUE), by=A%%2L], data.table(A=c(1L,0L), V1=c(4294967294, 2147483647)), warning="coerced to 'numeric'")
DT[2,C:=NA]
test(1968.7, DT[, sum(C,na.rm=TRUE), by=A%%2L], data.table(A=c(1L,0L), V1=c(4294967294, 0)), warning="coerced to 'numeric'")
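# Illustration only (not a counted test): 2147483647L above is .Machine$integer.max, so grouped sums
# of column C cannot be represented as integer, hence the coercion to 'numeric' warned about above.
stopifnot(identical(2147483647L, .Machine$integer.max))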
# fsetequal and last col a character #2318
dt.1 <- data.table(Id=(1:10))
dt.2 <- data.table(Id=(1:10))
dt.2[1, Id:=99]
test(1969.1, fsetequal(dt.1, dt.2), FALSE)
dt.1[, Id := as.character(Id)]
dt.2[, Id := as.character(Id)]
test(1969.2, fsetequal(dt.1, dt.2), FALSE)
x = data.table(v = "foo", a = "my string")
y = data.table(v = "foo", a = "not my string")
test(1969.3, fsetequal(x, y), FALSE)
x = data.table(v = "foo", a = "my string")
y = data.table(v = "foo", a = "not my string")
x = rbind(x, x)
y = rbind(y, y)
test(1969.4, fsetequal(x, y), FALSE)
x = rbind(x, y)
y = rbind(y, x)
test(1969.5, fsetequal(x, y), FALSE)
# empty .SDcols, #3185 and comments in #3211
DT = data.table(x=1:3, y=4:6)
test(1970.1, DT[, .SD, .SDcols=integer(0L)], data.table(NULL))
test(1970.2, DT[, .SD, .SDcols=character(0L)], data.table(NULL))
test(1970.3, DT[, rowSums(.SD), .SDcols=integer()], numeric())
test(1970.4, DT[, rowSums(.SD), .SDcols=character()], numeric())
test(1970.5, DT[, z:=rowSums(.SD), .SDcols=integer()], data.table(x=1:3, y=4:6, z=NA_real_))
test(1970.6, DT[, z:=rowSums(.SD), .SDcols=integer()], error="RHS of assignment to existing column 'z' is zero length but not NULL")
test(1970.7, DT[, z:=NULL], data.table(x=1:3, y=4:6))
test(1970.8, DT[, z:=rowSums(.SD), .SDcols=character()], data.table(x=1:3, y=4:6, z=NA_real_))
test(1970.9, DT[, z:=rowSums(.SD), .SDcols=character()], error="RHS of assignment to existing column 'z' is zero length but not NULL")
# .SDcols=patterns(), #1878
DT = data.table(
i = 1:10,
c = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"),
V1 = c(0.4, -0.1, -1.1, -2.6, -0.1, -1.3, 0.3, -2.1, -0.6, 0.9),
V2 = c(-0.1, -2.5, -1, -0.1, -0.5, -0.7, -1, -2.1, 2.7, -1.2),
V3 = c(1.1, -1.6, 0.7, 1.6, -1.4, 1, -0.6, 1.2, -0.8, 0.1),
V4 = c(1.3, -0.8, 2.3, -0.7, 0.5, 0.5, 0.2, 0.7, -1.4, 0.8),
V5 = c(-0.1, -0.5, 1.5, -0.5, 1.9, 0.2, -0.1, -0.7, -1.7, -0.9),
V6 = c(0.8, -1.3, -0.7, -0.3, 1.4, 0.7, 0.4, 0.3, -1.6, -1.3),
V7 = c(-0.1, 0.8, 0.7, -0.2, -2, 0.5, 0.4, -0.2, -1.2, -0.7),
V8 = c(0.7, -1, 1.3, 0.5, 0.2, 0.8, 0.6, -1.4, -2, -0.1),
V9 = c(0.2, -0.1, 1.2, -0.5, 1.4, 1, 0.2, 0.7, 0.4, 1.6),
V10 = c(0.8, 0.7, -1.2, -0.9, -0.6, 0.4, -2.3, 2.2, 0.5, -1.4)
)
test(1971.1, DT[ , lapply(.SD, sum), .SDcols = patterns('^V')],
data.table(V1=-6.3, V2=-6.5, V3=1.3, V4=3.4, V5=-0.9, V6=-1.6, V7=-2, V8=-0.4, V9=6.1, V10=-1.8))
# multiple patterns --> intersection of patterns
test(1971.2, DT[ , lapply(.SD, sum), .SDcols = patterns('^V[02468]', '^V[48]')],
data.table(V4=3.4, V8=-0.4))
# also with !/- inversion
test(1971.3, DT[ , lapply(.SD, sum), .SDcols = !patterns('^c|i')],
data.table(V1=-6.3, V2=-6.5, V3=1.3, V4=3.4, V5=-0.9, V6=-1.6, V7=-2, V8=-0.4, V9=6.1, V10=-1.8))
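# Illustration only (not a counted test); a minimal sketch of the intersection rule in 1971.2 above:
# with several patterns, a column is kept only if it matches every pattern.
illust_dt = data.table(ab=1L, a=2L, b=3L)
stopifnot(identical(names(illust_dt[, .SD, .SDcols=patterns("a", "b")]), "ab"))
rm(illust_dt)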
# split.data.table now preserves attributes #2047
dt = data.table(x=1:10, y=rep(1:2,each=5))
setattr(dt, "a", 5)
test(1972.1, attr(split(dt,rep(1:2,each=5))[[1]],"a"), 5)
test(1972.2, attr(split(dt, by="y")[[1]],"a"), 5)
# select columns from i table; regression in dev caught by revdep testing, #3233
DT = data.table(a=1:3, b=4:6, key="a")
K = data.table(a=2:3, FOO=9L, BAR=12L)
test(1973.1, DT[K, "FOO"], data.table(FOO=c(9L,9L)))
test(1973.2, DT[K, "FOO", with=FALSE], data.table(FOO=c(9L,9L)))
var = "b"
test(1973.3, DT[K, c(var, "FOO")], c("b","FOO"))
test(1973.4, DT[K, c(..var, "FOO")], ans<-data.table(b=5:6, FOO=9L))
test(1973.5, DT[K, c(var, "FOO"), with=FALSE], ans)
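# Illustration only (not a counted test); a minimal sketch of the ..prefix used in 1973.4 above,
# assuming only documented behaviour: ..cols tells [.data.table to look 'cols' up in the calling
# scope and treat its value as column names.
cols = "a"
illust_dt = data.table(a=1:2, b=3:4)
stopifnot(identical(names(illust_dt[, ..cols]), "a"))
rm(cols, illust_dt)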
# no error when j is supplied but inherits missingness from caller
DT = data.table(a=1:3, b=4:6)
f = function(cols) DT[,cols]
test(1974.1, f(), output="a.*b.*3:.*6")
f = function(cols) DT[,cols,with=FALSE]
test(1974.2, f(), output="a.*b.*3:.*6")
# na.rm=TRUE should remove NaN; a regression caught in dev 1.11.9 before release
DT = data.table(id=INT(1,1,2,2,2), v=c(1.1,2.2,3.3,NaN,3.4))
test(1975.1, DT[,sum(v),by=id], data.table(id=INT(1,2), V1=c(3.3,NaN)))
test(1975.2, DT[,sum(v,na.rm=TRUE),by=id], data.table(id=INT(1,2), V1=c(3.3,6.7)))
test(1975.3, DT[,mean(v),by=id], data.table(id=INT(1,2), V1=c(1.65,NaN)))
test(1975.4, DT[,mean(v,na.rm=TRUE),by=id], data.table(id=INT(1,2), V1=c(1.65,3.35)))
# rownames of .SD when a symbol in {} masks another column name
DT = data.table(ID=INT(1,1,2,2,2), FOO=1:5, BAR=1:5)
test(1976.1, DT[, length(rownames(.SD)), by=ID, .SDcols="FOO"], data.table(ID=1:2, V1=2:3)) # ok
test(1976.2, DT[, {BA=1; length(rownames(.SD))}, by=ID, .SDcols="FOO"], data.table(ID=1:2, V1=2:3)) # ok
test(1976.3, DT[, {BAR=1; length(rownames(.SD))}, by=ID, .SDcols="FOO"], data.table(ID=1:2, V1=2:3)) # BAR masks column name which affected rownames(.SD) in dev 1.11.9
# NA in join and remove character names too, #3245
DT = data.table(ID=rep(c("A","B","C"),each=2), GRP=rep(1:3,each=2), X=c(1.1,2.2,3.3,4.4,5.5,6.6), key="ID")
test(1977.1, DT["A"], data.table(ID="A", GRP=1L, X=c(1.1,2.2), key="ID"))
test(1977.2, DT["A", -"GRP"], data.table(ID="A", X=c(1.1,2.2), key="ID"))
test(1977.3, DT["D"], data.table(ID="D", GRP=NA_integer_, X=NA_real_, key="ID"))
test(1977.4, DT["D", -"GRP"], data.table(ID="D", X=NA_real_, key="ID"))
test(1977.5, DT["D", c("ID","GRP")], data.table(ID="D", GRP=NA_integer_, key="ID"))
test(1977.6, DT[c("A","D"), c("ID","GRP")], data.table(ID=c("A","A","D"), GRP=INT(1,1,NA)))
# catch malformed factor in rbindlist, #3315
set.seed(32940)
NN=7e5; KK=4e4; TT=25
DT = data.table( id = sample(KK, NN, TRUE), tt = sample(TT, NN, TRUE), ff = factor(sample(3, NN, TRUE)) )
test(1978, print(DT[ , diff(ff), by = id]), error="Column 2 of item 1 has type 'factor' but has no levels; i.e. malformed.") # the print invokes rbindlist which bites
# Drop NULL values from `j` list elements #1406
DT = data.table(a = 1:3,b = letters[1:3],c = LETTERS[1:3])
test(1979, DT[,.(a,b,if(FALSE)c)], DT[,c("a","b")])
# array name is not recognized when constructing a data.table #789
x <- as.array(1:5)
test(1980, names(data.table(x)), "x")
# crash when n="lead", #3354
DT = data.table( id = 1:5 , val = letters[1:5] )
test(1981.1, DT[, new_col := shift(val, "lead")], error="is.numeric(n) is not TRUE")
test(1981.2, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA")
# print of DT with many columns reordered them, #3306.
DT = as.data.table(lapply(1:255, function(i)rep.int(i, 105L))) # 105 to be enough for 'top 5 ... bottom 5' to print
out = capture.output(print(DT))
tt = out[grep("V",out)]
tt = unlist(strsplit(gsub(" ","",tt), "V"))
test(1982.1, tt[1L], "")
tt = as.integer(tt[tt!=""])
test(1982.2, tt, seq_along(tt))
# parse(text = 'list(`\\phantom{.}`)') fails, #3319
DT <- data.table(x=1, y=1:5)
setnames(DT, "x", "\\phantom{.}")
test(1983.1, DT[, .(y=mean(y)), keyby="\\phantom{.}"], data.table(`\\phantom{.}`=1, y=3, key='\\phantom{.}'))
# keyby = 'x y' fails, #3378
DT <- data.table(x=1:5, y=letters[1:5])
setnames(DT, "x", "x y")
test(1983.2, DT[, y:="j", keyby="x y"], data.table(`x y`=1:5, y='j', key='x y'))
# more coverage tests
DT = data.table(a = 1:10)
test(1984.01, DT[NULL], data.table(NULL))
test(1984.02, DT[DT, on = .(a > a), roll = TRUE], error='roll is not implemented for non-equi joins yet')
DT[ , b := 10:1]
test(1984.03, DT[ , -2L, with = FALSE], DT[ , .(a)])
test(1984.04, DT[ , mean(b), by = eval(expression(a %% 2))], data.table(expression = c(1, 0), V1 = c(6, 5)))
DT[ , c := 1:10]
setindex(DT, c)
test(1984.05, DT[ , sum(b), keyby = c, verbose = TRUE],
data.table(c = 1:10, V1 = 10:1, key = 'c'),
output = "by index 'c' but that index has 0 length")
### hitting byval = eval(bysub, setattr(as.list(seq_along(xss)), ...)
test(1984.06, DT[1:3, sum(a), by=b:c], data.table(b=10:8, c=1:3, V1=1:3))
test(1984.07, DT[, sum(a), by=call('sin',pi)], error='must evaluate to a vector or a list of vectors')
test(1984.08, DT[, sum(a), by=as.raw(0)], error='column or expression.*type raw')
test(1984.09, DT[, sum(a), by=.(1,1:2)], error='The items.*list are length[(]s[)] [(]1,2[)].*Each must be length 10; .*rows in x.*after subsetting')
options('datatable.optimize' = Inf)
test(1984.10, DT[ , 1, by = .(a %% 2), verbose = TRUE],
data.table(a = c(1, 0), V1 = c(1, 1)),
output = 'Optimization is on but left j unchanged')
DT[ , f := rep(1:2, each = 5)]
test(1984.11, DT[ , g:=sum(a), keyby=f, verbose=TRUE][,sum(g)], 275L, output='setkey() after the := with keyby=')
test(1984.12, as.matrix(DT, rownames=character(0L)), error='length(rownames)==0 but should be')
test(1984.13, DT[matrix(1, 1), 1] <- 4, error='When i is a matrix in DT[i]<-value syntax')
test(1984.14, DT[1, list(2)] <- 4, error='j must be an atomic vector')
test(1984.15, DT[1, NA] <- 4, error='NA in j')
test(1984.16, DT[1, 1+3i] <- 4, error='j must be vector of')
test(1984.17, dimnames(DT) <- 5, error = 'attempting to assign invalid object')
test(1984.18, dimnames(DT) <- list(5, 5, 5), error = 'attempting to assign invalid object')
test(1984.19, dimnames(DT) <- list(5, 5), error = 'data.tables do not have rownames')
test(1984.20, dimnames(DT) <- list(NULL, 5), error = "Can't assign 1 colnames")
dimnames(DT) <- list(NULL, 1:5)
test(1984.21, names(DT), paste0(1:5))
DT = data.table(a = 1:10)
test(1984.22, na.omit(DT, invert = 'a'), error="'invert' must be logical")
test(1984.23, na.omit(DT, cols = 'b'), error="specify non existing column*.*b")
#test(1984.24, na.omit(DT, cols = c('b', 'c')), error="Columns [b, c] don't") # only first non-existing col is now reported for efficiency
### idcol = TRUE behavior of rbindlist
test(1984.25, rbindlist(list(DT[1L], DT[2L]), idcol = TRUE), data.table(.id=1:2, a=1:2))
test(1984.26, setalloccol(`*tmp*`), error='setalloccol attempting to modify `*tmp*`')
DF = as.data.frame(DT)
test(1984.27, shallow(DF), error='x is not a data.table')
test(1984.28, split.data.table(DF), error='argument must be a data.table')
test(1984.29, split(DT, by='a', f='a'), error="passing 'f' argument together with 'by' is not allowed")
test(1984.30, split(DT), error="Either 'by' or 'f' argument must be supplied")
setnames(DT, '.ll.tech.split')
test(1984.31, split(DT, by = '.ll.tech.split'), error="Column '.ll.tech.split' is reserved")
setnames(DT, '.nm.tech.split')
test(1984.32, split(DT, by = '.nm.tech.split'), error="Column '.nm.tech.split' is reserved")
test(1984.33, split(DT, by = 'a'), error="Argument 'by' must refer to column names in x")
setnames(DT, 'a')
test(1984.34, split(DT[1:3], by = 'a', verbose = TRUE),
list(`1` = data.table(a = 1L),
`2` = data.table(a = 2L),
`3` = data.table(a = 3L)),
output = 'Processing split.data.table with')
### partially-named behavior on setDF
l = list(a = 1:3, 4:6)
test(1984.35, setDF(l), data.frame(a=1:3, V1=4:6))
## setDT on sub-environment
e = new.env()
e$DF = data.frame(a = 5)
test(1984.36, class(setDT(e$DF))[1L], 'data.table')
test(1984.37, class(setDT(l$df))[1L], error='not found in names of input list')
## rowid/rowidv
DT = data.table(a = c(1:3, 3:1))
test(1984.38, rowidv(DT, prefix = 5L), error='must be NULL or a character vector')
test(1984.39, rowidv(DT, prefix = c('hey','you')), error='must be NULL or a character vector')
# Test for #3349, foverlaps returned spurious results with POSIXct objects < 1970-01-01
x <- data.table(val=178.41,s=as.POSIXct("1968-04-25 04:20:00"),e=as.POSIXct("1968-04-25 04:20:00"))
y <- data.table(event="#1",s=as.POSIXct("1968-04-19 15:20:00"),e=as.POSIXct("1968-04-24 07:20:00"))
setkey(y, s, e)
# x$s and x$e are identical (i.e., the range is actually a point, but that's okay). The point is to ensure that 'x' is not within 'y'. In older versions this was a match because 'incr' was 1+dt_eps() instead of 1-dt_eps() (the date here is < 1970-01-01, i.e. a negative numeric value internally); 1+dt_eps() should apply only to positive numerics.
test(1985.1, nrow(foverlaps(x, y, by.x=c("s", "e"), type="within", nomatch=0L)), 0L)
# colMeans grouped on empty data.table
DT = as.data.table(mtcars)[0L]
test(1986.1, DT[, colMeans(.SD), by=gear], data.table(gear=numeric(), V1=numeric()))
test(1986.2, DT[, as.list(colMeans(.SD)), by=gear], cbind(DT[,"gear"],DT[,-"gear"]))
DT = as.data.table(mtcars)[1]
test(1986.3, DT[, colMeans(.SD), by=gear], data.table(gear=4, V1=c(21,6,160,110,3.9,2.62,16.46,0,1,4)))
test(1986.4, DT[, as.list(colMeans(.SD)), by=gear], cbind(DT[,"gear"],DT[,-"gear"]))
# tests for #2949, #1974 and #1369 - dcast not able to handle functions referred to by a variable
dt = data.table(
x=sample(5,20,TRUE),
y=sample(2,20,TRUE),
z=sample(letters[1:2], 20,TRUE),
d1 = runif(20),
d2=1L
)
myFun1 <- function(data, vars) {
mySum <- function(x) sum(x)
dcast.data.table(data, "x + y ~ z", value.var=vars, fun.aggregate=mySum)
}
myFun2 <- function(data, vars) {
myFuns <- list(f1=sum, first=function(x) x[1L])
dcast.data.table(data, "x + y ~ z", value.var=vars, fun.aggregate=myFuns)
}
funs = list(sum, mean)
vars = list("d1", "d2")
test(1987.1, names(dcast.data.table(dt, x + y ~ z, fun.aggregate=funs, value.var=vars)),
c("x", "y", "d1_fun1_a", "d1_fun1_b", "d2_fun2_a", "d2_fun2_b"))
test(1987.2, dcast.data.table(dt, x + y ~ z, fun.aggregate=sum, value.var=vars[[1]]),
myFun1(dt, vars[[1]]))
test(1987.3, dcast.data.table(dt, x + y ~ z, fun.aggregate=list(f1=sum, first=function(x) x[1L]), value.var=vars), myFun2(dt, vars))
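# Illustration only (not a counted test); a minimal sketch of the fix tested above (#1974/#2949),
# assuming only documented dcast behaviour: fun.aggregate may be a function held in a variable.
illust_dt = data.table(id=c(1L,1L,2L), g=c("a","b","a"), v=1:3)
illust_agg = sum
stopifnot(identical(names(dcast.data.table(illust_dt, id ~ g, value.var="v", fun.aggregate=illust_agg)),
                    c("id","a","b")))
rm(illust_dt, illust_agg)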
# testing frankv/forder behavior with NA/NaN; earlier tests compare consistency with base::rank,
# but we intentionally break from base w.r.t. ranking NAs (we consider NAs to be tied, ditto NaN)
x = data.table(r = c(6, 4, 2, NA, 1, NaN, 5, NaN, 9, 10, NA))
## frankv
test(1988.1, frankv(x, cols='r', order=1L, ties.method='average'), c(5, 3, 2, 10.5, 1, 8.5, 4, 8.5, 6, 7, 10.5))
test(1988.2, frankv(x, cols='r', order=1L, ties.method='max'), c(5L, 3L, 2L, 11L, 1L, 9L, 4L, 9L, 6L, 7L, 11L))
test(1988.3, frankv(x, cols='r', order=1L, ties.method='min'), c(5L, 3L, 2L, 10L, 1L, 8L, 4L, 8L, 6L, 7L, 10L))
test(1988.4, frankv(x, cols='r', order=1L, ties.method='dense'), c(5L, 3L, 2L, 9L, 1L, 8L, 4L, 8L, 6L, 7L, 9L))
## forderv
test(1988.5, forderv(x, by='r', order=1L, na.last=FALSE), c(4L, 11L, 6L, 8L, 5L, 3L, 2L, 7L, 1L, 9L, 10L))
test(1988.6, forderv(x, by='r', order=-1L, na.last=FALSE), c(4L, 11L, 6L, 8L, 10L, 9L, 1L, 7L, 2L, 3L, 5L))
test(1988.7, forderv(x, by='r', order=1L, na.last=TRUE), c(5L, 3L, 2L, 7L, 1L, 9L, 10L, 6L, 8L, 4L, 11L))
test(1988.8, forderv(x, by='r', order=-1L, na.last=TRUE), c(10L, 9L, 1L, 7L, 2L, 3L, 5L, 6L, 8L, 4L, 11L))
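# Illustration only (not a counted test); contrasting with base, assuming only the behaviour verified
# above: base::rank() gives NAs distinct trailing ranks, while frank() treats NAs (and NaN) as tied.
illust_v = c(2, NA, NA)
stopifnot(all.equal(as.numeric(rank(illust_v)), c(1, 2, 3)),
          all.equal(frank(illust_v), c(1, 2.5, 2.5)))
rm(illust_v)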
# Test should not segfault, #3401 fix:
set.seed(1L)
foo <- function(n) apply(matrix(sample(letters, 4*n, TRUE), ncol=4L), 1, paste, collapse="")
dates <- sample(seq(as.Date("2015-01-01"), as.Date("2018-12-31"), by="day"), 300L)
codes <- foo(2000)
DT1 <- data.table(
date=sample(dates[1:100], 1e4, TRUE),
flight=sample(codes[1:1300], 1e4, TRUE),
val=runif(1e4)
)
DT2 <- data.table(
start=sample(dates[80:250], 1e4, TRUE),
end=sample(dates[100:300], 1e4, TRUE),
flight=sample(codes[1200:2000], 1e4, TRUE)
)
DT2[, c("start", "end") := .(pmin(start, end), pmax(start, end))]
# just testing if the code runs without segfault ..
test(1989.1, nrow(DT1[DT2, on=.(date <= end, date >= start, flight==flight)]) > 0L, TRUE)
# fix for #2202, dcast needs to rank NA with na.last=FALSE in frankv within dcast
x = data.table(
f1 = structure(c(1L, 1L, 1L, 1L, NA, NA), .Label = "123456", class = "factor"),
f2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "U", class = "factor"),
v1 = c(0,300, 600, 500, 0, 800),
v2 = c(0,15, 50, 30, 0, 50))
y = x[, lapply(.SD, sum), by=.(f1,f2)]
z = dcast.data.table(data = y, f1 ~ f2, value.var = c("v1", "v2"))
test(1990.1, z$v1_U, c(800, 1400)) # shouldn't be 1400, 800 ...
test(1990.2, forderv(z$v1_U), integer(0)) # is already sorted ...
# foverlaps error message fix, #2645
x <- data.table(id=1, start=1, end=5, start_f=factor(1))
y <- data.table(id=2, start=2, end=6, start_f=factor(2))
setkey(x, start_f, end)
setkey(y, start, end)
test(1991.1, foverlaps(x, y), error="must be integer/numeric type")
setkey(x, start, end)
setkey(y, start_f, end)
test(1991.2, foverlaps(x, y), error="must be integer/numeric type")
# better error message foverlaps, #3007
DT_x <- data.table(x1 = c(1,7,11,20), x2 = c(2,8,NA,22), xval = c("x_a","x_b","x_c","x_d"), key = c("x1","x2"))
DT_y <- data.table(y1 = c(1,10), y2 = c(9,50), yval = c("y_a","y_b"), key = c("y1","y2"))
test(1992.1, foverlaps(DT_x, DT_y), error="All rows with NA values")
# foverlaps POSIXct checks #1143 + another check...
xp <- data.frame(year = c(2006L, 2006L, 2006L), day = c(361L, 361L, 360L),
hour = c(14L, 8L, 8L), min = c(30L, 0L, 30L), val = c(0.5, 0.3, 0.4),
Date = structure(c(1167229800, 1167206400, 1167121800),
class = c("POSIXct", "POSIXt"), tzone = "UTC")) ## "UTC" time zone
setDT(xp)[, `:=`(start = Date - 1800L, end = Date + 1800L)]
tt <- as.POSIXct(c("2006/12/27 14:23:59", "2006/12/27 16:47:59", "2006/12/27 19:12:00"), format = "%Y/%m/%d %T", tz = "Asia/Jerusalem") ## different time zone
yp <- data.table(start = tt, end = tt, key=c("start", "end"))
test(1993.1, foverlaps(xp, yp, nomatch = 0L, which=TRUE), data.table(xid=1L, yid=2L), warning="POSIXct interval cols have mixed timezones")
test(1993.2, foverlaps(xp, yp, by.x=c("day", "year")), error="Some interval cols are of type POSIXct while others are not")
# forderv NaN,Inf and Inf when at most 1 finite value is present, #3381. These broke in v1.12.0. They pass in v1.11.8.
test(1994.1, forderv(c(NaN, Inf, -Inf), retGrp=TRUE), structure(INT(1,3,2), starts=1:3, maxgrpn=1L))
test(1994.2, forderv(c(-Inf, 0, Inf), retGrp=TRUE), structure(integer(), starts=1:3, maxgrpn=1L))
test(1994.3, forderv(c(-Inf, Inf), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L))
test(1994.4, forderv(c(Inf, -Inf), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L))
test(1994.5, forderv(c(0, NaN), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L))
test(1994.6, forderv(c(NaN, 0), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L))
test(1994.7, data.table(A=c(-Inf,21,Inf),V=1:3)[,sum(V),by=A]$V1, 1:3)
# 0 length items should not result in no-recycle error, #3386
DT = fread("symbol,year,quarter,price
A1,2017,1,10.0
A1,2017,2,11.0
A1,2017,3,12.0
A1,2017,4,11.0
A1,2018,1,12.0
A1,2018,2,13.0
A2,2017,1,10.0
A2,2017,2,11.0
A2,2017,3,12.0
A2,2017,4,11.0
A2,2018,1,12.0")
test(1995.1, DT[, pQ4 := price[quarter==4L], by=.(symbol,year)]$pQ4, rep(c(11,NA,11,NA), c(4,2,4,1)))
DT = fread("symbol,date,price,volume
A1,20180102,10.0,0
A1,20180103,10.0,0
A1,20180104,11.0,100
A2,20180102,5.0,0
A2,20180103,5.0,0
A2,20180104,5.0,0")
test(1995.2, DT[, p1:=first(price[volume>0]), by=symbol]$p1, rep(c(11,NA), c(3,3)))
# quoted `:=` expression did not replace dot with list, #3425
d = data.table(a=1L)
qcall = quote(b := .(2L))
test(1996.1, d[, eval(qcall)], data.table(a=1L, b=2L))
qcall = quote(b := .(.Primitive("sum")(1, 2))) # such calls can be issued by dcast
d = data.table(a=1L)
test(1996.2, d[, eval(qcall)], data.table(a=1L, b=3))
# setDTthreads; #3435
test(1997.01, setDTthreads(NULL, percent=75), error="Provide either threads= or percent= but not both")
test(1997.02, setDTthreads(1L, percent=75), error="Provide either threads= or percent= but not both")
test(1997.03, setDTthreads(-1L), error="threads= must be either NULL or a single number >= 0")
test(1997.04, setDTthreads(percent=101), error="should be a number between 2 and 100")
test(1997.05, setDTthreads(percent=1), error="should be a number between 2 and 100")
test(1997.06, setDTthreads(percent=NULL), error="but is length 0")
test(1997.07, setDTthreads(percent=1:2), error="but is length 2")
test(1997.08, setDTthreads(restore_after_fork=21), error="must be TRUE, FALSE, or NULL")
old = getDTthreads() # (1)
oldenv1 = Sys.getenv("R_DATATABLE_NUM_PROCS_PERCENT")
oldenv2 = Sys.getenv("R_DATATABLE_NUM_THREADS")
Sys.setenv(R_DATATABLE_NUM_THREADS="") # in case user has this set, so we can test PROCS_PERCENT
Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="3.0")
test(1997.09, setDTthreads(), old, warning="Ignoring invalid.*Please remove any.*not a digit")
new = getDTthreads() # old above at (1) may not have been default. new now is.
test(1997.10, getDTthreads(), new)
Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="1")
test(1997.11, setDTthreads(), new, warning="Ignoring invalid.*integer between 2 and 100")
test(1997.12, getDTthreads(), new)
Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="75")
test(1997.13, setDTthreads(), new)
new = getDTthreads()
setDTthreads(percent=75)
test(1997.14, getDTthreads(), new)
Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="100")
setDTthreads()
allcpu = getDTthreads()
Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="75")
Sys.setenv(R_DATATABLE_NUM_THREADS=allcpu)
setDTthreads()
test(1997.15, getDTthreads(), allcpu)
Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT=oldenv1)
Sys.setenv(R_DATATABLE_NUM_THREADS=oldenv2)
test(1997.16, setDTthreads(old), allcpu)
test(1997.17, getDTthreads(), old)
test(1997.18, setDTthreads(throttle=NA), error="throttle.*must be a single number, non-NA, and >=1")
setDTthreads(throttle=65536)
test(1997.19, getDTthreads(TRUE), output="throttle==65536")
setDTthreads(throttle=1024)
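# Illustration only (not a counted test): getDTthreads() merely reports the current setting restored
# above; it is a single value >= 1.
stopifnot(length(getDTthreads()) == 1L, getDTthreads() >= 1L)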
# test that a copy is being made and output is printed, #3385 after partial revert of #3281
x = 5L
test(1998.1, as.IDate(x), output = '1970-01-06')
test(1998.2, class(x), 'integer')
# a single non-match (NA) at the beginning with no other nomatch would cause an incorrect key, #3441
dx = data.table(id = "A", key = "id")
di = list(c("D", "A"))
test(1999.1, key(dx[di]), NULL)
dx = data.table(id = 1L, key = "id")
di = list(z=c(2L, 1L))
test(1999.2, key(dx[di]), NULL)
# chmatchdup test from benchmark at the bottom of chmatch.c
set.seed(45L)
x = sample(letters, 1e5, TRUE)
y = sample(letters, 1e6, TRUE)
test(2000, c(head(ans<-chmatchdup(x,y,0L)),tail(ans)), INT(7,49,11,20,69,25,99365,100750,97596,99671,103320,99406))
rm(list=c("x","y"))
# rbindlist use.names=TRUE returned random column order when ncol>255; #3373
DT = setDT(replicate(300, rnorm(3L), simplify = FALSE))
test(2001.1, colnames(rbind(DT[1], DT[3])), colnames(DT))
# and use.names=TRUE keeps dups in original location; mentioned in #3373
DT1 = data.table(a=1L, b=3L, c=5L, b=7L)
DT2 = data.table(a=2L, b=4L, c=6L, b=8L)
test(2001.2, rbind(DT1, DT2, use.names = TRUE), data.table(a=1:2, b=3:4, c=5:6, b=7:8)) # dup of b at the end; was a,b,b,c
# rbindlist now fills NULL and empty columns with NA with warning, #1871
test(2002.01, rbindlist( list(list(a=1L, b=2L, x=NULL), list(a=2L, b=3L, x=10L)) ),
data.table(a=1:2, b=2:3, x=INT(NA,10)),
warning="Column 3 ['x'] of item 1 is length 0. This (and 0 others like it) has been filled with NA (NULL for list columns) to make each item uniform.")
test(2002.02, rbindlist( list(list(a=1L, b=2L, x=NULL), list(a=2L, b=NULL, x=10L)) ),
data.table(a=1:2, b=INT(2,NA), x=INT(NA,10)),
warning="Column 3 ['x'] of item 1 is length 0. This (and 1 other like it) has been filled with NA (NULL for list columns) to make each item uniform.")
test(2002.03, rbindlist( list(list(a=1L, b=2L, x=NULL), list(a=2L, b=NULL, x=NULL)) ),
data.table(a=1:2, b=INT(2,NA), x=c(NA,NA)),
warning="Column 3 ['x'] of item 1 is length 0. This (and 2 others like it) has been filled with NA (NULL for list columns) to make each item uniform.")
# tests from #1302
test(2002.04, rbindlist( list(list(a=1L,z=list()), list(a=2L, z=list("m"))) ),
data.table(a=1:2, z=list(NULL, "m")),
warning="Column 2 ['z'] of item 1 is length 0. This (and 0 others like it) has been filled with NA")
test(2002.05, rbindlist( list( list(a=1L, z=list("z")), list(a=2L, z=list(c("a","b"))) )),
data.table(a=1:2, z=list("z", c("a","b"))))
test(2002.06, rbindlist( list( list(a=1:2, z=list("z",1,"k")), list(a=2, z=list(c("a","b"))) )),
error="Column 1 of item 1 is length 2 inconsistent with column 2 which is length 3. Only length-1 columns are recycled.")
test(2002.07, rbindlist( list(list(a=1L, z=list(list())), list(a=2L, z=list(list("m")))) ),
data.table(a=1:2, z=list(list(),list("m"))))
test(2002.08, rbindlist( list(list(a=1L, z=list(list("z"))), list(a=2L, z=list(list(c("a","b"))))) ),
data.table(a=1:2, z=list(list("z"), list(c("a","b")))))
test(2002.09, rbindlist( list(list(a=1L, z=list(list("z",1))), list(a=2L, z=list(list(c("a","b"))))) ),
data.table(a=1:2, z=list(list("z",1), list(c("a","b")))))
# tests from #3343
DT1=list(a=NULL); setDT(DT1)
DT2=list(a=NULL); setDT(DT2)
test(2002.10, rbind(DT1, DT2), data.table(a=logical()))
test(2002.11, rbind(A=DT1, B=DT2, idcol='id'), data.table(id=character(), a=logical()))
test(2002.12, rbind(DT1, DT2, idcol='id'), data.table(id=integer(), a=logical()))
# rbindlist coverage
test(2003.1, rbindlist(list(), use.names=1), error="use.names= should be TRUE, FALSE, or not used [(]\"check\" by default[)]")
test(2003.2, rbindlist(list(), fill=1), error="fill= should be TRUE or FALSE")
test(2003.3, rbindlist(list(data.table(a=1:2), data.table(b=3:4)), fill=TRUE, use.names=FALSE),
data.table(a=c(1:2,NA,NA), b=c(NA,NA,3:4)),
warning="use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE")
# chmatch coverage for two different non-ascii encodings matching; issues mentioned in comments in chmatch.c #69 #2538 #111
x1 = "fa\xE7ile"
Encoding(x1) = "latin1"
x2 = iconv(x1, "latin1", "UTF-8")
test(2004.1, identical(x1,x2))
test(2004.2, Encoding(x1)!=Encoding(x2))
test(2004.3, chmatch(c("a",x1,"b"), x2), c(NA,1L,NA)) # x contains mixed; covers first fallback in chmatchMain
test(2004.4, c("a",x1,"b") %chin% x2, c(FALSE,TRUE,FALSE)) # and the chin switch in the same fallback
test(2004.5, chmatch(c("a","b"), c("b",x1)), c(NA,1L)) # x doesn't contain encodings so covers the second fallback in chmatchMain
test(2004.6, chmatch(c("a","b"), c("b",x2)), c(NA,1L)) # the second fallback might be redundant though; see comments in chmatch.c
test(2004.7, c("a","b") %chin% c("b",x1,x2), c(FALSE, TRUE)) # ditto for %chin% via the same fallback
# more coverage ...
test(2005.01, truelength(NULL), 0L)
DT = data.table(a=1:3, b=4:6, c=as.raw(7:9), d=as.logical(c(1,0,1)), e=pi*1:3, f=as.complex(10:12))
test(2005.02, set(DT, 4L, "b", NA), error="i[1] is 4 which is out of range [1,nrow=3]")
test(2005.03, set(DT, 3L, 8i, NA), error="j is type 'complex'. Must be integer, character, or numeric is coerced with warning.")
test(2005.04, set(DT, 1L, 2L, expression(x+2)), error="type 'expression' cannot be coerced to 'integer'") # similar to R's error for as.integer(expression(x+2))
DT[,foo:=factor(c("a","b","c"))]
test(2005.05, DT[2, foo:=8i], error="Can't assign to column 'foo' (type 'factor') a value of type 'complex' (not character, factor, integer or numeric)")
test(2005.06, DT[2, a:=9, verbose=TRUE], notOutput="Coerced")
test(2005.07, DT[2, a:=NA, verbose=TRUE], notOutput="Coerced")
test(2005.08, DT[2, a:=9.9]$a, INT(1,9,3), warning="9.9.*double.*position 1 truncated.*integer.*column 1 named 'a'")
test(2005.09, set(DT, 1L, "c", expression(x+2)), error="type 'expression' cannot be coerced to 'raw'")
test(2005.10, set(DT, 1L, "d", expression(x+2)), error="type 'expression' cannot be coerced to 'logical'")
test(2005.11, set(DT, 1L, "e", expression(x+2)), error="type 'expression' cannot be coerced to 'double'")
test(2005.12, set(DT, 1L, "f", expression(x+2)), error="type 'expression' cannot be coerced to 'complex'")
test(2005.30, DT[2:3,c:=c(TRUE,FALSE), verbose=TRUE]$c, as.raw(INT(7,1,0)),
output="Zero-copy coerce when assigning 'logical' to 'raw' column 3 named 'c'")
test(2005.31, set(DT,1L,"c",NA)$c, as.raw(INT(0,1,0)))
test(2005.32, set(DT,1:2,"c",INT(-1,255))$c, as.raw(INT(0,255,0)),
warning="-1.*integer.*position 1 taken as 0 when assigning.*raw.*column 3 named 'c'")
test(2005.33, DT[2:3,c:=INT(NA,256)]$c, as.raw(INT(0,0,0)),
warning="-2147483648.*integer.*position 1 taken as 0 when assigning.*raw.*column 3 named 'c'")
test(2005.34, set(DT,2:3,"c",c(NA,3.14))$c, as.raw(INT(0,0,3)),
warning="[nN].*double.*position 1 either truncated.*or taken as 0 when assigning.*raw.*column 3 named 'c'") # 'nan' for me but might vary hence [nN]
test(2005.35, DT[1:2,c:=c(Inf,0.78)]$c, as.raw(INT(0,0,3)),
warning="[iI].*double.*position 1 either truncated.*or taken as 0 when assigning.*raw.*column 3 named 'c'") # 'inf' for me but might vary hence [iI]
test(2005.36, DT[1:2,d:=as.raw(c(0,255))]$d, c(FALSE,TRUE,TRUE),
warning="255.*raw.*position 2 taken as TRUE when assigning.*logical.*column 4 named 'd'")
test(2005.37, DT[2:3,b:=as.raw(c(0,255))]$b, INT(4,0,255))
test(2005.38, DT[1:2,e:=as.raw(c(0,255))]$e, c(0,255,pi*3))
test(2005.39, DT[c(1,3,2), c:=as.raw(c(0,100,255)), verbose=TRUE]$c, as.raw(c(0,255,100)),
notOutput="coerce")
test(2005.40, DT[c(3,1), f:=as.raw(c(20,42))]$f, c(42+0i, 11, 20+0i))
test(2005.41, DT[2:3, f:=c(NA,FALSE)]$f, c(42+0i, NA, 0+0i))
test(2005.42, DT[c(1,3), f:=c(-42L,NA)]$f, c(-42+0i, NA, NA))
test(2005.43, DT[3:2, f:=c(pi,-Inf)]$f, c(-42+0i, -Inf+0i, pi+0i))
if (test_bit64) {
DT[,g:=as.integer64(c(-9,0,NA))]
test(2005.60, set(DT, 1L, "g", expression(x+2)), error="type 'expression' cannot be coerced to 'integer64'")
test(2005.61, DT[1:2,e:=as.integer64(c(NA,-200))]$e, c(NA_real_, -200, pi*3))
test(2005.62, DT[2:3, d:=as.integer64(c(2,NA))]$d, c(FALSE,TRUE,NA),
warning="2.*integer64.*position 1 taken as TRUE when assigning.*logical.*column 4 named 'd'")
DT[,b:=4:6]
test(2005.63, DT[2:3, b:=as.integer64(c("2147483647","2147483648"))]$b, INT(4,2147483647,NA),
warning="2147483648.*integer64.*position 2 out-of-range [(]NA[)] when assigning.*integer.*column 2 named 'b'")
test(2005.64, DT[2:3, b:=as.integer64(c("-2147483648","-2147483647"))]$b, INT(4,NA,-2147483647),
warning="-2147483648.*integer64.*position 1 out-of-range [(]NA[)].*integer.*column 2 named 'b'")
test(2005.65, DT[c(2,1,3), c:=as.integer64(c(-1,255,256))]$c, as.raw(c(255,0,0)),
warning="-1.*integer64.*position 1 taken as 0 when assigning.*raw.*column 3 named 'c'")
test(2005.66, DT[2:3, f:=as.integer64(c(NA,"2147483648"))]$f, as.complex(c(-42,NA,2147483648)))
DT[,h:=LETTERS[1:3]]
test(2005.67, DT[2:3, h:=as.integer64(1:2)], error="To assign integer64 to a character column, please use as.character.")
}
# rbindlist raw type, #2819
test(2006.1, rbindlist(list(data.table(x = as.raw(1), y=as.raw(3)), data.table(x = as.raw(2))), fill=TRUE), data.table(x=as.raw(1:2), y=as.raw(c(3,0))))
test(2006.2, rbindlist(list(data.table(x = as.raw(1:2), y=as.raw(5:6)), data.table(x = as.raw(3:5))), fill=TRUE), data.table(x=as.raw(1:5), y=as.raw(c(5:6,0,0,0))))
# rbindlist integer64, #1349
if (test_bit64) {
test(2007.1, rbindlist(list( list(a=as.integer64(1), b=3L), list(a=2L, b=4L) )), data.table(a=as.integer64(1:2), b=3:4))
test(2007.2, rbindlist(list( list(a=3.4, b=5L), list(a=as.integer64(4), b=6L) )), data.table(a=as.integer64(3:4), b=5:6),
warning="Column 1 of item 1: 3.4.*double.*position 1 truncated.*precision lost.*when assigning.*integer64.*column 1 named 'a'")
test(2007.3, rbindlist(list( list(a=3.0, b=5L), list(a=as.integer64(4), b=6L) )), data.table(a=as.integer64(3:4), b=5:6))
test(2007.4, rbindlist(list( list(b=5:6), list(a=as.integer64(4), b=7L)), fill=TRUE), data.table(b=5:7, a=as.integer64(c(NA,NA,4)))) # tests writeNA of integer64
test(2007.5, rbindlist(list( list(a=INT(1,NA,-2)), list(a=as.integer64(c(3,NA))) )), data.table(a=as.integer64(c(1,NA,-2,3,NA)))) # int NAs combined with int64 NA
test(2007.6, rbind(data.table(a=as.raw(10), b=5L), data.table(a=as.integer64(11), b=6L)), data.table(a=as.integer64(10:11), b=5:6))
}
# reworked ordered-factor handling in PR#3455, expanded from test for #3032
DT1 = data.table(x = ordered(vals<-c("b","b","e","f","c","c"), levels=c("f","b","e","c")))
DT2 = data.table(x = ordered(vals, levels=c("f","e","b","c")))
DT3 = data.table(x = ordered(vals, levels=c("f","b","e","c","d")))
DT4 = data.table(x = ordered(vals, levels=c("f","b","e","c","a","p")))
test(2008.1, DT1$x[3] < DT1$x[5]) # e<c; just to remind what ordered factors are
test(2008.2, factor(DT1$x, ordered=FALSE)[3] < factor(DT1$x, ordered=FALSE)[5], NA, warning="<.*not meaningful for factors") # base R's nice warning
test(2008.3, rbind(DT1, DT4), data.table(x=ordered(c(vals,vals), levels=c("f","b","e","c","a","p"))))
test(2008.4, rbind(DT1, DT2), data.table(x=factor(c(vals,vals), levels=c("f","b","e","c"))),
warning="Column 1 of item 2 is an ordered factor with 'e'<'b' in its levels. But 'b'<'e' in the ordered levels from column 1 of item 1.*regular factor")
test(2008.5, rbind(DT3, DT4), data.table(x=factor(c(vals,vals), levels=c("f","b","e","c","a","p","d"))),
warning="Column 1 of item 1.*level 5 [[]'d'[]] is missing.*column 1 of item 2. Each set.*should be an ordered subset of the first longest.*regular factor")
test(2008.6, rbindlist(list(DT1, DT2, DT3, DT4)), data.table(x=factor(rep(vals,4), levels=c("f","b","e","c","a","p","d"))),
warning="'e'<'b'.*But 'b'<'e'")
test(2008.7, rbindlist(list(DT1, list(c("e","b")), DT1)), data.table(x=ordered(c(vals,"e","b",vals), levels=c("f","b","e","c"))))
test(2008.8, rbindlist(list(DT1, list(c("e","foo")), DT1)), data.table(x=ordered(c(vals,"e","foo",vals), levels=c("f","b","e","c","foo"))))
# segfault comparing NULL column, #2303 #2305
DT = structure(list(NULL), names="a", class=c("data.table","data.frame"))
test(2009.1, DT[a>1], error="Column 1 is NULL; malformed data.table")
DT = null.data.table()
x = NULL
test(2009.2, DT[, .(x)], null.data.table()) # because .(x) evaluated to .(NULL); NULL columns in results removed
DT = data.table(A=1:3)
test(2009.3, DT[, .(x)], null.data.table())
test(2009.4, DT[, .(x, sum(A))], data.table(V1=6L))
test(2009.5, DT[, .(sum(A), x)], data.table(V1=6L))
test(2009.6, data.table(character(0), NULL), data.table(V1=character()))
test(2009.7, as.data.table(list(y = character(0), x = NULL)), data.table(y=character()))
# use.names="check" message|warning for out-of-order; https://github.com/Rdatatable/data.table/pull/3455#issuecomment-472744347
DT1 = data.table(a=1:2, b=5:6)
DT2 = data.table(b=7:8, a=3:4)
test(2010.01, rbindlist(list(DT1,DT2)), ans<-data.table(a=c(1:2,7:8), b=c(5:6,3:4)),
message="Column 2 [[]'a'[]] of item 2 appears in position 1 in item 1.*use.names=TRUE.*or use.names=FALSE.*v1.12.2")
test(2010.02, rbindlist(list(DT1,DT2), use.names=FALSE), ans)
test(2010.03, rbindlist(list(DT1,DT2), use.names=TRUE), data.table(a=1:4, b=5:8))
test(2010.04, rbindlist(list(DT1,DT2), use.names=NA), error="use.names=NA invalid")
test(2010.05, rbindlist(list(DT1,DT2), use.names='check'),
error="use.names='check' cannot be used explicitly because the value 'check' is new in v1.12.2 and subject to change. It is just meant to convey default behavior.")
options(datatable.rbindlist.check="warning")
test(2010.06, rbindlist(list(DT1,DT2)), ans, warning="item 2 appears in position 1.*See news item 5 in v1.12.2 for options to control this")
options(datatable.rbindlist.check="error")
test(2010.07, rbindlist(list(DT1,DT2)), error="item 2 appears in position 1.*See news item 5 in v1.12.2 for options to control this")
test(2010.08, rbindlist(list(DT1,data.table(foo=7:8, a=3:4))), error="Column 1 ['foo'] of item 2 is missing in item 1")
test(2010.09, rbindlist(list(DT1,data.table(V1=7:8, b=3:4))), error="Column 1 ['V1']") # automatic column names included in message (turned to error here) from 1.12.4; see news items
options(datatable.rbindlist.check="message")
test(2010.10, rbindlist(list(DT1,DT2)), ans, message="item 2 appears in position 1.*See news item 5 in v1.12.2 for options to control this")
options(datatable.rbindlist.check="none")
test(2010.11, rbindlist(list(DT1,DT2)), ans)
options(datatable.rbindlist.check="non")
test(2010.12, rbindlist(list(DT1,DT2)), ans, warning="options()$datatable.rbindlist.check=='non' which is not 'print'|'warning'|'error'|'none'. See news item 5 in v1.12.2")
options(datatable.rbindlist.check=2)
test(2010.13, rbindlist(list(DT1,DT2)), ans, warning="options()$datatable.rbindlist.check is set but is not a single string. See news item 5 in v1.12.2.",
message="item 2 appears in position 1.*See news item 5 in v1.12.2 for options to control this")
options(datatable.rbindlist.check=NULL) # this option is set to NULL at the top of this file too where the previous user-value is remembered and restored at the end of this file
# non-ASCII strings are correctly sorted on Windows, #3397
if (.Platform$OS.type == 'windows') local({
lc_collate <- Sys.getlocale(c('LC_COLLATE'))
lc_ctype <- Sys.getlocale(c('LC_CTYPE'))
Sys.setlocale('LC_COLLATE', "Chinese (Simplified)_China.936")
Sys.setlocale('LC_CTYPE', "Chinese (Simplified)_China.936")
on.exit({
Sys.setlocale('LC_COLLATE', lc_collate)
Sys.setlocale('LC_CTYPE', lc_ctype)
}, add = TRUE)
x1 <- '\u501f:Cash|\u501f:\u635f\u76ca\u7c7b-\u4ea4\u6613\u8d39\u7528|\u501f:\u635f\u76ca\u7c7b-\u4ef7\u5dee\u6536\u5165|\u501f:\u635f\u76ca\u7c7b-\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca|\u8d37:\u8d44\u4ea7\u7c7b-\u516c\u5141\u4ef7\u503c\u53d8\u52a8|\u8d37:\u8d44\u4ea7\u7c7b-\u6210\u672c'
x2 <- '\u501f:Cash|\u501f:\u635f\u76ca\u7c7b-\u4ea4\u6613\u8d39\u7528|\u501f:\u635f\u76ca\u7c7b-\u4ef7\u5dee\u6536\u5165|\u501f:\u635f\u76ca\u7c7b-\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca|\u8d37:\u8d44\u4ea7\u7c7b-\u516c\u5141\u4ef7\u503c\u53d8\u52a8|\u8d37:\u8d44\u4ea7\u7c7b-\u5e94\u8ba1\u5229\u606f|\u8d37:\u8d44\u4ea7\u7c7b-\u6210\u672c'
x <- enc2native(c(x2, x1))
dt1 <- data.table(a = x, b = 1, key = 'a')
dt2 <- data.table(a = enc2utf8(x), b = 1, key = 'a')
test(2011, dt1$a, dt2$a)
Sys.setlocale('LC_COLLATE', lc_collate) # restore for further tests in this file
Sys.setlocale('LC_CTYPE', lc_ctype)
})
# cbind of zero-row data.table with empty data.table messed up columns, #3445
test(2012.1, data.table(data.table(), data.table(a=integer())), data.table(a=integer()))
test(2012.2, data.table(data.frame(), data.table(a=integer())), data.table(a=integer()))
test(2012.3, data.table(data.frame(), data.frame(a=integer())), data.table(a=integer()))
dt = as.data.table(iris)
test(2012.4, cbind(data.table(), dt[0]), dt[0])
# extra validity checks in subsetDT on data.table; aside in #3369
DT = structure(list(a=1:3, b=NULL, c=4:6), class=c("data.table","data.frame"))
test(2013.1, DT[2], error="Column 2 is NULL; malformed data.table")
DT = structure(list(a=1:3, b=data.frame(foo=10:12,bar=13:15), c=4:6), class=c("data.table","data.frame"))
test(2013.2, DT[2], error="Column 2 ['b'] is a data.frame or data.table; malformed data.table.")
DT = structure(list(a=1:3, b=1:4, c=4:6), class=c("data.table","data.frame"))
test(2013.3, DT[2], error="Column 2 ['b'] is length 4 but column 1 is length 3; malformed data.table.")
## new fread keepLeadingZeros parameter in v1.12.2
# leading zeros in both integer and float numbers are converted to character when keepLeadingZeros=TRUE
test_data_single <- "0, 00, 01, 00010, 002.01\n"
test(2014.1, fread(test_data_single), data.table(0L, 0L, 1L, 10L, 2.01))
test(2014.2, fread(test_data_single, keepLeadingZeros = FALSE), data.table(0L, 0L, 1L, 10L, 2.01))
test(2014.3, fread(test_data_single, keepLeadingZeros = TRUE), data.table(0L, "00","01","00010","002.01"))
# converts whole column to character when keepLeadingZeros = TRUE and at least 1 value contains a leading zero
test_data_mult <- paste0(c(sample(1:100),"0010",sample(1:100)), collapse="\n")
test(2014.4, class(fread(test_data_mult, keepLeadingZeros = TRUE)[[1]]), "character")
test(2014.5, class(fread(test_data_mult, keepLeadingZeros = FALSE)[[1]]), "integer")
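# Illustration only (not a counted test); a minimal sketch of the behaviour verified above: one value
# with a leading zero is enough for the whole column to be read as character when keepLeadingZeros=TRUE.
stopifnot(identical(fread("x\n007\n12\n", keepLeadingZeros=TRUE)$x, c("007","12")))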
# rbindlist should drop NA from levels of source factors, relied on by package emil
test(2015.1, levels(rbindlist( list( data.frame(a=factor("a",levels=c("a",NA),exclude=NULL)) ))$a), "a") # the NA level (unused in this case) should not be retained
# follow-up from #3915; since this was a malformed factor (so not relied on), let's just drop the used NA level too in v1.12.4 for these regular (not ordered) factors
DT = data.table(V1 = factor(as.character(c(NA, 1:3, NA)), exclude = NULL))
test(2015.2, list(levels(DT$V1), as.integer(DT$V1)), list(as.character(c(1:3,NA)), INT(4,1,2,3,4))) # the 4's are now moved to NA_integer_ by rbindlist
test(2015.3, unclass(rbindlist(list(DT), use.names=FALSE)$V1), setattr(INT(NA,1,2,3,NA), "levels", as.character(1:3)))
DT = data.table(V1 = factor(as.character(c(NA, 1:100, NA)), exclude = NULL))
test(2015.4, print(DT), output="V1.*1:[ ]+<NA>.*2:[ ]+1.*101:[ ]+100.*102:[ ]+<NA>")
DT = data.table(V1 = factor(as.character(c(NA, 1:3, NA)), exclude = NULL))
test(2015.5, print(DT), output="V1.*1:[ ]+<NA>.*2:[ ]+1.*4:[ ]+3.*5:[ ]+<NA>")
# better save->load->set(<new column>) message, #2996
DT = data.table(a=1:3)
save(list="DT", file=tt<-tempfile())
rm(DT)
name = load(tt)
test(2016.1, name, "DT")
test(2016.2, DT, data.table(a=1:3))
test(2016.3, DT[2,a:=4L], data.table(a=INT(1,4,3))) # no error for := when existing column
test(2016.4, set(DT,3L,1L,5L), data.table(a=INT(1,4,5))) # no error for set() when existing column
test(2016.5, set(DT,2L,"newCol",5L), error="either been loaded from disk.*or constructed manually.*Please run setDT.*setalloccol.*on it first") # just set()
test(2016.6, DT[2,newCol:=6L], data.table(a=INT(1,4,5), newCol=INT(NA,6L,NA))) # := ok (it changes DT in caller)
unlink(tt)
# first(.SD) used to throw an error telling the user to use head(.SD, n) instead, even though the latter works; both are optimized now, #2030
DT = data.table(id = c(1L,1L,2L), v = 1:3)
test(2017.1, DT[, first(.SD), by=id, .SDcols="v", verbose=TRUE], data.table(id=1:2, v=c(1L,3L)), output="optimized j to 'list(gfirst(v))'")
test(2017.2, DT[, first(v), by=id, verbose=TRUE], data.table(id=1:2, V1=c(1L,3L)), output="optimized j to 'gfirst(v)'")
test(2017.3, DT[, last(v), by=id, verbose=TRUE], data.table(id=1:2, V1=c(2L,3L)), output="optimized j to 'glast(v)'")
test(2017.4, DT[, v[1L], by=id, verbose=TRUE], data.table(id=1:2, V1=c(1L,3L)), output="optimized j to '`g[`(v, 1L)'")
DT = data.table(id = c(1L,1L,2L), v = 1:3, y = 3:1, z = c(TRUE, TRUE, FALSE), u = c("a","b","c"), l=list(list(1L), list(2L), list(3L)))
test(2017.5, DT[, first(.SD), by=id, .SDcols=c("v","y","z","u","l"), verbose=TRUE],
data.table(id=1:2, v=c(1L,3L), y=c(3L,1L), z=c(TRUE,FALSE), u=c("a","c"), l=list(list(1L), list(3L))),
output="optimized j to 'list(gfirst(v), gfirst(y), gfirst(z), gfirst(u), gfirst(l))'")
test(2017.6, DT[, last(.SD), by=id, .SDcols=c("v","y","z","u","l"), verbose=TRUE],
data.table(id=1:2, v=c(2L,3L), y=c(2L,1L), z=c(TRUE,FALSE), u=c("b","c"), l=list(list(2L), list(3L))),
output="optimized j to 'list(glast(v), glast(y), glast(z), glast(u), glast(l))'")
test(2017.7, DT[, .SD[1L], by=id, .SDcols=c("v","y","z","u","l"), verbose=TRUE],
data.table(id=1:2, v=c(1L,3L), y=c(3L,1L), z=c(TRUE,FALSE), u=c("a","c"), l=list(list(1L), list(3L))),
output="optimized j to 'list(`g[`(v, 1L), `g[`(y, 1L), `g[`(z, 1L), `g[`(u, 1L), `g[`(l, 1L))'")
# ghead argument "n" is missing, with no default #3462
DT = data.table(a=c(rep(1L, 7L), rep(2L, 5L)), b=1:12, d=12:1)
test(2018.1, DT[, head(.SD), a, verbose=TRUE],
data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12), d=c(12:7, 5:1)),
output=c("lapply optimization changed j from 'head(.SD)' to 'list(head(b, n = 6L), head(d, n = 6L))'",
"GForce is on, left j unchanged"))
test(2018.2, DT[, head(b), a, verbose=TRUE],
data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(1:6, 8:12)),
output=c("lapply optimization is on, j unchanged as 'head(b)'",
"GForce is on, left j unchanged"))
test(2018.3, DT[, tail(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(2:7, 8:12), d=c(11:6, 5:1)))
test(2018.4, DT[, tail(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(2:7, 8:12)))
# gforce tests coverage
if (test_bit64) {
DT = data.table(id=c(rep(1L,3), rep(2L, 3)), v=bit64::as.integer64(c(1:3, 4L, 5:6)))
test(2019, DT[2:6, sum(v), id], data.table(id=1:2, V1=bit64::as.integer64(c(5L,15L)))) # gather, case of int64 and irows
}
DT = data.table(id = c(1L,1L,2L), v = as.raw(0:2))
test(2020.01, DT[, min(v), by=id], error="'raw' not supported by GForce min")
test(2020.02, DT[, max(v), by=id], error="'raw' not supported by GForce max")
test(2020.03, DT[, median(v), by=id], error="'raw' not supported by GForce median")
test(2020.04, DT[, head(v, 1), by=id], error="'raw' not supported by GForce head")
test(2020.05, DT[, tail(v, 1), by=id], error="'raw' not supported by GForce tail")
test(2020.06, DT[, v[1], by=id], error="'raw' not supported by GForce subset")
test(2020.07, DT[, sd(v), by=id], error="'raw' not supported by GForce sd")
test(2020.08, DT[, var(v), by=id], error="'raw' not supported by GForce var")
test(2020.09, DT[, prod(v), by=id], error="'raw' not supported by GForce prod")
DT = data.table(id = c(1L,1L,2L,2L), v = c(1L, 2L, NA, NA))
test(2020.10, DT[, median(v), id], data.table(id=1:2, V1=c(1.5, NA))) # median whole group has NAs
# setorder can keep key if the order does not change, #3456
DT = data.table(
a = rep(4:1, 1:4),
b = 1:10,
v = c(2L, 4L, 10L, 5L, 9L, 6L, 7L, 8L, 3L, 1L)
)
setkey(DT, a)
setorder(DT, a, -v)
test(2021.1, key(DT), 'a')
setorder(DT, -a, v)
test(2021.2, key(DT), NULL)
setkey(DT, a, b)
setorder(DT, b)
test(2021.3, key(DT), NULL)
# assign to list column works now when RHS is not list, #950
d = data.table(id=c("a","b"), f=list(function(x) x*2, function(x) x^2), key="id")
test(2022.1, d[.("a"), f:=function(x)x^3], data.table(id=c("a","b"), f=list(function(x) x^3, function(x) x^2), key="id"))
test(2022.2, d[.("a"), f:=list(function(x) x^4)], data.table(id=c("a","b"), f=list(function(x) x^4, function(x) x^2), key="id"))
test(2022.3, d[2, f:=6:8], data.table(id=c("a","b"), f=list(function(x) x^4, 6:8), key="id"))
test(2022.4, d[.("b"), f:=list(list(function(x) x^3))], data.table(id=c("a","b"), f=list(function(x) x^4, function(x) x^3), key="id"))
# keyby= used the wrong index when "CLASS" is a leading substring of "CLASS_L3" and an index exists on CLASS_L3, #3498
DT = data.table(
CLASS_L3 = c("gggg", "iiii", "bbbb", "bbbb", "gggg", "ffff", "bbbb", "Repo", "bbbb", "dddd", "hhhh", "dddd", "gggg", "dddd"),
CLASS = c("aaaa", "dddd", "gggg", "gggg", "aaaa", "eeee", "eeee", "ffff", "gggg", "aaaa", "aaaa", "aaaa", "aaaa", "aaaa"))
test(2023.1, indices(DT), NULL)
test(2023.2, DT[, .N, keyby = CLASS], ans<-data.table(CLASS=c("aaaa","dddd","eeee","ffff","gggg"), N=INT(7,1,2,1,3), key="CLASS"))
test(2023.3, indices(DT), NULL)
setindex(DT, CLASS_L3)
test(2023.4, indices(DT), "CLASS_L3")
test(2023.5, DT[, .N, keyby = CLASS], ans) # just this test failed in v1.12.2 and before due to using the CLASS_L3 index incorrectly
test(2023.6, DT[, .N, by = CLASS], data.table(CLASS=c("aaaa","dddd","gggg","eeee","ffff"), N=INT(7,1,3,2,1)))
# more verbose timings #1265
DT = data.table(x=c("a","b","c","b","a","c"), y=c(1,3,6,1,6,3), v=1:6)
setindex(DT, y)
test(2024, DT[y==6, v:=10L, verbose=TRUE], output="Constructing irows for.*")
# fread embedded '\0', #3400
test(2025.01, fread(testDir("issue_3400_fread.txt"), skip=1, header=TRUE), data.table(A=INT(1,3,4), B=INT(2,2,5), C=INT(3,1,6)))
f = tempfile()
for (nNUL in 0:3) {
writeBin(c(charToRaw("a=b\nA B C\n1 3 5\n"), rep(as.raw(0), nNUL), charToRaw("2 4 6\n")), con=f)
test_no = 2025 + (1+nNUL)/10
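  # => for nNUL = 0..3 the eight fread tests below are numbered 2025.11-2025.18, 2025.21-2025.28, 2025.31-2025.38 and 2025.41-2025.48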
test(test_no + .01, fread(f, skip=1, header=TRUE), ans<-data.table(A=1:2, B=3:4, C=5:6))
test(test_no + .02, fread(f), ans) # auto detect skip and header works too
writeBin(c(charToRaw("a=b\nA,B,C\n1,3,5\n"), rep(as.raw(0), nNUL), charToRaw("2,4,6\n")), con=f)
test(test_no + .03, fread(f, skip=1, header=TRUE), ans)
test(test_no + .04, fread(f), ans)
writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A B C\n1 3 5\n2 4 6\n")), con=f)
test(test_no + .05, fread(f, skip=1, header=TRUE), ans)
test(test_no + .06, fread(f), ans)
writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A,B,C\n1,3,5\n2,4,6\n")), con=f)
test(test_no + .07, fread(f, skip=1, header=TRUE), ans)
test(test_no + .08, fread(f), ans)
}
makeNul = function(str){ tt=charToRaw(str); tt[tt==42L]=as.raw(0); writeBin(tt, con=f)} # "*" (42) represents NUL
makeNul("A,B,C\n1,foo,5\n2,*bar**,6\n")
test(2025.51, fread(f), data.table(A=1:2, B=c("foo","bar"), C=5:6))
makeNul('A,B,C\n1,foo*bar,3\n2,**"**b*az*",4\n')
test(2025.52, fread(f), data.table(A=1:2, B=c("foobar","baz"), C=3:4))
# printing timezone, #2842
DT = data.table(t1 = as.POSIXct("1982-04-26 13:34:56", tz = "Europe/Madrid"),t2 = as.POSIXct("2019-01-01 19:00:01",tz = "UTC"))
test(2026.1, capture.output(print(DT))[2], "1: 1982-04-26 13:34:56 2019-01-01 19:00:01")
test(2026.2, capture.output(print(DT,timezone = TRUE))[2], "1: 1982-04-26 13:34:56 Europe/Madrid 2019-01-01 19:00:01 UTC")
DT = data.table(v1 = c(1,as.numeric(NA)))
DT[2,t:= as.POSIXct("2019-01-01 19:00:01",tz = "UTC")]
test(2026.3, capture.output(print(DT)), c(" v1 t","1: 1 <NA>", "2: NA 2019-01-01 19:00:01"))
test(2026.4, capture.output(print(DT, timezone = TRUE)), c(" v1 t","1: 1 <NA>","2: NA 2019-01-01 19:00:01 UTC"))
# empty item in j=list(x, ) errors gracefully, #3507
DT = data.table(a = 1:5)
test(2027.1, DT[, list(1, ), by=a], error = 'Item 2 of the .() or list() passed to j is missing')
test(2027.2, DT[, list(1,2,)], error = 'Item 3 of the .() or list() passed to j is missing')
test(2027.3, DT[, .(1,,3,), by=a], error = 'Item 2 of the .() or list() passed to j is missing')
# fread quote="" when the last line is too short and gets filled by fill=TRUE (file provided via email); crashed in v1.12.2 and before
if (test_R.utils) {
test(2028.1, fread(testDir("noquote.csv.gz"), fill=TRUE, quote="")[c(1,.N), c(1,2,9,10)],
data.table(H=c("D","T"), "Locate Reply"=c("BCS","Locate Reply"), V9=c("A",""), V10=c("4/23/2019 7:11:11 AM","")))
test(2028.2, fread(testDir("noquote.csv.gz"), fill=TRUE, quote="", header=FALSE)[c(1,.N), c(1,2,3,8,9,10)],
data.table(V1=c("H","T"), V2="Locate Reply", V3=c("GORLTR","2093"), V8=INT(NA,NA), V9="", V10=""))
}
txt = "A,B,C\n1,4,7\n2,5,8\n3,6\n"
test(2029.1, fread(txt), data.table(A=1:2, B=4:5, C=7:8), warning="Discarded single-line footer: <<3,6>>")
test(2029.2, fread(txt, quote=""), data.table(A=1:2, B=4:5, C=7:8), warning="Discarded single-line footer: <<3,6>>")
test(2029.3, fread(txt, quote="", fill=TRUE), data.table(A=1:3, B=4:6, C=c(7:8,NA)))
# .Last.updated #1885
d = data.table(a=1:4, b=2:5)
d[, z:=5L]
test(2030.01, .Last.updated, 4L) # new column
d[, z:=6L]
test(2030.02, .Last.updated, 4L) # update existing column
d[2:3, z:=7L]
test(2030.03, .Last.updated, 2L) # sub assign
d[integer(), z:=8L]
test(2030.04, .Last.updated, 0L) # empty sub-assign
d[-1L, z:=9L]
test(2030.05, .Last.updated, 3L) # inverse sub-assign
d[-(1:4), z:=10L]
test(2030.06, .Last.updated, 0L) # inverse empty sub-assign
d[, z:=NULL]
test(2030.07, .Last.updated, 4L) # delete column
d[2:3, z:=11L]
test(2030.08, .Last.updated, 2L) # new column during sub-assign
d[, z:=NULL]
d[integer(), z:=12L]
test(2030.09, .Last.updated, 0L) # new columns from empty sub-assign
d[, z:=NULL]
d[-(1:4), z:=13L]
test(2030.10, .Last.updated, 0L) # new columns from empty inverse sub-assign
d[, z:=NULL][, z:=14L]
test(2030.11, .Last.updated, 4L) # new column from chaining
d[, z:=NULL][2:3, z:=14L]
test(2030.12, .Last.updated, 2L) # sub-assign from chaining
d[2:3, z:=14L][, z:=NULL]
test(2030.13, .Last.updated, 4L) # delete column from chaining
set(d, 1:2, "z", 15L)
test(2030.14, .Last.updated, 2L) # set() updates .Last.updated too
g = data.table(a=1:4, z=15L) # join
d[g, on="a", z:=i.z]
test(2030.15, .Last.updated, 4L) # all match of all rows
g = data.table(a=2:4, z=16L) # join
d[, z:=NULL][g, on="a", z:=i.z]
test(2030.16, .Last.updated, 3L) # all match
g = data.table(a=c(2L,4L,6L), z=17L)
d[, z:=NULL][g, on="a", z:=i.z]
test(2030.17, .Last.updated, 2L) # partial match
g = data.table(a=5:6, z=18L)
d[, z:=NULL][g, on="a", z:=i.z]
test(2030.18, .Last.updated, 0L) # zero match
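# Not a test: .Last.updated holds the number of rows touched by the most recent := or set(),
# so callers can inspect it right after an update; e.g. (illustrative only)
#   d[a > 2L, b := 0L]; if (.Last.updated == 0L) warning("no rows matched")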
# rbind of a vector column with a list column; regression in dev 1.12.3, #3528
test(2031.01, rbind(data.table(A=1:3, B=7:9), data.table(A=4:6, B=as.list(10:12))), ans<-data.table(A=1:6, B=as.list(7:12)))
test(2031.02, rbind(data.table(A=1:3, B=as.list(7:9)), data.table(A=4:6, B=10:12)), ans)
if (test_yaml) { # csvy; #1701
f = testDir("csvy/test.csvy")
DT = data.table(var1 = c("A", "B"),
var2 = c(1L, 3L),
var3 = c(2.5, 4.3))
DT_yaml = copy(DT)
setattr(DT_yaml, 'yaml_metadata',
list(name = "my-dataset",
source = "https://github.com/leeper/csvy/tree/master/inst/examples",
schema = list(fields = list(
list(name = "var1", title = "variable 1", type = "string",
description = "explaining var1",
constraints = list(list(required = TRUE))),
list(name = "var2", title = "variable 2", type = "integer"),
list(name = "var3", title = "variable 3", type = "number")
))))
## with skip = '__auto__', fread can figure out
## how to start after the metadata (just ignoring it)
test(2032.01, fread(f), DT)
## should be the same, but with yaml_metadata attribute
test(2032.02, fread(f, yaml = TRUE), DT_yaml)
## testing verbose messaging
test(2032.03, fread(f, yaml = TRUE, verbose = TRUE),
DT_yaml, output = 'Processed.*YAML metadata.*')
## this file is identical, except the body of the
## YAML header is commented out with # (should read identically)
test(2032.04,
fread(testDir('csvy/test_comment.csvy'), yaml = TRUE),
DT_yaml)
## user input is taken as most intentional & overrides YAML
DT_yaml[ , var2 := as.numeric(var2)]
test(2032.05, fread(f, yaml = TRUE, colClasses = list(numeric = 'var2')),
DT_yaml, message = 'colClasses.*YAML header are in conflict.*var2')
## extraneous/unused fields shouldn't throw off reading
DT = fread(testDir('csvy/test_extraneous.csvy'), yaml = TRUE)
test(2032.06, names(DT), c('Date', 'WTI'))
test(2032.07, attr(DT, 'yaml_metadata'),
list(names = c("Date", "WTI"), class = "data.frame",
title = "Cushing, OK WTI Spot Price FOB", filename = "data.csv",
fileurl = "https://raw.githubusercontent.com/jrovegno/csvy/master/data.csv",
sourceurl = "http://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=PET&s=RWTC&f=D",
source_csvy = "https://github.com/leeper/csvy/tree/master/inst/examples",
item = "PET", sourcekey = "RWTC", freq = "Daily",
rate = "MID", type = "price", units = "Dollars per Barrel",
latestdate = "2015-08-31", releasedate = "2015-09-02",
nextreleasedate = "2015-09-10", source = "Thomson Reuters",
contactemail = "infoctr@eia.doe.gov", contactphone = "(202) 586-8800"))
## yaml can also handle sep, dec, quote, and na.strings
DT_out = data.table(var1 = c("A", "B"),
var2 = c(1L, NA),
var3 = c(2.5, 4.3))
meta =
list(name = NULL,
schema = list(fields = list(
list(name = "var1", title = "variable 1", type = "string",
description = "a single-quoted character variable"),
list(name = "var2", title = "variable 2", type = "integer"),
list(name = "var3", title = "variable 3", type = "number",
description = "European-style numeric")
)),
header = TRUE, sep = "|", dec = ",",
quote = "'", na.strings = "@")
attr(DT_out, 'yaml_metadata') = meta
test(2032.08, fread(testDir( 'csvy/test_attributes.csvy'), yaml = TRUE), DT_out)
## user-specified attributes can override data from YAML
meta$sep = "-"
setattr(DT_out, 'yaml_metadata', meta)
test(2032.09, fread(testDir('csvy/test_override_sep.csvy'), yaml = TRUE, sep = '|'), DT_out,
message = 'User-supplied.*sep.*override')
meta$sep = "|"
setattr(DT_out, 'yaml_metadata', meta)
test(2032.10, fread(testDir('csvy/test_override_header.csvy'), yaml = TRUE, header = FALSE),
DT_out, message = 'User-supplied.*header.*override')
col.names = c('x', 'y', 'z')
setnames(DT_out, col.names)
test(2032.11, fread(testDir('csvy/test_override_header.csvy'), yaml = TRUE, header = FALSE, col.names = col.names), DT_out,
message = c('User-supplied.*header.*override', 'User-supplied.*col.names.*override'))
test(2032.12, fread(testDir('csvy/test_attributes.csvy'), yaml = TRUE, col.names = col.names),
DT_out, message = 'User-supplied.*col.names')
setnames(DT_out, c('var1', 'var2', 'var3'))
meta$quote = "^"
setattr(DT_out, 'yaml_metadata', meta)
test(2032.13, fread(testDir('csvy/test_override_quote.csvy'), yaml = TRUE, quote = "'"),
DT_out, message = 'User-supplied.*quote')
meta$quote = "'"
meta$dec = "."
setattr(DT_out, 'yaml_metadata', meta)
test(2032.14, fread(testDir('csvy/test_override_dec.csvy'), yaml = TRUE, dec = ','),
DT_out, message = 'User-supplied.*dec')
meta$dec = ','
meta$na.strings = 'NA'
setattr(DT_out, 'yaml_metadata', meta)
test(2032.15, fread(testDir('csvy/test_override_na.csvy'), yaml = TRUE, na.strings = '@'),
DT_out, message = 'User-supplied.*na.strings')
## error if YAML malformed
test(2032.16, fread(testDir('csvy/test_incomplete_header.csvy'), yaml = TRUE),
error = 'Reached the end.*YAML.*valid csvy')
## use any other CSV in test directory which doesn't have YAML
if (test_R.utils) test(2032.17, fread(testDir('issue_2051.csv.gz'), yaml = TRUE),
error = 'Encountered.*unskipped.*constitute.*valid YAML')
## no problem if some fields are missing a type (just
## fall back to standard type inference, i.e., identical to
## the case of partially-specified colClasses)
DT = data.table(var1 = c("A", "B"), var2 = c(1L, 3L),
var3 = c(2.5, 4.3))
setattr(DT, 'yaml_metadata',
list(name = "my-dataset", source = "https://github.com/leeper/csvy/tree/master/inst/examples",
schema = list(fields = list(
list(name = "var1"), list(name = "var2", type = "integer"),
list(name = "var3", type = "number")
))))
test(2032.18, fread(testDir('csvy/test_missing_type.csvy'), yaml = TRUE), DT)
## skip applies starting after the YAML header
setattr(DT, 'yaml_metadata',
list(schema = list(fields = list(
list(name = "var1", type = "string"),
list(name = "var2", type = "integer"),
list(name = "var3", type = "number")
))))
test(2032.19, fread(testDir('csvy/test_skip.csvy'), yaml = TRUE, skip = 2L), DT)
## user-supplied col.names override metadata (as for colClasses)
cn = paste0('V', 1:3)
setnames(DT, cn)
test(2032.20, fread(testDir('csvy/test_skip.csvy'),
yaml = TRUE, skip = 2L, col.names = cn),
DT, message = 'User-supplied column names.*override.*YAML')
## invalid value fails
test(2032.21, fread(f, yaml = 'gobble'),
error = 'isTRUEorFALSE\\(yaml\\) is not TRUE')
## warning that skip-as-search doesn't work with yaml
DT_yaml[ , var2 := as.integer(var2)]
test(2032.22, fread(f, skip = 'var1,', yaml = TRUE),
DT_yaml, warning = 'Combining a search.*YAML.*')
# fwrite csvy: #3534
tmp = tempfile()
DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5])
# force eol for platform independence
fwrite(DT, tmp, yaml = TRUE, eol = '\n')
as_read = readLines(tmp)
test(2033.01, as_read[c(1L, 24L)], c('---', '---'))
test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L]))
test(2033.03, grepl('creation_time_utc', as_read[3L]))
test(2033.04, as_read[4:23],
c("schema:", " fields:", " - name: a", " type: integer",
" - name: b", " type: numeric", " - name: c", " type: character",
"header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''",
# NB: apparently \n is encoded like this in YAML
"eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double",
"logical01: no"))
tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e")
test(2033.05, as_read[25:30], tbl_body)
# windows eol
fwrite(DT, tmp, yaml = TRUE, eol = '\r\n')
test(2033.06, readLines(tmp)[18L], 'eol: "\\r\\n"')
# multi-class columns
DT[ , t := .POSIXct(1:5, tz = 'UTC')]
fwrite(DT, tmp, yaml = TRUE)
as_read = readLines(tmp)
test(2033.07, as_read[13L], " type: POSIXct")
# ~invertibility~
# fread side needs to be improved for Hugh's colClasses update
DT[ , t := NULL]
fwrite(DT, tmp, yaml = TRUE)
DT2 = fread(tmp, yaml = TRUE)
# remove metadata to compare
attr(DT2, 'yaml_metadata') = NULL
test(2033.08, all.equal(DT, DT2))
test(2033.09, fwrite(DT, append=TRUE, yaml=TRUE, verbose=TRUE),
output = paste0(c('Appending to existing file so setting bom=FALSE and yaml=FALSE', tbl_body[-1L]), collapse=".*"))
# TODO: test gzip'd yaml which is now supported
# yaml + bom arguments
DT = data.table(l=letters, n=1:26)
fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE)
fcon = file(f, encoding="UTF-8") # Windows readLines needs to be told; see also test 1658.50
lines = readLines(fcon)
lines = lines[lines!=""] # an extra "" after "eol: |2+" (line 16) on Linux but not Windows
# remove the blank here so we don't need to change this test if/when that changes in yaml package
test(2033.11, length(lines), 48L)
close(fcon)
test(2033.12, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d)))
# re-write should have same output (not appended)
fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE)
fcon = file(f, encoding="UTF-8")
lines = readLines(fcon)
lines = lines[lines!=""]
test(2033.13, length(lines), 48L)
close(fcon)
test(2033.14, fread(f), DT)
unlink(f)
}
# fcast coverage
DT = data.table(a = rep(1:2, each = 2), b = rep(1:2, 2), c = 4:1, d = 5:8)
test(2034.1,
dcast(DT, a ~ b, value.var = list('c', 'd'), fun.aggregate = list(sum)),
error = "When 'fun.aggregate' and 'value.var' are both lists")
# fread no quote coverage
test(2035.1, fread('A,B\n"foo","ba"r"', quote="''"), error='quote= must be a single character, blank "", or FALSE')
test(2035.2, fread('A,B\n"foo","ba"r"', quote=FALSE), ans<-data.table(A='"foo"', B='"ba"r"'))
test(2035.3, fread('A,B\n"foo","ba"r"', quote=""), ans)
# source() printing edge case; #2369
setup = c('DT = data.table(a = 1)')
writeLines(c(setup, 'DT[ , a := 1]'), tmp<-tempfile())
test(2036.1, !any(grepl("1: 1", capture.output(source(tmp, echo = TRUE)), fixed = TRUE)))
## test force-printing still works
writeLines(c(setup, 'DT[ , a := 1][]'), tmp)
test(2036.2, source(tmp, echo = TRUE), output = "1:\\s+1")
# more helpful guidance when assigning before setDT() after readRDS(); #1729
DT = data.table(a = 1:3)
saveRDS(DT, tmp<-tempfile())
rm(DT)
DT = readRDS(tmp)
foo = function(x) { x[ , b:=4:6, verbose=TRUE][] }
test(2037.1, foo(DT), output='Please remember to always setDT()')
# no assignment was made to DT
test(2037.2, names(DT), 'a')
# _selfrefok() verbose message was duplicated
test(2037.3, unname(table(unlist(strsplit(capture.output(foo(DT)), '\n|\\s+')))['ptr']), 1L)
# `between` invalid args, and verbose #3516
test(2038.01, between(1:5, 2, 4, incbounds=423), error="incbounds must be TRUE or FALSE")
test(2038.02, between(1:5, 2, 4, incbounds=NA), error="incbounds must be TRUE or FALSE")
old = options(datatable.verbose=TRUE)
test(2038.03, between(1:5, 2L, 4L), output="between parallel processing of integer took")
test(2038.04, between(1:5, rep(2L,5L), rep(4L, 5L)), output="between parallel processing of integer took")
test(2038.05, between(as.double(1:5), 2, 4, incbounds=FALSE), output="between parallel processing of double with open bounds took")
test(2038.06, between(as.double(1:5), 2, 4), output="between parallel processing of double with closed bounds took")
test(2038.07, between(as.double(1:5), rep(2, 5L), rep(4, 5L)), output="between parallel processing of double with closed bounds took")
test(2038.08, between(c("foo","bar","paz"), "bag", "fog"), output="between non-parallel processing of character took")
# `between` handle POSIXct type
x = as.POSIXct("2016-09-18 07:00:00") + 0:10*60*15
dn = as.POSIXct('2016-09-18 08:00:00')
up = as.POSIXct('2016-09-18 09:00:00')
test(2038.09, between(x, dn, up), output="between parallel processing of double with closed bounds took")
test(2038.10, between(x, dn, up, incbounds=FALSE), output="between parallel processing of double with open bounds took")
# also handling of string lower/upper bounds
x = as.POSIXct("2016-09-18 07:00:00") + 0:10*60*15 # local time zone
dn = '2016-09-18 08:00:00'
up = '2016-09-18 09:00:00'
test(2038.11, between(x, dn, up), ans<-as.logical(c(0,0,0,0,1,1,1,1,1,0,0)), output='between parallel processing of double with closed bounds')
attr(x, 'tzone') = 'UTC' # this conversion will result in different UTC times depending on which local time zone the test runs in
test(2038.12, between(x, x[5], x[9]), ans, output='between parallel processing of double with closed bounds')
# additional flexibility -- cast when one bound is already POSIXct
test(2038.13, between(x, as.character(x[5]), x[9]), ans, output='between parallel processing of double')
options(old)
# exceptions in char to POSIX coercion
dn = 'aa2016-09-18 08:00:00'
test(2038.14, between(x, dn, up), error="coercion to POSIX failed")
dn = '2016-09-18 08:00:00'
up = 'bb2016-09-18 09:00:00'
test(2038.15, between(x, dn, up), error="coercion to POSIX failed")
# exceptions due to timezone mismatch
x = as.POSIXct("2016-09-18 07:00:00", tz="UTC") + 0:10*60*15
dn = as.POSIXct('2016-09-18 08:00:00')
up = '2016-09-18 09:00:00'
test(2038.16, between(x, dn, up),
error="'between' lower= and upper= are both POSIXct but have different tzone attributes: ['', 'UTC']. Please align their time zones.")
up = as.POSIXct('2016-09-18 09:00:00')
if (attr(dn,"tzone",exact=TRUE)!="UTC") { # normally the case on laptops (local time zone) but may be UTC on CRAN test machine for example
test(2038.17, length(between(x, dn, up)), 11L, # true/false may depend on the local time zone so just test length
message="'between' arguments are all POSIXct but have mismatched tzone attributes: ['UTC', '', '']. The UTC times will be compared.")
}
# `between` supports `.()` on the RHS, #2315
X = data.table(a = 1:5, b = 6:10, c = c(5:1))
test(2038.20, X[c %between% list(a, b)], X[c %between% .(a, b)])
# between num to int coercion #3517
old = options("datatable.verbose"=TRUE)
test(2038.31, between(1:5, 2, 4), output="between parallel processing of integer")
test(2038.32, between(1:5, 2L, 4), output="between parallel processing of integer")
test(2038.33, between(1:5, 2, 4L), output="between parallel processing of integer")
# revdep regression #3565
test(2038.34, between(1:10, -Inf, Inf), rep(TRUE,10), output="between parallel processing of double with closed bounds")
test(2038.35, between(as.double(1:10), -Inf, Inf), rep(TRUE,10), output="between parallel processing of double with closed bounds")
options(old)
# between int64 support
if (test_bit64) {
as.i64 = bit64::as.integer64
test(2039.01, between(1:10, as.i64(3), as.i64(6)), error="x is not integer64 but.*Please align classes")
test(2039.02, between(1:10, 3, as.i64(6)), error="x is not integer64 but.*Please align classes")
test(2039.03, between(as.i64(1:3), "2", as.i64(4)), error="x is integer64 but lower and/or upper are not")
old = options("datatable.verbose"=TRUE)
x = as.i64(1:10)
ans36 = c(FALSE,FALSE,TRUE,TRUE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE)
ans36open = c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE)
test(2039.04, between(x, 3, 6), ans36, output="between parallel processing of integer64 took")
test(2039.05, between(x, 3, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took")
test(2039.06, between(x, 3L, 6), ans36, output="between parallel processing of integer64 took")
test(2039.07, between(x, 3L, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took")
test(2039.08, between(x, 3, 6L), ans36, output="between parallel processing of integer64 took")
test(2039.09, between(x, 3, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took")
test(2039.10, between(x, 3L, 6L), ans36, output="between parallel processing of integer64 took")
test(2039.11, between(x, 3L, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took")
test(2039.12, between(x, rep(3, 10L), rep(6, 10L)), ans36, output="between parallel processing of integer64 took")
test(2039.13, between(x, rep(3, 10L), rep(6, 10L), incbounds=FALSE), ans36open, output="between parallel processing of integer64 took")
maxint = 2147483647
test(2039.14, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 took")
test(2039.15, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took")
x[5] = NA
ans36[5] = NA
ans36open[5] = NA
test(2039.16, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 took")
test(2039.17, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took")
test(2039.18, between(x+maxint, NA, 6+maxint), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 took")
test(2039.19, between(x+maxint, 3+maxint, NA, incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 took")
test(2039.20, between(x+maxint, rep(NA, 10L), rep(6+maxint, 10L)), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 took")
test(2039.21, between(x+maxint, rep(3+maxint, 10L), rep(NA, 10L), incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 took")
options(old)
}
# zero-row table grouped by NULL, #3530
DT = data.table(x = c("a","b","a","b"), y = c(1,2,3,4))
test(2040.1, DT[0, .N, by = NULL], data.table(N=0L))
f = function(...) NULL
test(2040.2, DT[0, .N, by = f()], data.table(N=0L))
# gmedian retaining class; #3079
DT = data.table(date = as.Date(c("2018-01-01", "2018-01-03", "2018-01-08", "2018-01-10", "2018-01-25", "2018-01-30")),
g = rep(letters[1:2], each = 3))
DT[, time:=as.POSIXct(date)]
test(2041.1, DT[, median(date), by=g], data.table(g=c("a","b"), V1=as.Date(c("2018-01-03","2018-01-25"))))
test(2041.2, DT[, median(time), by=g], DT[c(2,5),.(g=g, V1=time)])
# 'invalid trim argument' with optimization level 1; #1876
test(2042.1, DT[ , as.character(mean(date)), by=g, verbose=TRUE ],
data.table(g=c("a","b"), V1=c("2018-01-04","2018-01-21")),
output=msg<-"GForce is on, left j unchanged.*Old mean optimization is on, left j unchanged")
# Since %b is e.g. "janv." in LC_TIME=fr_FR.UTF-8 locale, we need to
# have the target/y value in these tests depend on the locale as well, #3450.
Jan.2018 = format(strptime("2018-01-01", "%Y-%m-%d"), "%b-%Y")
test(2042.2, DT[ , format(mean(date),"%b-%Y")], Jan.2018)
test(2042.3, DT[ , format(mean(date),"%b-%Y"), by=g, verbose=TRUE ], # just this case generated the error
data.table(g=c("a","b"), V1=c(Jan.2018, Jan.2018)), output=msg)
# gforce wrongly applied to external variable; #875
DT = data.table(x=INT(1,1,1,2,2), y=1:5)
z = 1:5
options(datatable.optimize = Inf)
test(2043.1, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5)))
options(datatable.optimize = 1L)
test(2043.2, DT[, list(mean(z), mean(y)), by=x], ans)
options(datatable.optimize = 0L)
test(2043.3, DT[, list(mean(z), mean(y)), by=x], ans)
options(datatable.optimize = Inf)
test(2043.4, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5))))
z = 1:4
test(2043.5, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z
# test type coercion in joins, #2592
dt1 <- data.table(int = 1L:10L,
doubleInt = as.numeric(1:10), # integers stored as double
realDouble = seq(0.5, 5, by = 0.5), # fractions present
bool = c(rep(FALSE, 9), TRUE),
char = letters[1L:10L],
fact = factor(letters[1L:10L]),
raw = as.raw(1:5))
dt2 <- data.table(int = 1L:5L,
doubleInt = as.numeric(1:5),
realDouble = seq(0.5, 2.5, by = 0.5),
bool = TRUE,
char = letters[1L:5L],
fact = factor(letters[1L:5L]),
raw = as.raw(1:5))
if (test_bit64) {
dt1[, int64 := as.integer64(c(1:9, 3e10))]
dt2[, int64 := as.integer64(c(1:4, 3e9))]
}
## no coercion when types match
test(2044.01, nrow(dt1[dt2, on="bool==bool", verbose=TRUE]), nrow(dt2), output="No coercion needed")
test(2044.02, nrow(dt1[dt2, on="int==int", verbose=TRUE]), nrow(dt2), output="No coercion needed")
test(2044.03, nrow(dt1[dt2, on="doubleInt==doubleInt", verbose=TRUE]), nrow(dt2), output="No coercion needed")
test(2044.04, nrow(dt1[dt2, on="realDouble==realDouble", verbose=TRUE]), nrow(dt2), output="No coercion needed")
test(2044.05, nrow(dt1[dt2, on="doubleInt==realDouble", verbose=TRUE]), nrow(dt2), output="No coercion needed")
test(2044.06, nrow(dt1[dt2, on="realDouble==doubleInt", verbose=TRUE]), nrow(dt2), output="No coercion needed")
test(2044.07, nrow(dt1[dt2, on="char==char", verbose=TRUE]), nrow(dt2), output="No coercion needed")
test(2044.08, nrow(dt1[dt2, on="fact==fact", verbose=TRUE]), nrow(dt2), output="Matching i.fact factor levels to x.fact factor levels")
if (test_bit64) {
test(2044.09, nrow(dt1[dt2, on = "int64==int64", verbose=TRUE]), nrow(dt2), output="No coercion needed")
}
test(2044.10, dt1[dt2, on = "int==raw"], error = "i.raw is type raw which is not supported by data.table join")
test(2044.11, dt1[dt2, on = "raw==int"], error = "x.raw is type raw which is not supported by data.table join")
# incompatible types
test(2044.20, dt1[dt2, on="bool==int"], error="Incompatible join types: x.bool (logical) and i.int (integer)")
test(2044.21, dt1[dt2, on="bool==doubleInt"], error="Incompatible join types: x.bool (logical) and i.doubleInt (double)")
test(2044.22, dt1[dt2, on="bool==realDouble"], error="Incompatible join types: x.bool (logical) and i.realDouble (double)")
test(2044.23, dt1[dt2, on="bool==char"], error="Incompatible join types: x.bool (logical) and i.char (character)")
test(2044.24, dt1[dt2, on="bool==fact"], error="Incompatible join types: x.bool (logical) and i.fact (factor)")
test(2044.25, dt1[dt2, on="int==bool"], error="Incompatible join types: x.int (integer) and i.bool (logical)")
test(2044.26, dt1[dt2, on="int==char"], error="Incompatible join types: x.int (integer) and i.char (character)")
test(2044.27, dt1[dt2, on="int==fact"], error="Incompatible join types: x.int (integer) and i.fact (factor)")
test(2044.28, dt1[dt2, on="doubleInt==bool"], error="Incompatible join types: x.doubleInt (double) and i.bool (logical)")
test(2044.29, dt1[dt2, on="doubleInt==char"], error="Incompatible join types: x.doubleInt (double) and i.char (character)")
test(2044.30, dt1[dt2, on="doubleInt==fact"], error="Incompatible join types: x.doubleInt (double) and i.fact (factor)")
test(2044.31, dt1[dt2, on="realDouble==bool"], error="Incompatible join types: x.realDouble (double) and i.bool (logical)")
test(2044.32, dt1[dt2, on="realDouble==char"], error="Incompatible join types: x.realDouble (double) and i.char (character)")
test(2044.33, dt1[dt2, on="realDouble==fact"], error="Incompatible join types: x.realDouble (double) and i.fact (factor)")
test(2044.34, dt1[dt2, on="char==bool"], error="Incompatible join types: x.char (character) and i.bool (logical)")
test(2044.35, dt1[dt2, on="char==int"], error="Incompatible join types: x.char (character) and i.int (integer)")
test(2044.36, dt1[dt2, on="char==doubleInt"], error="Incompatible join types: x.char (character) and i.doubleInt (double)")
test(2044.37, dt1[dt2, on="char==realDouble"], error="Incompatible join types: x.char (character) and i.realDouble (double)")
test(2044.38, dt1[dt2, on="fact==bool"], error="Incompatible join types: x.fact (factor) and i.bool (logical)")
test(2044.39, dt1[dt2, on="fact==int"], error="Incompatible join types: x.fact (factor) and i.int (integer)")
test(2044.40, dt1[dt2, on="fact==doubleInt"], error="Incompatible join types: x.fact (factor) and i.doubleInt (double)")
test(2044.41, dt1[dt2, on="fact==realDouble"], error="Incompatible join types: x.fact (factor) and i.realDouble (double)")
if (test_bit64) {
test(2044.42, dt1[dt2, on = "bool==int64"], error="Incompatible join types: x.bool (logical) and i.int64 (integer64)")
test(2044.43, dt1[dt2, on = "char==int64"], error="Incompatible join types: x.char (character) and i.int64 (integer64)")
test(2044.44, dt1[dt2, on = "fact==int64"], error="Incompatible join types: x.fact (factor) and i.int64 (integer64)")
test(2044.45, dt1[dt2, on = "int64==bool"], error="Incompatible join types: x.int64 (integer64) and i.bool (logical)")
test(2044.46, dt1[dt2, on = "int64==char"], error="Incompatible join types: x.int64 (integer64) and i.char (character)")
test(2044.47, dt1[dt2, on = "int64==fact"], error="Incompatible join types: x.int64 (integer64) and i.fact (factor)")
}
# coercion for join
cols = c("x.bool","x.int","x.doubleInt","i.bool","i.int","i.doubleInt","i.char")
test(2044.60, dt1[dt2, ..cols, on="int==doubleInt", verbose=TRUE],
data.table(x.bool=FALSE, x.int=1:5, x.doubleInt=as.double(1:5), i.bool=TRUE, i.int=1:5, i.doubleInt=1:5, i.char=letters[1:5]),
output="Coercing double column i.doubleInt (which contains no fractions) to type integer to match type of x.int")
test(2044.61, dt1[dt2, ..cols, on="int==realDouble", verbose=TRUE], # this was wrong in v1.12.2 (the fractions were truncated and joined to next lowest int)
data.table(x.bool=c(NA,FALSE,NA,FALSE,NA), x.int=INT(NA,1,NA,2,NA), x.doubleInt=c(NA,1,NA,2,NA),
i.bool=TRUE, i.int=1:5, i.doubleInt=as.double(1:5), i.char=letters[1:5]),
output="Coercing integer column x.int to type double to match type of i.realDouble which contains fractions")
test(2044.62, dt1[dt2, ..cols, on="doubleInt==int", verbose=TRUE],
data.table(x.bool=FALSE, x.int=1:5, x.doubleInt=as.double(1:5), i.bool=TRUE, i.int=1:5, i.doubleInt=as.double(1:5), i.char=letters[1:5]),
output="Coercing integer column i.int to type double for join to match type of x.doubleInt")
test(2044.63, dt1[dt2, ..cols, on="realDouble==int", verbose=TRUE],
data.table(x.bool=c(rep(FALSE,4),TRUE), x.int=INT(2,4,6,8,10), x.doubleInt=c(2,4,6,8,10), i.bool=TRUE, i.int=1:5, i.doubleInt=as.double(1:5), i.char=letters[1:5]),
output="Coercing integer column i.int to type double for join to match type of x.realDouble")
cols = c("x.int","x.char","x.fact","i.int","i.char","i.char")
test(2044.64, dt1[dt2, ..cols, on="char==fact", verbose=TRUE],
ans<-data.table(x.int=1:5, x.char=letters[1:5], x.fact=factor(letters[1:5]), i.int=1:5, i.char=letters[1:5], i.char=letters[1:5]),
output="Coercing factor column i.fact to type character to match type of x.char")
test(2044.65, dt1[dt2, ..cols, on="fact==char", verbose=TRUE],
ans,
output="Matching character column i.char to factor levels in x.fact")
if (test_bit64) {
cols = c("x.int","x.doubleInt","x.realDouble","x.int64","i.int","i.doubleInt","i.realDouble","i.int64")
# int64 in i
test(2044.66, dt1[dt2, ..cols, on="int==int64", nomatch=0L, verbose=TRUE],
ans<-data.table(x.int=1:4, x.doubleInt=as.double(1:4), x.realDouble=c(0.5,1.0,1.5,2.0), x.int64=as.integer64(1:4),
i.int=1:4, i.doubleInt=as.double(1:4), i.realDouble=c(0.5,1.0,1.5,2.0), i.int64=as.integer64(1:4)),
output = "Coercing integer column x.int to type integer64 to match type of i.int64")
test(2044.67, dt1[dt2, ..cols, on="doubleInt==int64", nomatch=0L, verbose=TRUE],
ans,
output = "Coercing double column x.doubleInt (which contains no fractions) to type integer64 to match type of i.int64")
test(2044.68, dt1[dt2, ..cols, on="realDouble==int64", nomatch=0L, verbose=TRUE],
error="Incompatible join types: i.int64 is type integer64 but x.realDouble is type double and contains fractions")
# int64 in x
test(2044.69, dt1[dt2, ..cols, on="int64==int", nomatch=0L, verbose=TRUE],
ans<-data.table(x.int=1:5, x.doubleInt=as.double(1:5), x.realDouble=c(0.5,1.0,1.5,2.0,2.5), x.int64=as.integer64(1:5),
i.int=1:5, i.doubleInt=as.double(1:5), i.realDouble=c(0.5,1.0,1.5,2.0,2.5), i.int64=as.integer64(1:5)),
output = "Coercing integer column i.int to type integer64 to match type of x.int64")
test(2044.70, dt1[dt2, ..cols, on="int64==doubleInt", nomatch=0L, verbose=TRUE],
ans,
output = "Coercing double column i.doubleInt (which contains no fractions) to type integer64 to match type of x.int64")
test(2044.71, dt1[dt2, ..cols, on="int64==realDouble", nomatch=0L, verbose=TRUE],
error="Incompatible join types: x.int64 is type integer64 but i.realDouble is type double and contains fractions")
}
# coercion of all-NA
dt1 = data.table(a=1, b=NA_character_)
dt2 = data.table(a=2L, b=NA)
test(2044.80, dt1[dt2, on="a==b", verbose=TRUE], data.table(a=NA, b=NA_character_, i.a=2L),
output=msg<-"Coercing all-NA i.b (logical) to type double to match type of x.a")
test(2044.81, dt1[dt2, on="a==b", nomatch=0L, verbose=TRUE], data.table(a=logical(), b=character(), i.a=integer()),
output=msg)
test(2044.82, dt1[dt2, on="b==b", verbose=TRUE], data.table(a=1, b=NA, i.a=2L),
output=msg<-"Coercing all-NA i.b (logical) to type character to match type of x.b")
test(2044.83, dt1[dt2, on="b==b", nomatch=0L, verbose=TRUE], data.table(a=1, b=NA, i.a=2L),
output=msg)
test(2044.84, dt1[dt2, on="b==a", verbose=TRUE], data.table(a=NA_real_, b=2L, i.b=NA),
output=msg<-"Coercing all-NA x.b (character) to type integer to match type of i.a")
test(2044.85, dt1[dt2, on="b==a", nomatch=0L, verbose=TRUE], data.table(a=double(), b=integer(), i.b=logical()),
output=msg)
# natural join #629
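# on=.NATURAL joins on all columns whose names are common to x and i, as the verbose 'natural join using: [...]' output checked below shows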
d1 = data.table(id1=rep(1L,3), id2=2:4, v1=1:3)
d2 = data.table(id1=rep(1L,3), id2=3:5, v2=3:1)
ans = data.table(id1=rep(1L, 3), id2=3:5, v1=c(2:3,NA_integer_), v2=3:1)
test(2045.01, d1[d2], error="columns to join by must be specified")
test(2045.02, d1[d2, on=.NATURAL, verbose=TRUE], ans, output="natural join using: [id1, id2]")
test(2045.03, d1[d2, on=.(id1,id2)], ans)
test(2045.04, d1[d2, on=.(id1,id2), nomatch=NULL], ans[1:2])
test(2045.05, d1[d2, on=.NATURAL, verbose=TRUE], ans, output="natural join using: [id1, id2]")
test(2045.06, d1[d2, on=.NATURAL, verbose=TRUE], ans, output="natural join using: [id1, id2]")
test(2045.07, d1[d2, nomatch=NULL, on=.NATURAL, verbose=TRUE], ans[1:2], output="natural join using: [id1, id2]")
setkey(d1, id1)
test(2045.08, nrow(d1[d2, allow.cartesian=TRUE]), 9L) # join
test(2045.09, d1[d2, on=.NATURAL, verbose=TRUE], ans, output="natural join using: [id1, id2]") # ignore key when on=.NATURAL
setkey(d1, NULL)
setnames(d2, c("a","b","c"))
test(2045.10, d1[d2, on=.NATURAL], error="Attempting to do natural join but no common columns in provided tables")
d2 = data.table(id1=2:4, id2=letters[3:5], v2=3:1)
test(2045.11, d1[d2, on=.(id1,id2)], error="Incompatible join types: x.id2 (integer) and i.id2 (character)")
test(2045.12, d1[d2, on=.NATURAL, verbose=TRUE], output="natural join", error="Incompatible join types: x.id2 (integer) and i.id2 (character)")
test(2045.13, d1[d1, on=.NATURAL, verbose=TRUE], d1, output="natural join using all 'x' columns")
d1 = setDT(replicate(20L, 1L, simplify = FALSE))
d2 = copy(d1[ , 1:15])
setnames(d2, 1L, 'X1')
test(2045.14, d1[d2, on=.NATURAL, verbose=TRUE], cbind(d1, X1 = d2$X1), output="natural join using: \\[.*[.]{3}\\]")
# tests for adding the key argument to as.data.table, #890
## as.data.table.numeric (should cover as.data.table.factor,
## *.ordered, *.integer, *.logical, *.character, and *.Date since
## all are the same function in as.data.table.R)
nn = c(a=0.1, c=0.2, b=0.3, d=0.4)
ans = data.table(nn, key='nn')
ans_rn = data.table(rn = names(nn), nn, key='rn')
test(2046.01, as.data.table(nn, key="nn"), ans)
test(2046.02, as.data.table(nn, keep.rownames=TRUE, key="rn"), ans_rn)
## as.data.table.data.frame
DF = as.data.frame(ans)
test(2046.03, as.data.table(DF, key="nn"), ans)
## as.data.table.data.table
DT = copy(ans)
test(2046.04, as.data.table(DT, key="nn"), ans)
## as.data.table.default
rr <- as.raw(3:1)
test(2046.05, as.data.table(rr, keep.rownames=TRUE, key="rn"),
data.table(rn = paste0(1:3), x = rr, key='rn'))
## as.data.table.list
l = as.list(ans)
test(2046.06, as.data.table(l, key='nn'), ans)
## as.data.table.matrix
mm <- as.matrix(ans)
test(2046.07, as.data.table(mm, key='nn'), ans)
## as.data.table.array
aa = array(nn, c(1L, 2L, 2L))
test(2046.08, as.data.table(aa, key='V3'),
data.table(V1 = 1L, V2 = rep(1:2, 2L), V3 = rep(1:2, each = 2L), value = as.vector(nn), key='V3'))
### conflict between sorted&key arguments
test(2046.09, as.data.table(aa, key='V3', sorted = TRUE), error="Please provide either 'key' or 'sorted'")
## as.data.table.table
tt <- as.table(nn)
test(2046.10, as.data.table(tt, key="N"),
data.table(V1 = names(tt), N = as.vector(tt), key='N'))
# some coverage tests uncovered by #890
test(2047.1, as.data.table(list(character(0L))), data.table(V1 = character(0L)))
test(2047.2, as.data.table(list()), data.table(NULL))
test(2047.3, as.data.table(rbind(1L)), data.table(V1 = 1L))
mm = rbind(1:2)
colnames(mm) = c('a', '')
test(2047.4, as.data.table(mm), data.table(a = 1L, V2 = 2L))
# recycling internal error; #3543
DT = data.table(A=1:3, existingCol=list(0,1,2))
test(2048.1, DT[, newCol:=.(rep(0, .N), rep(1, .N))], # was ok before when assigning to new column
error="Supplied 2 items to be assigned to 3 items of column 'newCol'.*please use rep")
test(2048.2, DT[, existingCol:=.(rep(0, .N), rep(1, .N))], # was internal error (rather than helpful error) when assigning to existing column
error="Supplied 2 items to be assigned to 3 items of column 'existingCol'.*please use rep")
# subassign to an embedded data.table in a 1-row data.table; #3474
inner = data.table(a=1:3, b=1:3)
outer = data.table(aa=1, ab=list(inner))
old = options(datatable.verbose=TRUE)
test(2049.1, outer$ab[[1]]$b <- 4L, 4L, # also tests the incorrect warning has gone
notOutput="revised")
options(old)
test(2049.2, outer$ab, list(data.table(a=1:3, b=4L)))
test(2049.3, outer$ab[[1]][, b := 5L], data.table(a=1:3, b=5L))
test(2049.4, outer$ab, list(data.table(a=1:3, b=5L)))
# rbindlist zero row DT should retain its (unused) levels, #3508
DT = data.table(f = factor(c("a", "b", "c")))
test(2050.1, rbind(DT[1], DT[1])[,levels(f)], c("a","b","c")) # ok before (unused levels when nrow>0 were retained)
test(2050.2, rbind(DT[1], DT[0])[,levels(f)], c("a","b","c")) # ok before
test(2050.3, rbind(DT[0], DT[1])[,levels(f)], c("a","b","c")) # ok before
test(2050.4, rbind(DT[0], DT[0])[,levels(f)], c("a","b","c")) # now ok again (only when nrow=0 were unused levels dropped)
test(2050.5, rbindlist(list(DT[0], DT[0]))[,levels(f)], c("a","b","c")) # now ok again
test(2050.6, rbind(DT[1], data.table(f=factor(letters[10:11]))[0])[,levels(f)], c("a","b","c","j","k")) # now includes "j","k" again
# coverage tests
## abusing -.IDate, format.data.table, rleidv
test(2051.1, `-.IDate`(structure(0, class="Date"), 1L), structure(-1, class="Date"))
test(2051.2, `-.IDate`(1L, 1L), error = 'only subtract from "IDate"')
test(2051.3, format.data.table(1L), error = 'Possibly corrupt data.table')
test(2051.4, rleidv(prefix = 1L), error = "'prefix' must be NULL or")
## passing Date to second argument of as.POSIXct.ITime
t = as.ITime(0L)
test(2051.5, as.POSIXct(t, structure(0L, class="Date")), .POSIXct(0, 'UTC'))
## forder of dynamic expression
DT = data.table(a = 1:3)
test(2051.6, DT[order(sin(a/pi))], DT)
# rbindlist ordered factor with an NA level, #3601
dt1 = structure(list(V1 = c("2016", "2016", "2016", "2016", "2016"),
V46 = structure(c(3L, 1L, 1L, NA, 3L), .Label = c("Low",
"Typical", "High", NA), class = c("ordered", "factor"))), class = c("data.table", "data.frame"), row.names = c(NA, -5L))
dt2 = structure(list(V1 = c("2018", "2018", "2018", "2018", "2018")), row.names = c(NA, -5L), class = c("data.table", "data.frame"))
test(2052, rbindlist(list(dt1, dt2), fill=TRUE),
data.table(V1=c(dt1$V1, dt2$V1),
V46=structure(c(3L,1L,1L,NA,3L,NA,NA,NA,NA,NA), .Label=c("Low","Typical","High",NA), class = c("ordered", "factor"))))
# rbind.data.frame didn't respect integer storage mode of IDate, #2008
DF = data.frame(date = as.IDate(0L))
test(2053.1, storage.mode(rbind(DF, DF)$date), 'integer')
test(2053.2, DF$date[1L] <- integer(), integer())
# forder detected more generally in i, #1921
DT = data.table(
A = c(2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L),
B = c("b", "c", "a", "b", "b", "b", "c", "a", "b", "a"),
C = c(2L, 3L, 5L, 8L, 6L, 1L, 4L, 9L, 10L, 7L)
)
test(2054, DT[order(C)[1:5], B, verbose=TRUE], c('b', 'b', 'c', 'c', 'a'),
output = "forder.c received 10 rows and 1 column")
# ITime class should be retained for same methods as IDate, #3628
test(2055.1, seq(from = as.ITime('00:00:00'), to = as.ITime('00:00:05'), by = 5L), as.ITime(c(0, 5L)))
test(2055.2, c(as.ITime(0L), as.ITime(1L)), as.ITime(c(0L, 1L)))
test(2055.3, mean(as.ITime(c(0L, 0L))), as.ITime(0L))
# as.data.table.array when some of the dimnames are NULL, #3636
a = array(1:8, dim=c(2L,2L,2L), dimnames=list(NULL, NULL, as.character(1:2)))
ans = data.table(V1=c(1L,1L,1L,1L,2L,2L,2L,2L),
V2=c(1L,1L,2L,2L,1L,1L,2L,2L),
V3=c("1","2","1","2","1","2","1","2"),
value=c(1L,5L,3L,7L,2L,6L,4L,8L),
key=c("V1","V2","V3"))
test(2056, as.data.table(a), ans)
# unpack columns which are data.frame; aside in #3369 too.
DF = structure(list(a=1:3, b=data.frame(foo=4:6, bar=7:9)), row.names=1:3, class="data.frame")
DT = as.data.table(DF)
test(2057.1, ncol(DT), 3L)
test(2057.2, DT[2], data.table(a=2L, b.foo=5L, b.bar=8L))
DF = structure(list(a=list(c("a","b"), c("a","b"), c("a","b")), b=data.frame(foo=1:3, bar=4:6)), row.names=1:3, class="data.frame")
# A list being first is needed to mimic the jsonlite case. With this passing, test.R works from https://github.com/Rdatatable/data.table/issues/3369#issuecomment-462662752
DT = as.data.table(DF)
test(2057.3, DT, data.table(a=list(c("a","b"), c("a","b"), c("a","b")), b.foo=1:3, b.bar=4:6))
# one and two+ row cases of data.table, as.data.table and cbind involving list columns, given
# the change to tests 1613.571-3 in PR#3471 in v1.12.4
# in v1.12.2 and before :
# data.table( data.table(1:2), list(c("a","b"),"a") )
# V1 V2 NA
# <int> <char> <char>
# 1: 1 a a
# 2: 2 b a
# i.e. passing a data.table() to data.table() changed the meaning of list() which was inconsistent,
# and an NA column name was introduced too (a bug in itself)
# from v1.12.4 :
# V1 V2
# <int> <list>
# 1: 1 a,b
# 2: 2 a
# i.e. now easier to add the list column as intended, and it's consistent with
# basic (i.e. not cbind-like) usage of data.table()
# # changed in v1.12.4 ?
ans = data.table(V1=1, V2=2) # --------------------
test(2058.01, data.table( data.table(1), 2), ans) # no
test(2058.02, as.data.table(list(data.table(1), 2)), ans) # no
test(2058.03, cbind(data.table(1), 2), ans) # no
ans = data.table(V1=1, V2=list(2)) # 'basic' usage; i.e. not cbind-like
test(2058.04, sapply(ans, class), c(V1="numeric", V2="list")) # no
test(2058.05, data.table( data.table(1), list(2) ), ans) # yes
test(2058.06, as.data.table(list(data.table(1), list(2))), ans) # yes
test(2058.07, cbind(data.table(1), list(2)), ans) # yes
ans = data.table(V1=1:2, V2=list(c("a","b"),"a"))
test(2058.08, sapply(ans, class), c(V1="integer", V2="list")) # no
test(2058.09, data.table( data.table(1:2), list(c("a","b"),"a") ), ans) # yes
test(2058.10, as.data.table(list(data.table(1:2), list(c("a","b"),"a"))), ans) # yes
test(2058.11, cbind(data.table(1:2), list(c("a","b"),"a")), ans) # yes
test(2058.12, cbind(first=data.table(A=1:3), second=data.table(A=4, B=5:7)),
data.table(first.A=1:3, second.A=4, second.B=5:7)) # no
test(2058.13, cbind(data.table(A=1:3), second=data.table(A=4, B=5:7)),
data.table(A=1:3, second.A=4, second.B=5:7)) # no
test(2058.14, cbind(data.table(A=1,B=2),3), data.table(A=1,B=2,V2=3)) # no
L = list(1:3, 4:6)
test(2058.15, as.data.table(L), data.table(V1=1:3, V2=4:6)) # no
# retain all-blank list names, as batchtools relies on this in reg$defs[1,job.pars], #3581
names(L) = c("","")
test(2058.16, as.data.table(L), setnames(data.table(1:3, 4:6),c("",""))) # no
# retain existing duplicate and blank names of a plain-list, just as 1.12.2 did
L = list(1:3, 4:6, 7:9, 10:12)
names(L) = c("","foo","","foo")
test(2058.17, as.data.table(L),
setnames(data.table(1:3, 4:6, 7:9, 10:12),c("","foo","","foo"))) # no
L = list(1:3, NULL, 4:6)
test(2058.18, length(L), 3L)
test(2058.19, as.data.table(L), data.table(V1=1:3, V2=4:6)) # V2 not V3 # no
DT = data.table(a=1:3, b=c(4,5,6))
test(2058.20, DT[,b:=list(NULL)], data.table(a=1:3)) # no
# rbindlist improved error message, #3638
DT = data.table(a=1)
test(2059.1, rbindlist(list(DT,1)), error="Item 2 of input is not a data.frame, data.table or list")
test(2059.2, rbindlist(DT), error="Input is data.table but should be a plain list of items to be stacked")
# rbindlist a list of data.frames where the list itself has a class; e.g. trackdf::example(conversions)
# trackdf does inherit from 'list' (i.e. c("ltraj","list")); we just require is.list() to be TRUE.
L = structure(list(data.frame(A=1:3, B=7:9), data.frame(A=4:6, B=10:12)),
class="customClass") # don't inherit from list as a stronger test
test(2059.3, is.list(L))
test(2059.4, rbindlist(L), data.table(A=1:6, B=7:12))
# does not retain customClass, consistent with v1.12.2; safer since its attributes likely aren't appropriate for the new object
# fcoalesce, #3424
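# fcoalesce(x, ...) fills NAs in x elementwise with the first non-NA value found at that
# position in the subsequent arguments; e.g. fcoalesce(c(1,NA,3), c(0,2,0)) is c(1,2,3)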
bool = c(TRUE, NA, FALSE)
bool_val = c(TRUE, TRUE, FALSE)
int = c(1L, 2L, NA_integer_, 4L)
int_val = 1:4
num = c(1, 2, NA_real_, 4)
num_val = c(1, 2, 3, 4)
str = c('a', NA_character_, 'b', NA_character_)
str_val = c('a', 'b', 'b', 'b')
fkt = factor(str)
fkt_val = factor(str_val)
date = as.Date(int, origin="1970-01-01")
date_val = as.Date(int_val, origin="1970-01-01")
idate = as.IDate(int, origin="1970-01-01")
idate_val = as.IDate(int_val, origin="1970-01-01")
itime = as.ITime(int)
itime_val = as.ITime(int_val)
posix = as.POSIXct(int, origin="1970-01-01")
posix_val = as.POSIXct(int_val, origin="1970-01-01")
# singleton replacements
test(2060.001, fcoalesce(bool, TRUE), bool_val)
test(2060.002, fcoalesce(bool, NA, TRUE), bool_val)
test(2060.003, fcoalesce(int, 3L), int_val)
test(2060.004, fcoalesce(int, NA_integer_, 3L), int_val)
test(2060.005, fcoalesce(num, 3), num_val)
test(2060.006, fcoalesce(num, NA_real_, 3), num_val)
test(2060.007, fcoalesce(str, 'b'), str_val)
test(2060.008, fcoalesce(str, NA_character_, 'b'), str_val)
test(2060.009, fcoalesce(fkt, factor('b', levels = c('a', 'b'))), fkt_val)
test(2060.010, fcoalesce(fkt, factor(NA_integer_, levels=c("a","b")), factor('b', levels = c('a', 'b'))), fkt_val)
test(2060.011, fcoalesce(date, as.Date("1970-01-04")), date_val)
test(2060.012, fcoalesce(date, as.Date(NA), as.Date("1970-01-04")), date_val)
test(2060.013, fcoalesce(idate, as.IDate("1970-01-04")), idate_val)
test(2060.014, fcoalesce(idate, as.IDate(NA), as.IDate("1970-01-04")), idate_val)
test(2060.015, fcoalesce(itime, as.ITime(3L)), itime_val)
test(2060.016, fcoalesce(itime, as.ITime(NA), as.ITime(3L)), itime_val)
test(2060.017, fcoalesce(posix, as.POSIXct(3L, origin="1970-01-01")), posix_val)
test(2060.018, fcoalesce(posix, as.POSIXct(NA_integer_, origin="1970-01-01"), as.POSIXct(3L, origin="1970-01-01")), posix_val)
# vector replacements
test(2060.051, fcoalesce(bool, rep(TRUE, 3L)), bool_val)
test(2060.052, fcoalesce(bool, rep(NA, 3L), rep(TRUE, 3L)), bool_val)
test(2060.053, fcoalesce(int, rep(3L, 4L)), int_val)
test(2060.054, fcoalesce(int, rep(NA_integer_, 4L), rep(3L, 4L)), int_val)
test(2060.055, fcoalesce(num, rep(3, 4L)), num_val)
test(2060.056, fcoalesce(num, rep(NA_real_, 4L), rep(3, 4L)), num_val)
test(2060.057, fcoalesce(str, rep('b', 4L)), str_val)
test(2060.058, fcoalesce(str, rep(NA_character_, 4L), rep('b', 4L)), str_val)
test(2060.059, fcoalesce(fkt, factor(rep('b', 4L), levels=c('a', 'b'))), fkt_val)
test(2060.060, fcoalesce(fkt, factor(rep(NA_integer_, 4L), levels=c("a","b")), factor(rep('b', 4L), levels=c('a', 'b'))), fkt_val)
test(2060.061, fcoalesce(date, rep(as.Date("1970-01-04"), 4L)), date_val)
test(2060.062, fcoalesce(date, rep(as.Date(NA), 4L), rep(as.Date("1970-01-04"), 4L)), date_val)
test(2060.063, fcoalesce(idate, rep(as.IDate("1970-01-04"), 4L)), idate_val)
test(2060.064, fcoalesce(idate, rep(as.IDate(NA), 4L), rep(as.IDate("1970-01-04"), 4L)), idate_val)
test(2060.065, fcoalesce(itime, rep(as.ITime(3L), 4L)), itime_val)
test(2060.066, fcoalesce(itime, rep(as.ITime(NA), 4L), rep(as.ITime(3L), 4L)), itime_val)
test(2060.067, fcoalesce(posix, as.POSIXct(rep(3L, 4L), origin="1970-01-01")), posix_val)
test(2060.068, fcoalesce(posix, as.POSIXct(rep(NA_integer_, 4L), origin="1970-01-01"), as.POSIXct(rep(3L, 4L), origin="1970-01-01")), posix_val)
test(2060.101, fcoalesce(bool, list(NA, TRUE)), bool_val)
# floating point extras
x = c(11L, NA, 13L, NA, 15L, NaN, NA, NA, NA)+0.1
y = c(NA, 12L, 5L, NA, NA, 16L, NaN, Inf, NA)+0.1
z = c(11L, NA, 1L, 14L, NA, 16L, 1L, 2L, NA)+0.1
test(2060.151, fcoalesce(x, y, z), ans<-c(11:16,1,Inf,NA)+0.1)
test(2060.152, fcoalesce(list(x, y, z)), ans)
test(2060.153, fcoalesce(x, list(y,z)), ans)
test(2060.154, fcoalesce(list(x)), x)
test(2060.155, setcoalesce(list(x)), x)
test(2060.156, setcoalesce(list(x,y,z)), ans)
test(2060.157, x, ans) # setcoalesce updated the first item (x) by reference
# factor of different levels
x = factor(c('a','b',NA,NA,'b'))
y = factor(c('b','b','a',NA,'b'))
z = factor(c('a',NA,NA,'d','a'))
test(2060.180, fcoalesce(x, y, z), error="Item 3 is a factor but its levels are not identical to the first item's levels")
# edge cases/checks
test(2060.201, fcoalesce(bool), bool)
test(2060.202, fcoalesce(fkt), fkt)
test(2060.203, fcoalesce(bool, 1L), error='Item 2 is type integer but the first item is type logical. Please coerce before coalescing')
test(2060.204, fcoalesce(bool, NA_integer_), error='Item 2 is type integer but the first item is type logical.')
test(2060.205, fcoalesce(fkt, 1L), error='Item 1 is a factor but item 2 is not a factor. When factors are involved, all items must be factor')
test(2060.206, fcoalesce(num, 3L), error='Item 2 is type integer but the first item is type double')
test(2060.207, fcoalesce(int, 3), error='Item 2 is type double but the first item is type integer')
test(2060.208, fcoalesce(fkt, 'b'), error='Item 1 is a factor but item 2 is not a factor. When factors are involved, all items must be factor.')
test(2060.209, fcoalesce(str, factor('b')), error='Item 2 is a factor but item 1 is not a factor. When factors are involved, all items must be factor')
test(2060.212, fcoalesce(list(1), list(2)), error="The first argument is a list, data.table or data.frame. In this case there should be no other arguments provided.")
test(2060.213, fcoalesce(bool, c(TRUE, FALSE)), error="Item 2 is length 2 but the first item is length 3. Only singletons are recycled")
test(2060.214, fcoalesce(as.raw(0), as.raw(1)), error="Unsupported type: raw")
test(2060.215, fcoalesce(bool, list()), bool)
test(2060.216, fcoalesce(structure(c(1:2,NA,4L), class=c("a")), c(NA,NA,3L,4L)), error="Item 2 has a different class than item 1")
# different classes of x arg #3660
x = c(11L, NA, 13L, NA, 15L, NA)
y = c(NA, 12L, 5L, NA, NA, NA)
z = c(11L, NA, 1L, 14L, NA, NA)
ans = c(11L,12L,13L,14L,15L,NA_integer_)
test(2060.250, fcoalesce(list(x, y, z)), ans)
test(2060.251, fcoalesce(data.frame(x, y, z)), ans)
test(2060.252, fcoalesce(data.table(x, y, z)), ans)
# integer64 tests
if (test_bit64) {
int64 = as.integer64(int)
int64_val = as.integer64(1:4)
test(2060.301, as.character(fcoalesce(int64, as.integer64(3))), as.character(int64_val)) # why as.character see nanotime tests below
test(2060.302, as.character(fcoalesce(int64, as.integer64(NA), as.integer64(3))), as.character(int64_val))
test(2060.303, as.character(fcoalesce(int64, as.integer64(rep(3, 4L)))), as.character(int64_val))
test(2060.304, fcoalesce(int64, 1), error='Item 2 has a different class than item 1')
test(2060.305, fcoalesce(int64, 1L), error = 'Item 2 is type integer but the first item is type double')
}
# nanotime tests
if (test_nanotime) {
nt = nanotime(int)
nt_val = nanotime(1:4)
test(2060.401, as.character(fcoalesce(nt, nanotime(3L))), as.character(nt_val)) # as.character due to eddelbuettel/nanotime#46
test(2060.402, as.character(fcoalesce(nt, nanotime(NA), nanotime(3L))), as.character(nt_val))
test(2060.403, as.character(fcoalesce(nt, nanotime(rep(3, 4L)))), as.character(nt_val))
test(2060.404, fcoalesce(nt, 1), error='Item 2 has a different class than item 1')
test(2060.405, fcoalesce(nt, 1L), error = 'Item 2 is type integer but the first item is type double')
}
# setcoalesce
x = c(11L, NA, 13L, NA, 15L, NA)
y = c(NA, 12L, 5L, NA, NA, NA)
z = c(11L, NA, 1L, 14L, NA, NA)
xx = copy(x)
xx_addr = address(xx)
setcoalesce(xx, y, z)
test(2060.501, xx_addr, address(xx))
test(2060.502, xx, c(11:15, NA))
xx = copy(x)
xx_addr = address(xx)
setcoalesce(xx, list())
test(2060.503, xx_addr, address(xx))
test(2060.504, xx, x)
test(2060.505, address(setcoalesce(xx)), xx_addr)
# complex support for fcoalesce
z1 = c(1i, NA, 1-1i, NA, 0+3i, NA)
z2 = c(NA, 4-2i, 0+0i, NA, NA, NA)
z3 = c(2, NA, 3+6i, 5-1i, NA, NA)
na_idx = c(2L, 4L, 6L)
test(2060.600, fcoalesce(z1, 0+0i), `[<-`(z1, na_idx, 0+0i))
test(2060.601, fcoalesce(z1, z2), `[<-`(z1, na_idx, c(4-2i, NA, NA)))
test(2060.602, fcoalesce(z1, z2, z3), `[<-`(z1, na_idx, c(4-2i, 5-1i, NA)))
z_addr = address(z1)
setcoalesce(z1, z2, z3)
test(2060.603, address(z1), z_addr)
test(2060.604, z1, `[<-`(z1, na_idx, c(4-2i, 5-1i, NA)))
test(2060.605, fcoalesce(NULL, "foo"), NULL) # as seen in mlr using BBmisc::coalesce from example(getHyperPars), #3581
# #3650 -- ensure max nrow check on CJ is applied after unique
l = replicate(ceiling(log10(.Machine$integer.max)), rep(1L, 10L), simplify = FALSE)
l$unique = TRUE
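# 10 columns (ceiling(log10(.Machine$integer.max))) of 10 repeated 1Ls each: the raw cross product
# would be 1e10 rows (> .Machine$integer.max), but with unique=TRUE each column reduces to a single value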
test(2061, do.call(CJ, l), data.table(V1=1L, V2=1L, V3=1L, V4=1L, V5=1L, V6=1L, V7=1L, V8=1L, V9=1L, V10=1L, key=paste0("V",1:10)))
# #3635, not specific to non-equi joins, but only occurs in non-equi cases.
d1 = data.table(a = c(1L, 6L), b = c(11L, 16L))
d2 = data.table(r = 1:5, s = seq(0L, 20L, 5L))
test(2062.1, d1[d2, on = .(a <= s, b >= s), j = .SD], ans<-data.table(a=INT(0,5,10,10,15,20), b=INT(0,5,10,10,15,20)))
test(2062.2, d1[d2, on = .(a <= s, b >= s)][, .(a, b)], ans)
# #3664 -- explicitly test some commonly-used partial argument matches, see ?options
old = options(warnPartialMatchArgs=FALSE,
warnPartialMatchAttr=FALSE,
warnPartialMatchDollar=FALSE)
## id --> id.vars, measure --> measure.vars
DT <- data.table(
i_1 = c(1:17, NA),
i_2 = c(NA, 18:34),
f_1 = factor(c('a', 'c', 'b', NA, 'c', 'b', 'c', 'c', NA, 'c', NA, 'c', 'a', 'b', NA, NA, NA, 'a')),
c_1 = c("a", "c", NA, NA, NA, "c", "b", NA, "a", "b", NA, "a", "c", "b", "c", "b", "a", "b")
)
test(2063.1, melt(DT, id=1:2, measure=3:4), melt(DT, id=c("i_1", "i_2"), measure=c("f_1", "c_1")))
## fun --> fun.aggregate
DT = melt(as.data.table(ChickWeight), id.vars=2:4)
setnames(DT, tolower(names(DT)))
test(2063.2, dcast(DT, time ~ variable, fun=sum)[c(1,2,11,.N)], data.table(time=c(0,2,20,21), weight=c(2053,2461,9647,9841), key="time"))
## keep --> keep.rownames
x <- 1:5
setattr(x, 'names', letters[1:5])
test(2063.3, as.data.table(x, keep=TRUE), data.table(rn=names(x), x=unname(x)))
## ignore --> ignore.empty
ll = list(1:2, NULL, 3:4)
test(2063.4, transpose(ll, ignore=TRUE), list(c(1L, 3L), c(2L, 4L)))
options(old)
# integer / double Date merge should retain attributes, #3679
dbl_date = structure(17896.0, class = "Date")
int_date = structure(17896L, class = "Date")
x = data.table(date = int_date, value = 10, key = 'date')
i = data.table(date = dbl_date, key = 'date')
test(2064.1, x[i, class(date), verbose=TRUE], 'Date',
output="Coercing double column i.date (which contains no fractions) to type integer to match type of x.date")
test(2064.2, i[x, class(date), verbose=TRUE], 'Date',
output="Coercing integer column i.date to type double for join to match type of x.date")
# complex values in grouping, #3639
set.seed(42)
DT = CJ(x = 1:10, a = c("a", "b"), b = 1:2)
DT[ , z := complex(rnorm(1:.N), rnorm(1:.N))]
## can simplify this test after #1444
test(2065.1, all.equal(setkey(copy(DT), NULL), DT[, .(x = x, z = z), by = .(a, b)][order(x, a, b)], ignore.col.order = TRUE))
test(2065.2, DT[ , base::sum(z), by = a], data.table(a = c('a', 'b'), V1 = c(5.0582228485073+0i, -1.8644229822705+0i)))
test(2065.3, DT[ , sum(Mod(z)), by = b], data.table(b = 1:2, V1 = c(16.031422657932, 13.533483145656)))
## mimicking test 171.3 for coverage
x = data.table(A=c(25L,85L,25L,25L,85L), B=c("a","a","b","c","c"), z=0:4 + (4:0)*1i)
test(2065.4, x[ , data.table(A, z)[A==25, z] + data.table(A, z)[A==85, z], by=B],
data.table(B = c('a', 'c'), V1 = c(1, 7) + (c(7, 1))*1i))
## mimicking test 771 for coverage
a = data.table(first=1:6, third=c(1i,1,1i,3,3i,4), key="first")
b = data.table(first=c(3,4,4,5,6,7,8), second=1:7, key="first")
test(2065.5, b[ , third:=a[b, third, by=.EACHI]], error="Supplied 2 items to be assigned to 7 items of")
# also works for assignment, as noted in #3690
DT[ , z_sum := base::sum(z), by = .(a, b)]
test(2065.6, DT[ , z_sum := base::sum(z), by = .(a, b)][1:3, z_sum],
c(1.8791864549242+0i, 3.17903639358309+0i, -4.18868631527035+0i))
test(2065.7, DT[1L, z_sum := 1i][1L, z_sum], 1i)
# GForce for complex columns, part of #3690
DT = data.table(id=c(1L,1L,2L), v=c(1i, 2i, 3i))
test(2066.01, DT[, min(v), by=id], error="'complex' has no well-defined min")
test(2066.02, DT[, max(v), by=id], error="'complex' has no well-defined max")
test(2066.03, DT[, head(v, 1), by=id], data.table(id=1:2, V1=c(1, 3)*1i))
test(2066.04, DT[, tail(v, 1), by=id], data.table(id=1:2, V1=(2:3)*1i))
test(2066.05, DT[, v[2], by=id], data.table(id = 1:2, V1=c(2i, NA)))
## former test 1968.1
DT = data.table(A=1:5, B=-3i, C=2147483647L)
test(2066.06, DT[, .(sum(B), mean(B)), by=A%%2L], data.table(A=1:0, V1=c(-9i, -6i), V2=-3i))
test(2066.07, DT[2:4, .(sum(B), mean(B)), by=A%%2L], data.table(A=0:1, V1=c(-6i, -3i), V2=-3i))
test(2066.08, DT[4, B:=NA]$B, c(-3i,-3i,-3i,NA,-3i))
test(2066.09, DT[, .(sum(B), mean(B)), by=A%%2L], data.table(A=1:0, V1=c(-9i, NA), V2=c(-3i, NA)))
test(2066.10, DT[2:4, .(sum(B), mean(B)), by=A%%2L], data.table(A=0:1, V1=c(NA, -3i), V2=c(NA, -3i)))
test(2066.11, DT[, .(sum(B, na.rm=TRUE), mean(B, na.rm=TRUE)), by=A%%2L], data.table(A=1:0, V1=c(-9i, -3i), V2=-3i))
test(2066.12, DT[2:4, .(sum(B, na.rm=TRUE), mean(B, na.rm=TRUE)), by=A%%2L], data.table(A=0:1, V1=c(-3i, -3i), V2=-3i))
# Shift complex values, part of #3690
z = c(1:3) + c(3:1)*1i
test(2067.1, shift(z), c(NA, z[1:2]))
test(2067.2, shift(z, type = 'lead'), c(z[2:3], NA))
test(2067.3, shift(z, fill = 1i), c(1i, z[1:2]))
test(2067.4, shift(list(z, 1:3)), list(c(NA, z[1:2]), c(NA, 1:2)))
# support for ordering tables with complex columns, #1444
DT = data.table(a = 2:1, z = complex(0, 0:1))
test(2068.1, setkey(copy(DT), a), data.table(a=1:2, z=complex(0, 1:0), key='a'))
test(2068.2, DT[ , abs(z), by=a], data.table(a=2:1, V1=c(0, 1)))
# raw continues not to be supported
DT = data.table(ID=2:1, r=as.raw(0:1))
test(2068.3, setkey(DT, ID), error="Item 2 of list is type 'raw'")
# setreordervec triggers !isNewList branch for coverage
test(2068.4, setreordervec(DT$r, order(DT$ID)), error="reorder accepts vectors but this non-VECSXP")
# forderv (and downstream functions) handles complex vector input, part of #3690
DT = data.table(
a = c(1L, 1L, 8L, 2L, 1L, 9L, 3L, 2L, 6L, 6L),
b = c(3+9i, 10+5i, 8+2i, 10+4i, 3+3i, 1+2i, 5+1i, 8+1i, 8+2i, 10+6i),
c = 6
)
test(2069.01, DT[order(a, b)], DT[base::order(a, b)])
test(2069.02, DT[order(a, -b)], DT[base::order(a, -b)])
test(2069.03, forderv(DT$b, order = 1L), base::order(DT$b))
test(2069.04, forderv(DT$b, order = -1L), base::order(-DT$b))
test(2069.05, forderv(DT, by = 2:1), forderv(DT[ , 2:1]))
test(2069.06, forderv(DT, by = 2:1, order = c(1L, -1L)), DT[order(b, -a), which = TRUE])
# downstreams of forder
DT = data.table(
z = c(0, 0, 1, 1, 2, 3) + c(1, 1, 2, 2, 3, 4)*1i,
grp = rep(1:2, 3L),
v = c(3, 1, 4, 1, 5, 9)
)
unq_z = 0:3 + (1:4)*1i
test(2069.07, DT[ , .N, by=z], data.table(z=unq_z, N=c(2L, 2L, 1L, 1L)))
test(2069.08, DT[ , .N, keyby = z], data.table(z=unq_z, N=c(2L, 2L, 1L, 1L), key='z'))
test(2069.09, dcast(DT, z ~ grp, value.var='v', fill=0),
data.table(z=unq_z, `1`=c(3, 4, 5, 0), `2`=c(1, 1, 0, 9), key='z'))
test(2069.10, frank(DT$z), c(1.5, 1.5, 3.5, 3.5, 5, 6))
test(2069.11, frank(DT$z, ties.method='max'), c(2L, 2L, 4L, 4L, 5L, 6L))
test(2069.12, frank(-DT$z, ties.method='min'), c(5L, 5L, 3L, 3L, 2L, 1L))
test(2069.13, DT[ , rowid(z, grp)], rep(1L, 6L))
test(2069.14, DT[ , rowid(z)], c(1:2, 1:2, 1L, 1L))
test(2069.15, rleid(c(1i, 1i, 1i, 0, 0, 1-1i, 2+3i, 2+3i)), rep(1:4, c(3:1, 2L)))
# double precision parts of complex values are handled properly (1i vs 1.1i differ)
test(2069.16, rleid(c(1i, 1.1i)), 1:2)
test(2069.17, rleidv(DT, "z"), c(1L, 1L, 2L, 2L, 3L, 4L))
test(2069.18, unique(DT, by = 'z'), data.table(z = unq_z, grp = c(1L, 1L, 1L, 2L), v = c(3, 4, 5, 9)))
test(2069.19, unique(DT, by = 'z', fromLast = TRUE), data.table(z = unq_z, grp = c(2L, 2L, 1L, 2L), v = c(1, 1, 5, 9)))
test(2069.20, uniqueN(DT$z), 4L)
# setkey, setorder work
DT = data.table(a = 2:1, z = 0 + (1:0)*1i)
test(2069.21, setkey(copy(DT), z), data.table(a=1:2, z=0+ (0:1)*1i, key='z'))
test(2069.22, setorder(DT, z), data.table(a=1:2, z=0+ (0:1)*1i))
## assorted coverage tests from along the way
if (test_bit64) {
test(2069.23, is.sorted(as.integer64(10:1)), FALSE)
test(2069.24, is.sorted(as.integer64(1:10)))
}
# sort by vector outside of table
ord = 3:1
test(2069.25, forder(data.table(a=3:1), ord), 3:1)
# dogroups.c coverage
test(2069.26, data.table(c='1')[ , expression(1), by=c], error="j evaluates to type 'expression'")
test(2069.27, data.table(c='1', d=2)[ , d := .(NULL), by=c], error='RHS of := is NULL during grouped assignment')
test(2069.28, data.table(c='1', d=2)[ , c(a='b'), by=c, verbose=TRUE], output='j appears to be a named vector')
test(2069.29, data.table(c = '1', d = 2)[ , .(a = c(nm='b')), by = c, verbose = TRUE], output = 'Column 1 of j is a named vector')
DT <- data.table(a = rep(1:3, each = 4), b = LETTERS[1:4], z = 0:3 + (4:1)*1i)
test(2069.30, DT[, .SD[3,], by=b], DT[9:12, .(b, a, z)])
DT = data.table(x=1:4,y=1:2,lgl=TRUE,key="x,y")
test(2069.31, DT[CJ(1:4,1:4), any(lgl), by=.EACHI]$V1,
c(TRUE, NA, NA, NA, NA, TRUE, NA, NA, TRUE, NA, NA, NA, NA, TRUE, NA, NA))
set.seed(45L)
DT1 = data.table(a = sample(3L, 15L, TRUE) + .1, b=sample(c(TRUE, FALSE, NA), 15L, TRUE))
DT2 = data.table(a = sample(3L, 6L, TRUE) + .1, b=sample(c(TRUE, FALSE, NA), 6L, TRUE))
test(2069.32, DT1[DT2, .(y = sum(b, na.rm=TRUE)), by=.EACHI, on=c(a = 'a', b="b")]$y, rep(0L, 6L))
DT = data.table(z = 1i)
test(2069.33, DT[DT, on = 'z'], error = "Type 'complex' not supported for joining/merging")
# forder verbose message when !isReallyReal Date, #1738
DT = data.table(d=sample(seq(as.Date("2015-01-01"), as.Date("2015-01-05"), by="days"), 20, replace=TRUE))
test(2070.01, typeof(DT$d), "double")
test(2070.02, DT[, .N, keyby=d, verbose=TRUE], output="Column 1.*date.*8 byte double.*no fractions are present.*4 byte integer.*to save space and time")
# coverage along with switch+default pairing
test(2071.01, dcast(data.table(id=1, grp=1, e=expression(1)), id ~ grp, value.var='e'), error="Unsupported column type in fcast val: 'expression'")
test(2071.02, is_na(data.table(expression(1))), error="Unsupported column type 'expression'")
test(2071.03, is_na(data.table(1L), 2L), error="Item 1 of 'cols' is 2 which is outside")
test(2071.04, is_na(list(1L, 1:2)), error="Column 2 of input list x is length 2, inconsistent")
test(2071.05, any_na(data.table(1L), 2L), error="Item 1 of 'cols' is 2 which is outside")
test(2071.06, any_na(list(1L, 1:2)), error="Column 2 of input list x is length 2, inconsistent")
test(2071.07, any_na(data.table(as.raw(0L))), FALSE)
test(2071.08, any_na(data.table(c(1+1i, NA))))
test(2071.09, any_na(data.table(expression(1))), error="Unsupported column type 'expression'")
test(2071.10, dcast(data.table(a=1, b=1, l=list(list(1))), a ~ b, value.var='l'),
data.table(a=1, `1`=list(list(1)), key='a'))
test(2071.11, dcast(data.table(a = 1, b = 2, c = 3), a ~ b, value.var = 'c', fill = '2'),
data.table(a=1, `2`=3, key='a'))
# fifelse, #3657
test_vec = -5L:5L < 0L
test_vec_na = c(test_vec, NA)
out_vec = rep(1:0, 5:6)
out_vec_na = c(out_vec, NA_integer_)
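# Minimal illustration (not one of the numbered tests) of the behaviour pinned down by the
# 2072.* tests below: unlike base::ifelse, fifelse is strict about the type and length of
# yes/no (length 1 or length(test)), and an NA in test yields NA in the result.
stopifnot(identical(fifelse(c(TRUE, FALSE, NA), 1L, 0L), c(1L, 0L, NA)))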
test(2072.001, fifelse(test_vec, 1L, 0L), out_vec)
test(2072.002, fifelse(test_vec, 1, 0), as.numeric(out_vec))
test(2072.003, fifelse(test_vec, TRUE, FALSE), as.logical(out_vec))
test(2072.004, fifelse(test_vec, "1", "0"), as.character(out_vec))
test(2072.005, fifelse(test_vec_na, TRUE, NA), c(rep(TRUE,5L), rep(NA,7L)))
test(2072.006, fifelse(test_vec, rep(1L,11L), rep(0L,11L)), out_vec)
test(2072.007, fifelse(test_vec, rep(1L,11L), 0L), out_vec)
test(2072.008, fifelse(test_vec, 1L, rep(0L,11L)), out_vec)
test(2072.009, fifelse(test_vec, rep(1L,11L), rep(0L,10L)), error="Length of 'no' is 10 but must be 1 or length of 'test' (11).")
test(2072.010, fifelse(test_vec, rep(1,10L), rep(0,11L)), error="Length of 'yes' is 10 but must be 1 or length of 'test' (11).")
test(2072.011, fifelse(test_vec, rep(TRUE,10L), rep(FALSE,10L)), error="Length of 'yes' is 10 but must be 1 or length of 'test' (11).")
test(2072.012, fifelse(0:1, rep(TRUE,2L), rep(FALSE,2L)), error="Argument 'test' must be logical.")
test(2072.013, fifelse(test_vec, TRUE, "FALSE"), error="'yes' is of type logical but 'no' is of type character. Please")
test(2072.014, fifelse(test_vec, list(1),list(2,4)), error="Length of 'no' is 2 but must be 1 or length of 'test' (11).")
test(2072.015, fifelse(test_vec, list(1,3),list(2,4)), error="Length of 'yes' is 2 but must be 1 or length of 'test' (11).")
test(2072.016, fifelse(test_vec, list(1), list(0)), as.list(as.numeric(out_vec)))
test(2072.017, fifelse(test_vec, list(1), list(0)), as.list(as.numeric(out_vec)))
## Jan 1 - 5, 2011
date_vec = as.Date(14975:14979, origin = '1970-01-01')
test(2072.018, fifelse(date_vec == "2011-01-01", date_vec - 1L, date_vec),
c(date_vec[1L] - 1L, date_vec[2:5]))
test(2072.019, fifelse(c(TRUE,FALSE,TRUE,TRUE,FALSE), factor(letters[1:5]), factor("a", levels=letters[1:5])),
factor(c("a","a","c","d","a"), levels=letters[1:5]))
test(2072.020, fifelse(test_vec_na, 1L, 0L), out_vec_na)
test(2072.021, fifelse(test_vec_na, rep(1L,12L), 0L), out_vec_na)
test(2072.022, fifelse(test_vec_na, rep(1L,12L), rep(0L,12L)), out_vec_na)
test(2072.023, fifelse(test_vec_na, 1L, rep(0L,12L)), out_vec_na)
test(2072.024, fifelse(test_vec_na, 1, 0), as.numeric(out_vec_na))
test(2072.025, fifelse(test_vec_na, rep(1,12L), 0), as.numeric(out_vec_na))
test(2072.026, fifelse(test_vec_na, rep(1,12L), rep(0,12L)), as.numeric(out_vec_na))
test(2072.027, fifelse(test_vec_na, 1, rep(0,12L)), as.numeric(out_vec_na))
test(2072.028, fifelse(test_vec_na, TRUE, rep(FALSE,12L)), as.logical(out_vec_na))
test(2072.029, fifelse(test_vec_na, rep(TRUE,12L), FALSE), as.logical(out_vec_na))
test(2072.030, fifelse(test_vec_na, rep(TRUE,12L), rep(FALSE,12L)), as.logical(out_vec_na))
test(2072.031, fifelse(test_vec_na, "1", rep("0",12L)), as.character(out_vec_na))
test(2072.032, fifelse(test_vec_na, rep("1",12L), "0"), as.character(out_vec_na))
test(2072.033, fifelse(test_vec_na, rep("1",12L), rep("0",12L)), as.character(out_vec_na))
test(2072.034, fifelse(test_vec_na, "1", "0"), as.character(out_vec_na))
test(2072.035, fifelse(test_vec, as.Date("2011-01-01"), FALSE), error="'yes' is of type double but 'no' is of type logical. Please")
test(2072.036, fifelse(test_vec_na, 1+0i, 0+0i), as.complex(out_vec_na))
test(2072.037, fifelse(test_vec_na, rep(1+0i,12L), 0+0i), as.complex(out_vec_na))
test(2072.038, fifelse(test_vec_na, rep(1+0i,12L), rep(0+0i,12L)), as.complex(out_vec_na))
test(2072.039, fifelse(test_vec_na, 1+0i, rep(0+0i,12L)), as.complex(out_vec_na))
test(2072.040, fifelse(test_vec, as.raw(0), as.raw(1)), error="Type raw is not supported.")
test(2072.041, fifelse(TRUE,1,as.Date("2019-07-07")), error="'yes' has different class than 'no'. Please")
test(2072.042, fifelse(TRUE,1L,factor(letters[1])), error="'yes' has different class than 'no'. Please")
test(2072.043, fifelse(TRUE, list(1:5), list(5:1)), list(1:5))
test(2072.044, fifelse(as.logical(NA), list(1:5), list(5:1)), list(NULL))
test(2072.045, fifelse(FALSE, list(1:5), list(5:1)), list(5:1))
test(2072.046, fifelse(TRUE, list(data.table(1:5)), list(data.table(5:1))), list(data.table(1:5)))
test(2072.047, fifelse(FALSE, list(data.table(1:5)), list(data.table(5:1))), list(data.table(5:1)))
test(2072.048, fifelse(TRUE, list(data.frame(1:5)), list(data.frame(5:1))), list(data.frame(1:5)))
test(2072.049, fifelse(FALSE, list(data.frame(1:5)), list(data.frame(5:1))), list(data.frame(5:1)))
test(2072.050, fifelse(c(TRUE,FALSE), list(1:5,6:10), list(10:6,5:1)), list(1:5,5:1))
test(2072.051, fifelse(c(NA,TRUE), list(1:5,6:10), list(10:6,5:1)), list(NULL,6:10))
test(2072.052, fifelse(c(FALSE,TRUE), list(1:5,6:10), list(10:6,5:1)), list(10:6,6:10))
test(2072.053, fifelse(c(NA,TRUE), list(1:5), list(10:6,5:1)), list(NULL,1:5))
test(2072.054, fifelse(c(NA,TRUE), list(1:5,6:10), list(5:1)), list(NULL,6:10))
test(2072.055, fifelse(c(FALSE,TRUE), list(TRUE), list(10:6,5:1)), list(10:6,TRUE))
test(2072.056, fifelse(c(FALSE,TRUE), list(as.Date("2019-07-07")), list(10:6,5:1)), list(10:6,as.Date("2019-07-07")))
test(2072.057, fifelse(c(FALSE,TRUE), list(factor(letters[1:5])), list(10:6,5:1)), list(10:6,factor(letters[1:5])))
test(2072.058, fifelse(c(NA,FALSE), list(1:5), list(10:6,5:1)), list(NULL,5:1))
test(2072.059, fifelse(c(NA,FALSE), list(1:5,6:10), list(5:1)), list(NULL,5:1))
test(2072.060, fifelse(c(NA,FALSE), list(1:5), list(5:1)), list(NULL,5:1))
test(2072.061, fifelse(c(TRUE,FALSE), list(1L), list(0L)), list(1L,0L))
test(2072.062, fifelse(c(TRUE,FALSE), list(1L), list(0L)), list(1L,0L))
test(2072.063, fifelse(c(TRUE,FALSE), factor(c("a","b")), factor(c("a","c"))), error="'yes' and 'no' are both type factor but their levels are different")
test(2072.064, fifelse(c(TRUE, TRUE, TRUE, FALSE, FALSE), factor(NA, levels=letters[1:5]), factor(letters[1:5])),
factor(c(NA,NA,NA,"d","e"),levels=letters[1:5]))
test(2072.065, fifelse(c(TRUE, TRUE, TRUE, FALSE, NA, FALSE), factor(NA, levels=letters[1:6]), factor(letters[1:6])),
factor(c(NA,NA,NA,"d",NA,"f"),levels=letters[1:6]))
test(2072.066, fifelse(c(TRUE, TRUE, TRUE, FALSE, NA, FALSE), factor(letters[1:6]), factor(NA, levels=letters[1:6])),
factor(c("a","b","c",NA,NA,NA), levels=letters[1:6]))
test(2072.067, fifelse(c(TRUE, NA, TRUE, FALSE, FALSE, FALSE), factor(NA), factor(NA)),
factor(c(NA,NA,NA,NA,NA,NA)))
DT = data.table(x=1:5, y=6:10)
test(2073.01, transpose(DT, keep.names="rn"),
ans<-data.table(rn=c('x','y'), V1=c(1L, 6L), V2=c(2L, 7L), V3=c(3L, 8L), V4=c(4L, 9L), V5=c(5L, 10L)))
test(2073.02, transpose(DT, keep.names=TRUE), error="either NULL.*name of the first column of the result")
test(2073.03, transpose(ans, make.names="rn"), DT)
test(2073.04, transpose(ans, make.names="notthere"), error="make.names='notthere' not found in names of input")
test(2073.05, transpose(ans, keep.names="rn", make.names="rn"), data.table(rn=paste0("V",1:5), x=1:5, y=6:10))
L = list(a=1:3, rn=LETTERS[1:3], b=4:6)
test(2073.06, transpose(L, make.names=0), error="make.names=0 is out of range [1,ncol=3]")
test(2073.07, transpose(L, make.names=4), error="make.names=4 is out of range [1,ncol=3]")
test(2073.08, transpose(L, make.names=NA), error="make.names=NA is out of range [1,ncol=3]")
test(2073.09, transpose(L, make.names=2), list(A=INT(1,4), B=INT(2,5), C=INT(3,6)))
test(2073.10, transpose(L, make.names=2, keep.names='foo'), list(foo=c("a","b"), A=INT(1,4), B=INT(2,5), C=INT(3,6)))
# 2074.* miscellaneous coverage to bring *.R to 100%; see comments in PR #3761
## i is NULL
x = NULL
test(2074.01, data.table(1:10)[x], data.table(NULL))
## auto-guessing of byvars when none of the columns have "normal" names
test(2074.02, data.table(`0`=0, `1`=1)[ , TRUE, by = .(`0` + `1`)], data.table(`0`=1, V1=TRUE))
## also columns/expressions whose names match 'eval.+' (e.g. evaluate()) are OK, just not literal 'eval(' patterns, #3758
evaluate = function(x) c('F', 'D', 'C', 'B', 'A')[findInterval(x, c(0, 60, 70, 80, 90, 100))]
test(2074.03, data.table(grade=c(50L, 91L, 95L, 51L, 89L))[ , .N, by=evaluate(grade)],
data.table(evaluate=c('F', 'A', 'B'), N=c(2L, 2L, 1L)))
## error: use recursive character list indexing to assign when also doing alloc.col()
opt = options(datatable.alloccol=1L)
l = list(foo = list(bar = data.table(a = 1:3, b = 4:6)))
test(2074.04, l[[c('foo', 'bar')]][ , (letters) := 16:18], error = 'under-allocated recursively indexed list')
options(opt)
## alloc.col when assigning to a subset using a 0-truelength value in j
DT = data.table(a=1)
### construct incorrectly to have 0 truelength; follow-up: https://github.com/Rdatatable/data.table/pull/3791#discussion_r318326736
zDT = structure(list(b=2), class = c('data.table', 'data.frame'))
test(2074.05, DT[1L, b := zDT], data.table(a=1, b=2))
## nested .SD in j
DT = data.table(a=1, b=2)
test(2074.06, DT[ , c(.SD[1], .SD[1, .SD[1]]), by=a], data.table(a=1, b=2, b=2))
## as.matrix.data.table when a column has columns (only possible when constructed incorrectly)
DT = structure(list(a=1:5, d=data.table(b=6:10, c=11:15), m=matrix(16:25, ncol=2L)), class = c('data.table', 'data.frame'))
test(2074.07, as.matrix(DT), matrix(1:25, ncol=5L, dimnames=list(NULL, c('a', 'd.b', 'd.c', 'm.1', 'm.2'))))
## can induce !cedta() from base::rownames to get this error
test(2074.08, rownames(structure(list(1:5), class='data.table')), error="Has it been created manually")
## default dimnames.data.table
test(2074.09, dimnames(data.table(a = 1)), list(NULL, 'a'))
## unlock argument of .shallow
DT = data.table(a = 1)
setattr(DT, '.data.table.locked', TRUE)
test(2074.10, attr(.shallow(DT, unlock=TRUE), '.data.table.locked'), NULL)
## coverage of rowidv & rleidv
test(2074.11, rowidv(1:10, cols=1), error="x is a single vector, non-NULL 'cols'")
test(2074.12, rowidv(1:10), rep(1L, 10L))
test(2074.13, rowidv(list(1:10), cols=integer()), error="x is a list, 'cols' cannot be 0-length")
test(2074.14, rleidv(1:10, cols=1), error="x is a single vector, non-NULL 'cols'")
test(2074.15, rleidv(list(1:10), cols=integer()), error="x is a list, 'cols' cannot be 0-length")
## coverage of .prepareFastSubset
DT = data.table(V1=c('a', 'b', 'a'), V2 = c('hello', 'ello', 'llo'), x=TRUE)
test(2074.16, nrow(DT[!(V1=='a' & V2 %like% 'll')]), 1L)
y = c(TRUE, FALSE, FALSE)
test(2074.17, nrow(DT[x & y]), 1L)
setkey(DT, V1)
test(2074.18, DT[V1=='a', verbose=TRUE], output='Optimized subsetting with key')
# print.data.table
DT2 = data.table(a=1:101)
test(2074.19, length(capture.output(print(DT2, nrows=1i))), 12L)
test(2074.20, length(capture.output(print(DT2[-1L], nrows=1i))), 102L)
test(2074.21, length(capture.output(print(DT2, nrows=-1L))), 0L)
test(2074.22, length(capture.output(print(DT2, topn=1i))), 12L)
test(2074.23, capture.output(print(DT2, topn=1L, col.names='none')),
c(" 1: 1", " --- ", "101: 101"))
# foverlaps
x = data.table(start=NA_integer_, end=1L, key='start,end')
y = copy(x)
test(2074.24, foverlaps(x, y), error="NA values in data.table 'x' start column")
x[ , start := 0L]
setkey(x, start, end)
test(2074.25, foverlaps(x, y), error="NA values in data.table 'y' start column")
setkey(y, end, start)
test(2074.26, foverlaps(x, y), error="NA values in data.table 'y' end column")
# cube
test(2074.27, cube(DT, by=1L), error="Argument 'by' must be a character")
test(2074.28, cube(DT, by='a', id=1L), error="Argument 'id' must be a logical")
# groupingsets
test(2074.29, groupingsets(DT, .(grouping=max(1)), by='V1', sets=list('V1'), id=TRUE),
error="When using `id=TRUE` the 'j' expression must not evaluate to a column named 'grouping'")
# tstrsplit
test(2074.30, tstrsplit('a', names=1L), error="'names' must be TRUE/FALSE or a character vector")
# fcast with eval in fun.aggregate
DT[ , z := 0L]
test(2074.31, dcast(DT, V1 ~ z, fun.aggregate=eval(quote(length)), value.var='z'),
data.table(V1=c('a', 'b'), `0`=2:1,key='V1'))
# fwrite both logical args
test(2074.32, fwrite(DT, logical01=TRUE, logicalAsInt=TRUE), error="logicalAsInt has been renamed")
# merge.data.table
test(2074.33, merge(DT, DT, by.x = 1i, by.y=1i), error="A non-empty vector of column names is required")
# shift naming
test(2074.34, shift(list(a=1:5, b=6:10), give.names=TRUE), list(a_lag_1=c(NA, 1:4), b_lag_1=c(NA, 6:9)))
test(2074.35, shift(1:5, 1:2, give.names=TRUE), list(V1_lag_1=c(NA, 1:4), V1_lag_2=c(NA, NA, 1:3)))
# bmerge.c
x = data.table(a='a')
test(2074.36, bmerge(x, x, 1L, 1L, 0, FALSE, 0L, "all", '==', FALSE), error="rollends must be a length 2")
test(2074.37, bmerge(x, x, 1L, 1L, 'nearest', c(TRUE, FALSE), 0L, "all", 1L, FALSE), error="roll='nearest' can't be applied to a character")
# trigger reallocation on big non-equi-join
set.seed(384)
d = data.table(a=sample(150, 150, TRUE), b=1:150)
test(2074.38, nrow(d[d, on = .(a>a, b>b), allow.cartesian=TRUE]), 5722L)
# fread.c
## ok / quoted branch of parse_double_extended
test(2074.39, fread('a,b\n"Inf,2\n'), data.table(a='"Inf', b=2L), warning="Found and resolved")
## verbose output
test(2074.40, fread('a\n1', na.strings=character(), verbose=TRUE), output='No NAstrings provided')
test(2074.41, fread('a\n1', na.strings='9', verbose=TRUE), output='One or more of the NAstrings looks like a number')
# cbind 0 cols, #3334
test(2075, data.table(data.table(a=1), data.table()), data.table(data.table(a=1)))
# natural join using X[on=Y], #3621
X = data.table(a=1:2, b=1:2)
test(2076.01, X[on=.(a=2:3, d=2:1)], data.table(a=2:3, b=c(2L,NA_integer_), d=2:1))
Y = data.table(a=2:3, d=2:1)
test(2076.02, X[on=Y], data.table(a=2:3, b=c(2L,NA_integer_), d=2:1))
test(2076.03, X[on=3], X,
warning=c("When on= is provided but not i=, on= must be a named list or data.table|frame, and a natural join.*Ignoring on= which is 'numeric'",
"i and j are both missing so ignoring the other arguments. This warning will be upgraded to error in future."))
test(2076.04, X[on=list(3)], X,
warning=c("When on= is provided but not i=, on= must be a named list or data.table|frame, and a natural join.*Ignoring on= which is 'list'",
"i and j are both missing so ignoring the other arguments. This warning will be upgraded to error in future."))
# gsum int64 support #1647, #3464
if (test_bit64) {
d = data.table(g=1:2, i32=c(2L,-1L,3L,4L), i64=as.integer64(c(2L,-1L,3L,4L)))
int64_int32_match = function(x, y) isTRUE(all.equal(lapply(x, as.integer), lapply(y, as.integer)))
test(2077.01, int64_int32_match(d[, sum(i32), g], d[, sum(i64), g]))
test(2077.02, int64_int32_match(d[, sum(i32, na.rm=TRUE), g], d[, sum(i64, na.rm=TRUE), g]))
d[3L, c("i32","i64") := list(NA_integer_, as.integer64(NA))] # some NA group
test(2077.03, int64_int32_match(d[, sum(i32), g], d[, sum(i64), g]))
test(2077.04, int64_int32_match(d[, sum(i32, na.rm=TRUE), g], d[, sum(i64, na.rm=TRUE), g]))
d[1L, c("i32","i64") := list(NA_integer_, as.integer64(NA))] # all NA group
test(2077.05, int64_int32_match(d[, sum(i32), g], d[, sum(i64), g]))
test(2077.06, int64_int32_match(d[, sum(i32, na.rm=TRUE), g], d[, sum(i64, na.rm=TRUE), g]))
}
# between NAs in character bounds #3667
test(2078.01, letters %between% c("m", NA_character_), c(rep(FALSE, 12L), rep(TRUE, 14L)))
test(2078.02, between(letters, "m", NA_character_, incbounds=TRUE), c(rep(FALSE, 12L), rep(TRUE, 14L)))
test(2078.03, between(letters, "m", NA_character_, incbounds=FALSE), c(rep(FALSE, 13L), rep(TRUE, 13L)))
test(2078.04, between(letters, NA_character_, "m", incbounds=TRUE), c(rep(TRUE, 13L), rep(FALSE, 13L)))
test(2078.05, between(letters, NA_character_, "m", incbounds=FALSE), c(rep(TRUE, 12L), rep(FALSE, 14L)))
test(2078.06, between(letters, NA_character_, NA_character_, incbounds=TRUE), rep(TRUE, 26L))
test(2078.07, between(letters, NA_character_, NA_character_, incbounds=FALSE), rep(TRUE, 26L))
test(2078.11, between(c("a","b","c"), c("a","b"), "d"), error="Incompatible vector lengths.*3.*2.*1")
test(2078.12, between(c("a","b","c"), "a", c("b","d")), error="Incompatible vector lengths.*3.*2.*1")
test(2078.13, between(c("a","b","c"), c("a","b"), c("b","d")), error="Incompatible vector lengths.*3.*2.*2")
test(2078.21, between(c("a","c","e"), c("b",NA,"d"), "e", incbounds=TRUE), c(FALSE, TRUE, TRUE))
test(2078.22, between(c("a","c","e"), c("b",NA,"d"), "e", incbounds=FALSE), c(FALSE, TRUE, FALSE))
test(2078.23, between(c("a","c","e"), c("b","c","d"), NA, incbounds=TRUE), c(FALSE, TRUE, TRUE))
test(2078.24, between(c("a","c","e"), c("b","c","d"), NA, incbounds=FALSE), c(FALSE, FALSE, TRUE))
test(2078.25, between(c("a","c","e"), c("b",NA,"d"), NA, incbounds=TRUE), c(FALSE, TRUE, TRUE))
test(2078.26, between(c("a","c","e"), c("b",NA,"d"), NA, incbounds=FALSE), c(FALSE, TRUE, TRUE))
test(2078.27, between(c("a","c","e"), "a", c("b",NA,"d"), incbounds=TRUE), c(TRUE, TRUE, FALSE))
test(2078.28, between(c("a","c","e"), "a", c("b",NA,"d"), incbounds=FALSE), c(FALSE, TRUE, FALSE))
test(2078.29, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=TRUE), c(TRUE, TRUE, TRUE))
test(2078.30, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=FALSE), c(TRUE, TRUE, FALSE))
test(2078.31, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=TRUE, NAbounds=NA), c(NA, NA, NA))
test(2078.32, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=FALSE, NAbounds=NA), c(NA, NA, FALSE))
# between NA bound as unknown rather than unlimited, #3522
test(2079.01, between(1:5, 3L, NA, incbounds=TRUE, NAbounds=NA), c(FALSE, FALSE, NA, NA, NA))
test(2079.02, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=TRUE), c(FALSE, FALSE, FALSE, TRUE, TRUE))
test(2079.03, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=FALSE), error="NAbounds must be TRUE or NA")
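# Illustrative contrast (not a numbered test) of the two NA-bound interpretations tested above:
# the default NAbounds=TRUE treats an NA bound as unlimited on that side, while NAbounds=NA
# propagates NA as 'unknown'.
stopifnot(identical(between(1:3, 2L, NA), c(FALSE, TRUE, TRUE)),
          identical(between(1:3, 2L, NA, NAbounds=NA), c(FALSE, NA, NA)))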
# nanotime support
if (test_nanotime) {
n=nanotime(1:4)
n[2L]=NA
op = options(datatable.verbose=TRUE)
test(2080.01, between(n, nanotime(2), nanotime(10)), c(FALSE, NA, TRUE, TRUE), output="between parallel processing of integer64")
test(2080.02, between(n, nanotime(3), nanotime(10), incbounds=FALSE), c(FALSE, NA, FALSE, TRUE), output="between parallel processing of integer64")
test(2080.03, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=NA), c(FALSE, NA, FALSE, NA), output="between parallel processing of integer64")
options(op)
test(2080.04, between(1:10, nanotime(3), nanotime(6)), error="x is not integer64 but.*Please align classes")
test(2080.05, between(1:10, 3, nanotime(6)), error="x is not integer64 but.*Please align classes")
}
# use raw type to cover fallback to R in between.R
old = options(datatable.verbose=TRUE)
test(2081.01, between(as.raw(1:5), as.raw(2), as.raw(4)), c(FALSE, TRUE, TRUE, TRUE, FALSE), output="fallback to slow R")
test(2081.02, between(as.raw(1:5), as.raw(2), as.raw(4), incbounds=FALSE), c(FALSE, FALSE, TRUE, FALSE, FALSE), output="fallback to slow R")
options(old)
test(2081.03, between(3i, NA, NA), error="Not yet implemented NAbounds=TRUE for this non-numeric and non-character type")
# new check in 1.12.4 that lower<=upper when check=TRUE (FALSE by default), #3838
test(2082.01, between(1:3, INT(2,3,4), 2L), ans<-c(FALSE, FALSE, FALSE))
test(2082.02, between(1:3, INT(2,3,4), 2L, check=TRUE), error="Item 2 of lower (3) is greater than item 1 of upper (2)")
test(2082.03, between(as.double(1:3), c(2,3,4), 2), ans)
test(2082.04, between(as.double(1:3), c(2,3,4), 2, check=TRUE), error="Item 2 of lower [(]3.*[)] is greater than item 1 of upper [(]2.*[)]")
if (test_bit64) {
test(2082.05, between(as.integer64(1:3), as.integer64(2), as.integer64(1)), ans)
test(2082.06, between(as.integer64(1:3), as.integer64(2), as.integer64(1), check=TRUE), error="Item 1 of lower (2) is greater than item 1 of upper (1)")
}
test(2082.07, between(letters[1:2], c("foo","bar"), c("bar")), c(FALSE,FALSE))
test(2082.08, between(letters[1:2], c("foo","bar"), c("bar"), check=TRUE), error="Item 1 of lower ('foo') is greater than item 1 of upper ('bar')")
test(2082.09, between(as.raw(1:5), as.raw(3), as.raw(2), check=TRUE), error="Some lower>upper for this non-numeric and non-character type")
test(2082.10, between(1:3, 2, 4, check=NA), error="check must be TRUE or FALSE")
# partial instantiation of integer64 column was creating NA_REAL, not INT64_MIN
if (test_bit64) {
# sub-assign from #3723
test(2083.1, data.table(x=1:2)[1, y := as.integer64(0L)]$y, as.integer64(c(0L, NA)))
# rbindlist(fill=TRUE) from #1459
test(2083.2, rbind(data.table(a=1:2, b=as.integer64(c(1,NA))), data.table(a=3L), fill=TRUE)$b, as.integer64(c(1, NA, NA)))
}
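# Illustrative sketch only (not a numbered test): integer64 values are carried in a double
# vector and the integer64 NA is the INT64_MIN bit pattern rather than R's NA_real_, which is
# why unfilled cells must be instantiated with that pattern.
if (test_bit64) local({
  v = as.integer64(c(1, NA))
  stopifnot(is.na(v[2L]), !is.na(unclass(v)[2L]))  # the underlying double payload is not NA_real_
})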
# groupingsets j=.N by character(0) set #3653
d = data.table(x = c("a", "a", "b"))
test(2084.01, groupingsets(d, j = .N, by = "x", sets = list("x", character())), data.table(x=c("a","b",NA_character_), N=c(2L,1L,3L)))
test(2084.02, groupingsets(d, j = .N, by = "x", sets = list(character())), data.table(x=NA_character_, N=3L))
test(2084.03, groupingsets(d, j = .GRP, by = "x", sets = list("x", character())), data.table(x=c("a","b",NA_character_), GRP=c(1L,2L,1L)))
test(2084.04, groupingsets(d, j = .GRP, by = "x", sets = list(character())), data.table(x=NA_character_, GRP=1L))
test(2084.05, groupingsets(d, j = .I, by = "x", sets = list("x", character())), data.table(x=c("a","a","b",rep(NA_character_,3L)), I=c(1:3,1:3)))
test(2084.06, groupingsets(d, j = .I, by = "x", sets = list(character())), data.table(x=rep(NA_character_,3L), I=1:3))
# fifelse #3740
if (test_bit64) {
i = as.integer64(1:4)+3e9
test(2085.01, fifelse(c(TRUE,FALSE,NA,TRUE), i, i+100), c(i[1L], i[2L]+100, as.integer64(NA), i[4]))
}
if (test_nanotime) {
n = nanotime(1:4)
test(2085.11, fifelse(c(TRUE,FALSE,NA,TRUE), n, n+100), c(n[1L], n[2L]+100, nanotime(NA), n[4]))
}
test(2085.21, fifelse(c(TRUE,FALSE,NA), 1:3, c(1,2,3)), c(1,2,NA))
test(2085.22, fifelse(c(TRUE,FALSE,NA), c(1,2,3), 1:3), c(1,2,NA))
test(2085.31, fifelse(c(a=TRUE,b=FALSE), list(m=1,n=2), list(x=11,y=12)), list(a=1, b=12))
test(2085.32, fifelse(c(a=TRUE,b=FALSE), c(m=1,n=2), c(x=11,y=12)), c(a=1, b=12))
test(2085.33, ifelse(c(a=TRUE,b=FALSE), c(1,2), c(11,12)), c(a=1, b=12)) # just to detect breaking change in base R
# empty 'by' still valid #3270
DT = data.table(a = 1:10)
test(2086.01, DT[ , sum(a), by = .()], data.table(V1=55L))
test(2086.02, DT[ , sum(a), keyby = .()], data.table(V1=55L))
test(2086.03, DT[ , sum(a), by = list()], data.table(V1=55L))
test(2086.04, DT[ , sum(a), keyby = list()], data.table(V1=55L))
test(2086.05, DT[ , sum(a), by = character()], data.table(V1=55L))
test(2086.06, DT[ , sum(a), keyby = character()], data.table(V1=55L))
# simple queries can create tables with columns sharing the same address, #3766
# these tests were new in 1.12.4 dev; late in that release cycle, 2087.1 was changed to avoid the share for #3890
# when #617 is done it will change back to being the same address
x = data.table(a=1L, b=c(1L, 4L, 2L, 3L), c=4:1)
test(2087.01, x[a == 1L, .(b, b2=b)][ , address(b)!=address(b2)])
# setkey detects and copies shared address columns, #3496
x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE)
x$b = x$a
setDT(x)
test(2087.02, setkey(x, a, verbose=TRUE), data.table(a=paste0(1:2), b=paste0(1:2), key="a"),
output='Found and copied 1 column with a shared memory address')
x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE)
x$b = x$a
x$c = x$a
setDT(x)
test(2087.03, setkey(x, a, verbose=TRUE), data.table(a=paste0(1:2), b=paste0(1:2), c=paste0(1:2), key="a"),
output='Found and copied 2 columns with a shared memory address')
# follow-up from #3890; j results must not share memory with constants in a function body or with variables in the calling scope
f = function(flag=FALSE) {
dt1 = data.table(a = 1)
dt2 = data.table(a = 1)
dt3 = dt1[, .(a, b = 0)] # (**)
if (flag) dt3[dt2, b := 999, on = "a"]
gsub(" ","",as.character(body(f)[[4L]])) # (**) above; remove spaces just to isolate from potential future formatting changes in R
}
test(2087.10, f(TRUE), c("=","dt3","dt1[,.(a,b=0)]")) # was "dt1[,.(a,b=999)]" in v1.12.2
value = 0
dt1 = data.table(a = 1)
dt2 = dt1[, .(a, b = ..value)]
dt2[1, b := 999]
test(2087.11, value, 0) # was 999 in v1.12.2
# clear '.data.table.locked' even when is.null(irows), #2245
x = data.table(a=c(0.85, -0.38, 1.19), b=c(0.56, 0.63, -1.30))
test(2088, x[, round(.SD, 1)][, c:=8.88], data.table(a=c(.8, -.4, 1.2), b=c(.6,.6,-1.3), c=8.88))
# setDT should warn when it sees matrix columns, #3760
DF = data.frame(a=1:5)
DF$m = matrix(6:15, ncol=2L)
test(2089.1, names(setDT(DF)), c("a","m"),
warning="Some columns are a multi-column type.*[[]2[]].*setDT will retain these columns as-is but.*Please consider as.data.table")
test(2089.2, as.data.table(DF), data.table(a=1:5, m.V1=6:10, m.V2=11:15)) # the DF here is now a data.table, so as.data.table.data.table() is matrix-column-aware
test(2089.3, print(DF), output="<multi-column>")
DF = data.frame(a=1:5)
DF$m = matrix(6:15, ncol=2L, dimnames=list(NULL, c("foo","bar")))
test(2089.4, colnames(DF), c("a","m"))
test(2089.5, colnames(DF$m), c("foo","bar"))
test(2089.6, as.data.table(DF), data.table(a=1:5, m.foo=6:10, m.bar=11:15))
# a single matrix item is unambiguous and different to a matrix embedded alongside other items
M = list(matrix(1:6,ncol=1L))
test(2089.7, setDT(M), data.table(V1=1:6)) # worked in 1.12.2 and miceFast relies on it, #3581
M = list(matrix(1:6,ncol=2L))
test(2089.8, setDT(M), data.table(V1=1:3, V2=4:6)) # returned a single column in v1.12.2, now from 1.12.4 returns 2 columns as expected
# get/mget in j should restore all columns to .SD, #1744 and #1965
DT = data.table(a=1, b=2, c=3)
test(2090.1, DT[, {list(a, get('b')); names(.SD)}, .SDcols='c'], 'c')
# Also incidentally fixed #2036 about reordering of columns in .SD
test(2090.2, DT[ , {get('c'); names(.SD)}, .SDcols = c('b', 'a')], c('b', 'a'))
# ditto for #2946
DT <- data.table(v1=1:2, v2 = 3:4, type = c('A', 'B'))
col_in <- c('v2', 'v1')
col_out <- paste0(col_in, '.new')
test(2090.3, DT[, (col_out) := lapply(.SD, function(x){x * min(x[get('type') == 'A'])}), .SDcols = col_in],
data.table(v1=1:2, v2=3:4, type=c('A', 'B'), v2.new=c(9L, 12L), v1.new=1:2))
# minus sign in j with with=FALSE shouldn't be interpreted as 'not', #2109
DT = data.table(a=1, b=2)
i = 2L
test(2091, DT[ , i-1L, with=FALSE], data.table(a=1))
# #1926 -- unlock .SD for bmerge in j
DT = data.table(id=1:2, v=3:4)
DT2 = data.table(id=1, x=5)
DT3 = copy(DT2)
## DT2.id is numeric so bmerge does coercion with set()
test(2092.1, DT[id == 1, DT2[.SD, on="id"]], data.table(id=1L, x=5, v=3L))
DT[id == 1, x := DT2[.SD, x, on="id"]]
DT[id == 1, x := 4]
test(2092.2, DT2$x, DT3$x)
df1 = data.table(a=1:5, b=c(0, 0, 1, 0, 2))
df2 = data.table(c=c(1, 1, 2, 2, 3), d=c(3, 4, 3, 5, 4))
test(2092.3, copy(df2)[ , s := df1[.SD, on=.(a >= c, a <= d), sum(b), by=.EACHI]$V1],
df2[ , s := c(1, 1, 1, 3, 1)])
# POSIXct overflow to NA before 1901 and after 2038, #3780
date=as.POSIXct("1900-01-01", tz="UTC")
test(2093.1, as.IDate(date), as.IDate(-25567L))
test(2093.2, hour(date), 0L)
test(2093.3, minute(date), 0L)
test(2093.4, second(date), 0L)
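# The -25567L above is 1900-01-01 counted in days from the 1970-01-01 epoch (70 years of 365
# days plus 17 leap days); a quick illustrative check, not a numbered test:
stopifnot(identical(as.integer(as.Date("1900-01-01")), -25567L))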
# correct rbindlist() idcol for list elements with different lengths, #3785
x = 1:100
X = rbindlist(lapply(x, function(.) list(., 1:2, 2:3)), idcol = 'TAG')
test(2094.01, X$TAG, rep(x, each = 2))
x = setNames(x, paste0("nm_", x))
X = rbindlist(lapply(x, function(.) list(., 1:2, 2:3)), idcol = 'TAG')
test(2094.02, X$TAG, rep(names(x), each = 2))
# use arbitrary column without message when fun.aggregate=length, #2980
DT = data.table(a=c(3L, 3L, 2L, 9L, 5L, 10L, 3L, 2L, 9L, 8L), b=rep(1:5, 2))
test(2095, any(grepl('override', capture.output(dcast(DT, a~b, fun.aggregate=length)), fixed=TRUE)), FALSE)
# gmean intermediate can overflow integers without warning, #986
test(2096, data.table(a=c(1L,1L), v=c(2e9L, 2e9L))[, mean(v), a], data.table(a=1L, V1=2e9))
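# Why the grouped mean needs an overflow-safe intermediate (illustrative only): summing these
# two values as integers overflows to NA in base R, whereas a double accumulation does not,
# which is what test 2096 relies on.
stopifnot(is.na(suppressWarnings(sum(c(2e9L, 2e9L)))), sum(as.numeric(c(2e9L, 2e9L))) == 4e9)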
# NA_character_ joining to NA factor, #3809
DT = data.table(fac = as.factor(c("a",NA,"b")), v=1:3, key="fac")
test(2097.1, DT[.(NA_character_)], data.table(fac=NA_character_, v=2L, key="fac"))
test(2097.2, DT[J(NA)], error="Factor columns must join to factor or character columns")
test(2097.3, DT[J(NA_integer_)], error="Factor columns must join to factor or character columns")
# Optimized order=>forder within do.call as done by neatRanges and rbi.helpers, PR#3817
# Fixes: 'Column 2 is length 3 which differs' and 'argument x is missing with no default'
DT = data.table(id=INT(3,5,2,4,2), rating=LETTERS[5:1])
groups = c("id", "rating")
test(2098.1, DT[do.call(order, mget(groups)), verbose=TRUE], ans<-data.table(id=INT(2,2,3,4,5), rating=c("A","C","E","B","D")),
output=out<-"forder.c received 5 rows and 2 columns")
test(2098.2, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, output=out)
test(2098.3, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out)
test(2098.4, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out)
old = options(datatable.optimize=0L)
test(2098.5, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c")
test(2098.6, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c")
test(2098.7, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out)
test(2098.8, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out)
options(old)
# Error in update join when joining on factor, #3559
d1 <- data.table(fac = factor(letters[1:4]), char = letters[1:4], val = c(1L, NA, 3L, NA))
d2 <- data.table(fac = factor(letters[1:4]), char = letters[1:4], val = 1:4)
test(2099, d1[is.na(val), val := d2[.SD, x.val, on = .(fac)]], data.table(fac=factor(letters[1:4]), char=letters[1:4], val=1:4))
# fifelse gains na= for missing value, #3753
test_vec_na = c(-5L:5L < 0L, NA)
out_vec_na = c(rep(1:0, 5:6), 2L)
date_vec_na = as.logical(c(as.Date(14975:14979, origin = '1970-01-01') == "2011-01-01", as.Date(NA)))
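# Minimal usage sketch of the new na= argument (illustrative; the numbered 2100.* tests below
# are the authoritative checks): na= supplies the value returned where test is NA.
stopifnot(identical(fifelse(c(TRUE, FALSE, NA), "T", "F", "missing"), c("T", "F", "missing")))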
test(2100.01, fifelse(test_vec_na, 1L, 0L, 2L), out_vec_na)
test(2100.02, fifelse(test_vec_na, 1, 0, 2), as.numeric(out_vec_na))
test(2100.03, fifelse(test_vec_na, TRUE, FALSE, TRUE), as.logical(out_vec_na))
test(2100.04, fifelse(test_vec_na, "1", "0","2"), as.character(out_vec_na))
test(2100.05, fifelse(test_vec_na, 1+0i, 0+0i, 2+0i), as.complex(out_vec_na))
test(2100.06, fifelse(c(TRUE,FALSE,NA), list(1:5), list(5:1), list(15:11)), list(1:5,5:1,15:11))
test(2100.07, fifelse(test_vec_na, 1, 0, 2L), error = "'yes' is of type double but 'na' is of type integer. Please make sure that both arguments have the same type.")
test(2100.08, fifelse(test_vec_na, 1, 0, c(2,3)), error = "Length of 'na' is 2 but must be 1")
test(2100.09, fifelse(date_vec_na, as.Date("2019-08-31"), as.Date("2019-08-30"), as.Date("2019-08-29")), as.Date(c(18139, 18138, 18138, 18138, 18138, 18137), origin = '1970-01-01'))
test(2100.10, fifelse(date_vec_na, as.Date("2019-08-31"), as.Date("2019-08-30"), 18137), error = "'yes' has different class than 'na'. Please make sure that both arguments have the same class.")
test(2100.11, fifelse(c(TRUE,FALSE,TRUE,TRUE,FALSE,NA), factor(letters[1:6]), factor("a", levels=letters[1:6]), factor("f", levels=letters[1:6])), factor(c("a","a","c","d","a","f"), levels=letters[1:6]))
test(2100.12, fifelse(c(TRUE,FALSE,TRUE,TRUE,FALSE,NA), factor(letters[1:6]), factor("a", levels=letters[1:6]), factor("f", levels=letters[1:7])), error = "'yes' and 'na' are both type factor but their levels are different.")
test(2100.13, lapply(list(list(yes = 1, no = 2, na = 3), list(yes = 2, no = 4)), function(el) fifelse(c(TRUE, FALSE, NA), el$yes, el$no, el$na)), list(c(1,2,3),c(2,4,NA)))
test(2100.14, fifelse(c(T,F,NA),c(1,1,1),c(2,2,2),NA), c(1,2,NA))
# joining a 0-row i with a type mismatch is ok to coerce, tcpl #3581
DT = data.table(id=1:3, v=4:6, key="id")
test(2101, DT[.(logical())], data.table(id=logical(), v=integer(), key="id"))
# basic test of cut.IDate to match cut.Date
ID = as.IDate(INT(
0, 192, 384, 576, 768, 961, 1153, 1345, 1537, 1729, 1921,
2113, 2305, 2497, 2689, 2882, 3074, 3266, 3458, 3650
))
D = as.Date(ID)
br = as.IDate(INT(c(0, 151, 243)))
test(2102.1, cut(ID, breaks = br), as.IDate(cut(D, breaks=br)))
test(2102.2, cut(ID, breaks = '1 year'), as.IDate(cut(D, breaks = '1 year')))
test(2102.3, cut(ID, breaks = '6 months'), as.IDate(cut(D, breaks = '6 months')))
# DT[ , !rep(FALSE, ncol(DT)), with=FALSE] would exit before doing !, #3013
test(2103, DT[ , !rep(FALSE, 2L), with=FALSE], DT)
# implicit difftime by group can choose different units per group, which were then ignored when combining, #3694 & #761 (among others)
del = c(0, 60, 3600, 86400)
DT = data.table(ID=1:4, t0=.POSIXct(0), t1=.POSIXct(del))
test(2104.1, DT[ , t1-t0, by=ID], data.table(ID=1:4, V1=as.difftime(del, units='secs')))
test(2104.2, DT[ , t1-t0, by=ID, verbose=TRUE], output='Note: forcing units="secs"')
test(2104.3, DT[ , t1-5, by=ID], data.table(ID=1:4, V1=.POSIXct(del-5)))
# test to prevent regression on #1983
DT = data.table(date = as.IDate(18148L) + 0:2)
eDT = data.table(e_date = as.IDate(18148L))
out = DT[1L][ , V1 := date]
# code from issue doesn't quite work as expected since setattr returns NULL; test this anyway
test(2105.1, DT[eDT, on=.(date >= e_date), {
target_date = setattr(if (e_date[1L] %in% x.date) e_date[1L] else max(x.date), "class", c("IDate", "Date"))
target_date
}, by=.EACHI], out)
# also test version closer to the intended code
test(2105.2, DT[eDT, on=.(date >= e_date), {
target_date = as.IDate(if (e_date[1L] %in% x.date) e_date[1L] else max(x.date))
target_date
}, by=.EACHI], out)
# test to prevent regression on #2041
# UTF-8 hex representations of Chinese codepoints; see the issue, or cat() them in a UTF-8 locale to view
a1 <- data.table(
a = c("\U4E2D\U6587\U4E00", "\U4E2D\U6587\U4E8C", "\U4E2D\U6587\U4E09", "\U4E2D\U6587\U56DB"),
b = 1:12
)
test(2106, a1[a == "\U4E2D\U6587\U4E00"], data.table(a = "\U4E2D\U6587\U4E00", b = c(1L, 5L, 9L)))
# setnames() allows function arguments, #3703
DT = data.table(a=1:3, b=4:6, c=7:9)
setnames(DT, base::toupper)
test(2107.1, names(DT), c('A','B','C'))
setnames(DT, c('B','C'), function(x) sprintf('W_%s_W', x))
test(2107.2, names(DT), c('A','W_B_W','W_C_W'))
DT = data.table(a=1:3, b=4:6, c=7:9)
# support numeric old as well
setnames(DT, 1.0, toupper)
test(2107.3, names(DT), c('A','b','c'))
setnames(DT, -(1:2), toupper)
test(2107.4, names(DT), c('A','b','C'))
# first and last should no longer load the xts namespace, #3857; the commented test below is for interactive validation when xts is installed but not loaded or attached
#stopifnot("xts"%in%installed.packages(), !"xts"%in%loadedNamespaces()); library(data.table); x=as.POSIXct("2019-01-01"); last(x); stopifnot(!"xts" %in% loadedNamespaces())
x = as.POSIXct("2019-09-09")+0:1
old = options(datatable.verbose=TRUE)
test(2108.01, last(x), x[length(x)], output="!is.xts(x)")
test(2108.02, first(x), x[1L], output="!is.xts(x)")
if (test_xts) {
xt = xts(1:2, x)
test(2108.03, last(xt, 2L), xt, output="using xts::last: is.xts(x)")
test(2108.04, first(xt, 2L), xt, output="using xts::first: is.xts(x)")
xt = xts(matrix(1:4, 2L, 2L), x)
test(2108.05, last(xt, 2L), xt, output="using xts::last: is.xts(x)")
test(2108.06, first(xt, 2L), xt, output="using xts::first: is.xts(x)")
}
# first on an empty df now matches head(df, n=1L), #3858
df = data.frame(a=integer(), b=integer())
test(2108.11, first(df), df, output="!is.xts(x)")
test(2108.12, last(df), df, output="!is.xts(x)")
options(old)
# xts last-first dispatch fix #4053
x = 1:3
y = as.POSIXct(x, origin="1970-01-01")
df = data.frame(a=1:2, b=3:2)
dt = as.data.table(df)
mx = matrix(1:9, 3, 3)
ar = array(1:27, c(3,3,3))
xt = structure(
c(142.25, 141.229996, 141.330002, 142.860001, 142.050003, 141.399994,
140.570007, 140.610001, 140.380005, 141.369995, 141.669998, 140.539993,
94807600, 69620600, 76645300, 108.999954, 109.231255, 108.360008),
class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC",
index = structure(c(1167782400, 1167868800, 1167955200), tzone = "UTC", tclass = "Date"),
.Dim = c(3L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted"))
)
old = options(datatable.verbose=TRUE)
if (test_xts) {
test(2108.21, last(x, n=2L), 2:3, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
test(2108.22, last(y, n=2L), y[2:3], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
test(2108.23, last(x, n=1L), 3L, output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
test(2108.24, last(y, n=1L), y[3L], output="using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
xt_last = structure(
c(141.330002, 141.399994, 140.380005, 140.539993, 76645300, 108.360008),
class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC",
index = structure(1167955200, tzone = "UTC", tclass = "Date"),
.Dim = c(1L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted"))
)
xt_last2 = structure(
c(141.229996, 141.330002, 142.050003, 141.399994, 140.610001, 140.380005,
141.669998, 140.539993, 69620600, 76645300, 109.231255, 108.360008),
class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC",
index = structure(c(1167868800, 1167955200), tzone = "UTC", tclass = "Date"),
.Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted"))
)
test(2108.25, last(xt), xt_last, output="using xts::last: is.xts(x)")
test(2108.26, last(xt, n=2L), xt_last2, output="using xts::last: is.xts(x)")
test(2108.31, first(x, n=2L), 1:2, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
test(2108.32, first(y, n=2L), y[1:2], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
test(2108.33, first(x, n=1L), 1L, output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
test(2108.34, first(y, n=1L), y[1L], output="using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()")
xt_first = structure(
c(142.25, 142.860001, 140.570007, 141.369995, 94807600, 108.999954),
class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC",
index = structure(1167782400, tzone = "UTC", tclass = "Date"),
.Dim = c(1L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted"))
)
xt_first2 = structure(
c(142.25, 141.229996, 142.860001, 142.050003, 140.570007, 140.610001, 141.369995, 141.669998, 94807600, 69620600, 108.999954, 109.231255),
class = c("xts", "zoo"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC",
index = structure(c(1167782400, 1167868800), tzone = "UTC", tclass = "Date"),
.Dim = c(2L, 6L), .Dimnames = list(NULL, c("SPY.Open", "SPY.High", "SPY.Low", "SPY.Close", "SPY.Volume", "SPY.Adjusted"))
)
test(2108.35, first(xt), xt_first, output="using xts::first: is.xts(x)")
test(2108.36, first(xt, n=2L), xt_first2, output="using xts::first: is.xts(x)")
} else {
test(2108.21, last(x, n=2L), 2:3, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
test(2108.22, last(y, n=2L), y[2:3], output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
test(2108.23, last(x, n=1L), 3L, output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
test(2108.24, last(y, n=1L), y[3L], output="using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
test(2108.25, last(xt), error="you should have 'xts' installed already")
test(2108.26, last(xt, n=2L), error="you should have 'xts' installed already")
test(2108.31, first(x, n=2L), 1:2, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
test(2108.32, first(y, n=2L), y[1:2], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
test(2108.33, first(x, n=1L), 1L, output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
test(2108.34, first(y, n=1L), y[1L], output="using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()")
test(2108.35, first(xt), error="you should have 'xts' installed already")
test(2108.36, first(xt, n=2L), error="you should have 'xts' installed already")
}
test(2108.41, last(x), 3L, output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))")
test(2108.42, last(y), y[3L], output="using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))")
test(2108.51, first(x), 1L, output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))")
test(2108.52, first(y), y[1L], output="using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))")
test(2108.61, last(df), structure(list(a=2L, b=2L), row.names=2L, class="data.frame"), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)")
test(2108.62, last(dt), data.table(a=2L, b=2L), output="using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)")
test(2108.71, first(df), structure(list(a=1L, b=3L), row.names=1L, class="data.frame"), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)")
test(2108.72, first(dt), data.table(a=1L, b=3L), output="using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)")
# matrix/array utils::tail behavior is likely to change in a future R release; Michael follows that topic more closely
test(2108.81, last(mx), structure(c(3L, 6L, 9L), .Dim = c(1L, 3L), .Dimnames = list("[3,]", NULL)), output="using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)")
expected = if (base::getRversion() < "3.7.0") 27L else structure(c(3L, 6L, 9L, 12L, 15L, 18L, 21L, 24L, 27L), .Dim = c(1L, 3L, 3L), .Dimnames = list("[3,]", NULL, NULL)) #4127
test(2108.82, last(ar), expected, output="using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)")
test(2108.91, first(mx), structure(c(1L, 4L, 7L), .Dim = c(1L, 3L)), output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)")
expected = if (base::getRversion() < "3.7.0") 1L else structure(c(1L, 4L, 7L, 10L, 13L, 16L, 19L, 22L, 25L), .Dim = c(1L, 3L, 3L)) #4127
test(2108.92, first(ar), expected, output="using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)")
options(old)
# error in autonaming by={...}, #3156
DT = data.table(State=c("ERROR", "COMPLETED", "ERROR"), ExitCode=c(1, 0, 2))
test(2109, DT[ , list(count=.N), by={list(State, ExitCode)}], DT[ , count := 1L])
# chmatchdup non-ascii, #3844
utf8 = c("\u00e7ile", "\u00de")
latin1 = iconv(utf8, from = "UTF-8", to = "latin1")
out = c(1L, 2L, NA, NA)
test(2110.1, chmatchdup(c(latin1, latin1), utf8), out)
test(2110.2, chmatchdup(c(utf8, utf8), latin1), out)
test(2110.3, chmatchdup(c(latin1, latin1), latin1), out)
test(2110.4, chmatchdup(c(utf8, utf8), utf8), out)
# setnames: names with the same content but different encodings are considered non-equal, #3845
utf8 = c("\u00e7ile", "\u00de")
latin1 = iconv(utf8, from = "UTF-8", to = "latin1")
tbl = as.data.table(setNames(list(1, 2), latin1))
test(2111.01, Encoding(names(tbl)), c('latin1', 'latin1'))
setnames(tbl, utf8)
test(2111.02, Encoding(names(tbl)), c('UTF-8', 'UTF-8'))
setnames(tbl, 1:2, latin1)
test(2111.03, Encoding(names(tbl)), c('latin1', 'latin1'))
setnames(tbl, latin1, utf8)
test(2111.04, Encoding(names(tbl)), c('UTF-8', 'UTF-8'))
# fwrite scipen, #2020
DT = data.table(a=0.0001, b=10^6, c=-20)
test(2112.01, fwrite(DT), output=out0<-c("a,b,c","1e-04,1e+06,-20")) # default scipen=0 is ensured in test.data.table()
test(2112.02, fwrite(DT, scipen=NULL), output=out0)
old = options(scipen=1)
test(2112.03, fwrite(DT), output=out1<-c("a,b,c","0.0001,1e+06,-20"))
test(2112.04, fwrite(DT, scipen=0), output=out0) # explicit scipen= overrides option
test(2112.05, fwrite(DT, scipen=NULL), output=out0) # NULL means 0, don't use the option's value
options(old)
test(2112.06, fwrite(DT, scipen=999), output=c("a,b,c","0.0001,1000000,-20"))
test(2112.07, fwrite(DT, scipen=1), output=out1)
test(2112.08, fwrite(DT, scipen=2), output=c("a,b,c","0.0001,1000000,-20"))
test(2112.09, fwrite(DT, scipen=-3), output=c("a,b,c","1e-04,1e+06,-20"))
test(2112.10, fwrite(DT, scipen=-4), output=c("a,b,c","1e-04,1e+06,-2e+01"))
test(2112.11, fwrite(DT, scipen=-999), output=c("a,b,c","1e-04,1e+06,-2e+01"))
DT = data.table(a=c(10^(((-4):4)*100),pi))
test(2112.12, fwrite(DT, scipen=0), output="a\n0\n1e-300\n1e-200\n1e-100\n1\n1e+100\n1e+200\n1e+300\nInf\n3.14159265358979")
test(2112.13, fwrite(DT, scipen=999), output=c(
"0",
"0.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
"0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
"0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
"1",
"10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
"100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
"1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
"Inf",
"3.14159265358979"
))
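# The width arithmetic behind the scipen tests above, as exercised by 2112.03-2112.08
# (illustrative only): fixed notation is preferred unless it is more than 'scipen' characters
# wider than scientific, so "0.0001" (1 wider than "1e-04") flips to fixed at scipen=1 and
# "1000000" (2 wider than "1e+06") flips at scipen=2.
stopifnot(nchar("0.0001") - nchar("1e-04") == 1L, nchar("1000000") - nchar("1e+06") == 2L)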
# rbindlist segfault / malformed factor when recycling length-1 factor columns, #3662
test(2113.1, rbindlist(list(data.table(a=as.factor(1:2), b=as.factor(2:3)),
list(a=as.factor(3L), b=as.factor(4:5)))),
data.table(a=as.factor(INT(1,2,3,3)), b=as.factor(2:5)))
test(2113.2, rbindlist(list(list(a=as.factor(1:2), b=factor(), c=as.factor(3), d=as.factor(4)),
list(a="3", b=as.factor(3L), c=c("4","5"), d=character()))),
data.table(a=as.factor(c(1,2,3,3)), b=as.factor(c(NA,NA,3L,3L)), c=as.factor(c(3L,3:5)), d=as.factor(c(4L,4L,NA,NA))),
warning="Column 2 ['b'] of item 1 is length 0. This (and 1 other like it) has been filled with NA")
test(2113.3, rbindlist(list(list(a=numeric(), b=numeric()),
list(a=as.factor(1L), b=as.factor(2:4)))),
data.table(a=as.factor(1L), b=as.factor(2:4)))
# dogroups combines factor levels, #2199 & #2522
DT = data.table(A=1:2)
g = function(x) { if (x==1L) factor(c("a","b888")) else factor(c("b888","c")) } # b888 to cover tl==0 in memrecycle
test(2114.1, DT[,g(.GRP),by=A], data.table(A=INT(1,1,2,2), V1=as.factor(c("a","b888","b888","c"))))
g = function(x) { if (x==1L) factor(c("a","b")) else factor(c("a","b","c")) }
test(2114.2, DT[,g(.GRP),by=A], data.table(A=INT(1,1,2,2,2), V1=as.factor(c("a","b","a","b","c"))))
# original test verbatim from the same issue #2199
set.seed(2)
ids = sample(letters, 20)
dates = 1:40
dt = data.table(CJ(dates, ids, ids))
setnames(dt, c("date", "id1", "id2"))
dt[, value := rnorm(length(date))]
dt = dt[!(date == 1 & (id1 == "a" | id2 == "a"))]
dt = dt[!(date == 4 & (id1 == "e" | id2 == "e"))]
f1 = function(sdt) {
dt1 <- dcast.data.table(sdt, id1 ~ id2)
melt.data.table(dt1, id.vars = "id1")
}
res = dt[, f1(.SD), by=date]
test(2114.3, setnames(res[c(1,.N)],"variable","id2")[,id2:=as.character(id2)][], dt[c(1,.N)])
test(2114.4, print(res), output="date.*0.433")
# and from #2522
DT = data.table(id=1:9, grp=rep(1:3,each=3), val=c("a","b","c", "a","b","c", "a","b","c"))
test(2114.5, as.character(DT[, valfactor1 := factor(val), by = grp]$valfactor1), ans<-rep(c("a","b","c"),3))
test(2114.6, as.character(DT[, valfactor2 := factor(val), by = id]$valfactor2), ans)
DT = data.table(x = rep(letters[c(3, 1, 2)], each = 2))
test(2114.7, DT[, `:=`(g=.GRP, f=factor(.GRP)), by = x],
data.table(x=rep(c("c","a","b"),each=2), g=rep(1:3,each=2), f=factor(rep(as.character(1:3),each=2))))
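# Illustrative sketch, not a numbered test: when j returns factors whose levels differ from group
# to group, the grouped result combines the levels rather than mixing up the underlying integer
# codes, as exercised by tests 2114.* above.
local({
  d = data.table(g = 1:2)
  res = d[, .(f = factor(if (g == 1L) "a" else "b")), by = g]
  stopifnot(identical(levels(res$f), c("a", "b")),
            identical(as.character(res$f), c("a", "b")))
})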
# extra tests from #996 for completeness; no-warning, no-alloc coercion here of the numerics 0 and 1
DT = data.table(a=1:4, b=c(FALSE, TRUE, NA, FALSE))
test(2115.1, set(DT,3L,1L,0), data.table(a=INT(1,2,0,4), b=c(FALSE, TRUE, NA, FALSE)))
test(2115.2, set(DT,3L,2L,0), data.table(a=INT(1,2,0,4), b=c(FALSE, TRUE, FALSE, FALSE)))
test(2115.3, set(DT,3L,2L,-2L), data.table(a=INT(1,2,0,4), b=c(FALSE, TRUE, TRUE, FALSE)), # see also test 299
warning="-2.*integer.*position 1 taken as TRUE.*logical.*column 2 named 'b'")
test(2115.4, set(DT,4L,2L,3.14), data.table(a=INT(1,2,0,4), b=c(FALSE, TRUE, TRUE, TRUE)),
warning="3.14.*double.*position 1 taken as TRUE.*logical.*column 2 named 'b'")
DT = data.table(code=c("c","b","c","a"), val=10:13)
test(2115.5, DT[code=="c", val := val+1], data.table(code=c("c","b","c","a"), val=INT(11,11,13,13)))
DT = data.table(x=factor(LETTERS[1:3]), y=1:6)
test(2115.6, copy(DT)[4:6, x:=LETTERS[1:3]], DT) # identical(RHS, levels)
# allNA(); an aside in PR#3909
test(2116.01, !allNA(c("",NA)))
test(2116.02, allNA(NA_character_))
test(2116.03, allNA(as.character(c(NA,NA))))
test(2116.04, allNA(character())) # same as all(is.na(character()))
test(2116.05, !allNA(c(Inf,NA)))
test(2116.06, allNA(as.double(c(NA,NA))))
test(2116.07, allNA(double())) # same as all(is.na(double()))
if (test_bit64) {
test(2116.08, !allNA(as.integer64(c(NA,0))))
test(2116.09, allNA(as.integer64(c(NA,NA))))
test(2116.10, allNA(integer64())) # same as all(is.na(integer64()))
}
test(2116.11, !allNA(as.raw(c(0,0))))
test(2116.12, !allNA(as.raw(c(0,255))))
test(2116.13, allNA(raw())) # same as all(is.na(raw()))
test(2116.14, allNA(NULL)) # same as all(is.na(NULL))
test(2116.15, allNA(list())) # same as all(is.na(list()))
# turned off allNA list support for now to avoid accidentally using it internally where we did not intend; allNA not yet exported
# https://github.com/Rdatatable/data.table/pull/3909#discussion_r329065950
test(2116.16, allNA(list(NA, NA_character_)), error="Unsupported type 'list' passed to allNA") # base R returns true
test(2116.17, allNA(list(NA, NA)), error="Unsupported type 'list' passed to allNA") # base R returns true
test(2116.18, allNA(list(NA, c(NA,NA))), error="Unsupported type 'list' passed to allNA") # base R returns false
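# Illustrative sketch, not a numbered test: allNA() (internal, not yet exported) is a fast
# equivalent of all(is.na(x)) for atomic vectors; list input is deliberately an error, unlike
# base R, as tested in 2116.* above.
local({
  stopifnot(allNA(as.double(c(NA, NA))), !allNA(c(0, NA)), allNA(double()))
})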
# don't create NA factor level when assigning to factor from character; bug in v1.12.2 noticed in dev part of PR#3909
DT = data.table(a=factor(LETTERS[1:3]))
test(2117.1, levels(DT[2:3,a:=c("",NA_character_)]$a), c("A","B","C",""))
test(2117.2, DT[1,a:=NA_character_]$a, factor(c(NA,"",NA), levels=c("A","B","C","")))
# assigning NA (non-character) to character column by group to trigger zero-copy-coerce case in memrecycle
# used to be inconvenient error ('Type of RHS must match LHS') in v1.12.2 and before, and user had to use NA_character_
DT = data.table(A=rep(1:2,each=3), B=3:4, v=letters[1:6])
test(2118.1, DT[B==3L,v:=NA,by=A]$v, c(NA,"b",NA,"d",NA,"f"))
test(2118.2, DT[,v:=NA,by=A]$v, rep(NA_character_,6L))
# adding list column containing lists to a one-row data.table, #3626
# tests 01-04 used to fail, now work
DT = data.table(a = 1)
list_column = list(list(a = 1, b = 2))
test(2119.01, DT$b <- list_column, list_column)
test(2119.02, DT, ans<-data.table(a=1, b=list(list(a=1, b=2))))
DT = data.table(a = 1)
test(2119.03, DT[, b:=list_column], ans)
test(2119.04, data.table(a=1L)[, newcol := list(list(2L, 3L))], data.table(a=1L, newcol=list(list(2L,3L))))
# extra tests 10-17 from Jan in the issue; all unchanged from 1.12.2
test(2119.10, data.table(a=1L)[, newcol := list(2L)], ans<-data.table(a=1L, newcol=2L))
test(2119.11, data.table(a=1L)[, newcol := 2L], ans)
test(2119.12, data.table(a=1:2)[, newcol := list(2L)], ans<-data.table(a=1:2, newcol=2L))
test(2119.13, data.table(a=1:2)[, newcol := 2L], ans)
test(2119.14, data.table(a=1L)[, newcol := list(list(2L))], data.table(a=1L, newcol=list(2L)))
test(2119.15, data.table(a=1L)[, newcol := list(2L, 3L)], error="Supplied 2 items to be assigned to 1 item")
test(2119.16, data.table(a=1:2)[, newcol := list(list(2L, 3L))], ans<-data.table(a=1:2, newcol=list(2L,3L)))
test(2119.17, data.table(a=1:2)[, newcol := list(2L, 3L)], ans)
# i symbol fetch from calling scope; #3669
iDT = data.table(key = "i_id",
i_id = c("A", "B", "C", "D"),
g = state.name[c(1,1,2,3)],
e_date = as.IDate(c("2019-01-20", "2019-01-20", "2019-01-01", "2019-01-01")),
e_time = as.ITime(c("14:00", "20:00", "20:00", "20:00"))
)
set.seed(1)
vDT = data.table(i_id = unique(iDT$i_id))[, .(v = runif(5,0,10), p = sample(c(5,5,10,10,10))), by=i_id]
test(2120.01, !exists("i_id")) # quick verify in case there's an i_id in .GlobalEnv when testing in dev
test(2120.02, iDT[i_id, order(e_date, e_time)], # first of all, the correct error
error="i_id is not found in calling scope but it is a column of type character")
tmp = vDT[c("B","C","A"), on=.(i_id), .N, by=.EACHI] # split long statement in 2120.05 up as per demo in #3669
test(2120.03, tmp, data.table(i_id=c("B","C","A"), N=5L)) # just make sure the helper tmp is correct
test(2120.04, tmp[iDT[i_id, order(e_date, e_time)]], # i_id obtained from tmp; this is what broke in dev 1.12.3
ans<-data.table(i_id=c("C","A","B"), N=5L))
test(2120.05, tmp[iDT[tmp$i_id, order(e_date, e_time)]], # same but explicit tmp$i_id
ans)
test(2120.06, vDT[c("B","C","A"), on=.(i_id), .N, by=.EACHI][iDT[i_id, order(e_date, e_time)]],
ans) # the original compound statement where i_id was the prior scope
test(2120.07, iDT[(i_id), order(e_date, e_time)], c(3L,4L,1L,2L)) # wrapping with () is different; uses character column to self-join
test(2120.08, tmp[iDT[(i_id), order(e_date, e_time)]], # different result (includes an NA row) because (i_id) self-joins on the character column
data.table(i_id=c("A",NA,"B","C"), N=c(5L,NA,5L,5L)))
# auto-name .() when it's the last item of {...} or wrapped with if(), #2478 #609
DT = data.table(a = c(1, 1, 2), b = 4:6)
test(2121.1, DT[ , {b = b; .(a, b = b + 1)}], DT[ , .(a, b=b+1)])
test(2121.2, DT[ , {{{b = b; .(a, b = b + 1)}}}], DT[ , .(a, b=b+1)])
test(2121.3, DT[ , if (.N > 1L) .(b), by=a], DT[1:2])
test(2121.4, DT[ , if (.N > 1L) .(b) else .(c=b), by=a], DT[ , .(a, c=b)],
warning="Different branches of j expression produced different auto-named columns")
test(2121.5, DT[, .(.N=.N), by=a], data.table(a=c(1,2), .N=2:1)) # user-supplied names take precedence over auto-naming (which would drop the leading dot)
## { ending in NULL should retain that NULL, #4061
test(2121.6, DT[ , {.(a, b=b+1); NULL}], NULL)
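# Illustrative sketch, not a numbered test: when j is a braced block whose last expression is .(),
# that final .() is auto-named just as if it had been supplied directly, as exercised by tests
# 2121.* above.
local({
  d = data.table(g = c(1, 1, 2), v = 1:3)
  stopifnot(identical(d[, {s = sum(v); .(s)}, by = g],
                      d[, .(s = sum(v)), by = g]))
})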
# reference count when deleting columns, #4093
DT = data.table(x=1:2, y=3:4, z <- 5:6)
DT[, x := NULL]
DT[, y := NULL]
test(2122.1, .Internal(inspect(DT,1)), output="VECSXP.*INTSXP.*[[].*(NAM|REF).*[]].*5,6.*ATTRIB")
test(2122.2, DT, data.table(V3=5:6))
# aggregating and grouping by the same column, #3103
dt = data.table(SomeNumberA=c(1,1,1),SomeNumberB=c(1,1,1))
test(2123, dt[, .(.N, TotalA=sum(SomeNumberA), TotalB=sum(SomeNumberB)), by=SomeNumberA], data.table(SomeNumberA=1, N=3L, TotalA=1, TotalB=3))
# system timezone is not usually UTC, so as.ITime.POSIXct shouldn't assume so, #4085
oldtz=Sys.getenv('TZ', unset=NA)
Sys.setenv(TZ='Asia/Jakarta') # UTC+7
t0 = as.POSIXct('2019-10-01')
test(2124.1, format(as.ITime(t0)), '00:00:00')
test(2124.2, format(as.IDate(t0)), '2019-10-01')
if (is.na(oldtz)) Sys.unsetenv("TZ") else Sys.setenv(TZ=oldtz)
# careful to unset because TZ="" means UTC whereas unset TZ means local
# trunc.cols in print.data.table, #4074
old_width = options("width" = 40)
# Single row printing (to check issue with losing attributes)
DT = data.table(a = "aaaaaaaaaaaaa",
b = "bbbbbbbbbbbbb",
c = "ccccccccccccc",
d = "ddddddddddddd")
test(2125.01,
capture.output(print(DT, trunc.cols=TRUE))[3],
"2 variables not shown: [c, d]")
# Printing with dots
DT = data.table(a = vector("integer", 102),
b = "bbbbbbbbbbbbb",
c = "ccccccccccccc",
d = c("ddddddddddddd", "d"))
test(2125.02, capture.output(print(DT, trunc.cols=TRUE)),
c(" a b c",
" 1: 0 bbbbbbbbbbbbb ccccccccccccc",
" 2: 0 bbbbbbbbbbbbb ccccccccccccc",
" 3: 0 bbbbbbbbbbbbb ccccccccccccc",
" 4: 0 bbbbbbbbbbbbb ccccccccccccc",
" 5: 0 bbbbbbbbbbbbb ccccccccccccc",
" --- ",
" 98: 0 bbbbbbbbbbbbb ccccccccccccc",
" 99: 0 bbbbbbbbbbbbb ccccccccccccc",
"100: 0 bbbbbbbbbbbbb ccccccccccccc",
"101: 0 bbbbbbbbbbbbb ccccccccccccc",
"102: 0 bbbbbbbbbbbbb ccccccccccccc",
"1 variable not shown: [d]"))
test(2125.03, capture.output(print(DT, trunc.cols=TRUE, row.names=FALSE)),
c(" a b c",
" 0 bbbbbbbbbbbbb ccccccccccccc",
" 0 bbbbbbbbbbbbb ccccccccccccc",
" 0 bbbbbbbbbbbbb ccccccccccccc",
" 0 bbbbbbbbbbbbb ccccccccccccc",
" 0 bbbbbbbbbbbbb ccccccccccccc",
"--- ",
" 0 bbbbbbbbbbbbb ccccccccccccc",
" 0 bbbbbbbbbbbbb ccccccccccccc",
" 0 bbbbbbbbbbbbb ccccccccccccc",
" 0 bbbbbbbbbbbbb ccccccccccccc",
" 0 bbbbbbbbbbbbb ccccccccccccc",
"1 variable not shown: [d]" ))
test(2125.04, capture.output(print(DT, trunc.cols=TRUE, class=TRUE))[14],
"1 variable not shown: [d <char>]")
test(2125.05, capture.output(print(DT, trunc.cols=TRUE, class=TRUE, row.names=FALSE))[c(1,14)],
c(" a b c",
"1 variable not shown: [d <char>]" ))
test(2125.06, capture.output(print(DT, trunc.cols=TRUE, col.names="none"))[c(1,12)],
c(" 1: 0 bbbbbbbbbbbbb ccccccccccccc",
"1 variable not shown: [d]" ))
test(2125.07, capture.output(print(DT, trunc.cols=TRUE, class=TRUE, col.names="none"))[c(1,13)],
c(" 1: 0 bbbbbbbbbbbbb ccccccccccccc",
"1 variable not shown: [d]" ),
warning = "Column classes will be suppressed when col.names is 'none'")
options("width" = 20)
DT = data.table(a = vector("integer", 2),
b = "bbbbbbbbbbbbb",
c = "ccccccccccccc",
d = "ddddddddddddd")
test(2125.08, capture.output(print(DT, trunc.cols=TRUE)),
c(" a b",
"1: 0 bbbbbbbbbbbbb",
"2: 0 bbbbbbbbbbbbb",
"2 variables not shown: [c, d]"))
options("width" = 10)
DT = data.table(a = "aaaaaaaaaaaaa",
b = "bbbbbbbbbbbbb",
c = "ccccccccccccc",
d = "ddddddddddddd")
test(2125.09, capture.output(print(DT, trunc.cols=TRUE)),
"4 variables not shown: [a, b, c, d]")
test(2125.10, capture.output(print(DT, trunc.cols=TRUE, class=TRUE)),
"4 variables not shown: [a <char>, b <char>, c <char>, d <char>]")
options(old_width)
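# Illustrative sketch, not a numbered test: with a narrow console width, print(x, trunc.cols=TRUE)
# drops the columns that do not fit and reports them in a trailing "variable(s) not shown" line,
# as exercised by tests 2125.* above.
local({
  op = options(width = 30); on.exit(options(op))
  d = data.table(a = 1:2, b = "xxxxxxxxxxxxxxxxxxxx", c = "yyyyyyyyyyyyyyyyyyyy")
  out = capture.output(print(d, trunc.cols = TRUE))
  stopifnot(grepl("not shown", tail(out, 1L)))
})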
# segfault when i is NULL or zero-column, #4060
DT = data.table(A="a", key="A")
test(2126.1, DT[J(NULL)], DT[0])
test(2126.2, DT[data.table()], DT[0])
# additional segfault when i is NULL and roll = 'nearest'
test(2126.3, DT[J(NULL), roll = 'nearest'], DT[0])
test(2126.4, DT[data.table(), roll = 'nearest'], DT[0])
# fcase, #3823
test_vec1 = -5L:5L < 0L
test_vec2 = -5L:5L > 0L
test_vec3 = -5L:5L < 5L
test_vec_na1 = c(test_vec1, NA)
test_vec_na2 = c(test_vec2, NA)
out_vec = c(1,1,1,1,1,NA,0,0,0,0,0)
out_vec_def = c(1,1,1,1,1,2,0,0,0,0,0)
out_vec_na= c(1,1,1,1,1,NA,0,0,0,0,0,NA)
out_vec_oc= c(1,1,1,1,1,NA,NA,NA,NA,NA,NA)
test(2127.01, fcase(test_vec1, 1L, test_vec2, 0L), as.integer(out_vec))
test(2127.02, fcase(test_vec1, 1, test_vec2, 0), out_vec)
test(2127.03, fcase(test_vec1, "1", test_vec2, "0"), as.character(out_vec))
test(2127.04, fcase(test_vec1, TRUE, test_vec2, FALSE), as.logical(out_vec))
test(2127.05, fcase(test_vec1, 1+0i, test_vec2, 0+0i), as.complex(out_vec))
test(2127.06, fcase(test_vec1, list(1), test_vec2, list(0)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0))
test(2127.07, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14")), c(rep(as.Date("2019-10-11"),5),NA,rep(as.Date("2019-10-14"),5)))
test(2127.08, fcase(test_vec1, factor("a", levels=letters[1:3]), test_vec2, factor("b", levels=letters[1:3])), factor(c(rep("a",5),NA,rep("b",5)), levels=letters[1:3]))
test(2127.09, fcase(test_vec1, 1L, test_vec2, 0L, default=2L), as.integer(out_vec_def))
test(2127.10, fcase(test_vec1, 1, test_vec2, 0,default=2), out_vec_def)
test(2127.11, fcase(test_vec1, "1", test_vec2, "0", default ="2"), as.character(out_vec_def))
test(2127.12, fcase(test_vec1, TRUE, test_vec2, FALSE, default=TRUE), as.logical(out_vec_def))
test(2127.13, fcase(test_vec1, 1+0i, test_vec2, 0+0i, default=2+0i), as.complex(out_vec_def))
test(2127.14, fcase(test_vec1, list(1), test_vec2, list(0),default=list(2)), list(1,1,1,1,1, 2, 0, 0, 0, 0, 0))
test(2127.15, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14"),default=as.Date("2019-10-15")), c(rep(as.Date("2019-10-11"),5),as.Date("2019-10-15"),rep(as.Date("2019-10-14"),5)))
test(2127.16, fcase(test_vec1, factor("a", levels=letters[1:3]), test_vec2, factor("b", levels=letters[1:3]),default=factor("c", levels=letters[1:3])), factor(c(rep("a",5),"c",rep("b",5)), levels=letters[1:3]))
test(2127.17, fcase(test_vec1, as.raw(1), test_vec2, as.raw(0)), error="Type raw is not supported.")
test(2127.18, fcase(test_vec1, factor("a", levels=letters[1]), test_vec2, factor("b", levels=letters[1:3])), error="Argument #2 and argument #4 are both factor but their levels are different.")
test(2127.19, fcase(test_vec1, factor("a", levels=letters[1:2]), test_vec2, factor("b", levels=letters[1:2]),default=factor("c", levels=letters[1:3])), error="Resulting value and 'default' are both type factor but their levels are different.")
test(2127.20, fcase(test_vec1, 1L:10L, test_vec2, 3L:12L, test_vec2), error="Received 5 inputs; please supply an even number of arguments in ..., consisting of logical condition, resulting value pairs (in that order). Note that the default argument must be named explicitly, e.g., default=0")
test(2127.21, fcase(test_vec1, 1L, test_vec2, 3), error="Argument #4 is of type double, however argument #2 is of type integer. Please make sure all output values have the same type.")
test(2127.22, fcase(test_vec1, "FALSE", test_vec2, TRUE), error="Argument #4 is of type logical, however argument #2 is of type character. Please make sure all output values have the same type.")
test(2127.23, fcase(test_vec1, "FALSE", test_vec2, 5L), error="Argument #4 is of type integer, however argument #2 is of type character. Please make sure all output values have the same type.")
test(2127.24, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14"),default="2019-10-15"), error="Resulting value is of type double but 'default' is of type character. Please make sure that both arguments have the same type.")
test(2127.25, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14"),default=123), error="Resulting value has different class than 'default'. Please make sure that both arguments have the same class.")
if(test_bit64) {
i=as.integer64(1:12)+3e9
test(2127.26, fcase(test_vec_na1, i, test_vec_na2, i+100), c(i[1L:5L], as.integer64(NA),i[7L:11L]+100, as.integer64(NA)))
}
if(test_nanotime) {
n=nanotime(1:12)
test(2127.27, fcase(test_vec_na1, n, test_vec_na2, n+100), c(n[1L:5L], nanotime(NA),n[7L:11L]+100, as.integer64(NA)))
}
test(2127.28, fcase(test_vec1, rep(1L,11L), test_vec2, rep(0L,11L)), as.integer(out_vec))
test(2127.29, fcase(test_vec1, rep(1,11L), test_vec2, rep(0,11L)), out_vec)
test(2127.30, fcase(test_vec1, rep("1",11L), test_vec2, rep("0",11L)), as.character(out_vec))
test(2127.31, fcase(test_vec1, rep(TRUE,11L), test_vec2, rep(FALSE,11L)), as.logical(out_vec))
test(2127.32, fcase(test_vec1, rep(1+0i,11L), test_vec2, rep(0+0i,11L)), as.complex(out_vec))
test(2127.33, fcase(test_vec1, rep(list(1),11L), test_vec2, rep(list(0),11L)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0))
test(2127.34, fcase(test_vec1, rep(as.Date("2019-10-11"),11L), test_vec2, rep(as.Date("2019-10-14"),11L)), c(rep(as.Date("2019-10-11"),5),NA,rep(as.Date("2019-10-14"),5)))
test(2127.35, fcase(test_vec1, rep(factor("a", levels=letters[1:3]),11L), test_vec2, rep(factor("b", levels=letters[1:3]),11L)), factor(c(rep("a",5),NA,rep("b",5)), levels=letters[1:3]))
test(2127.36, fcase(test_vec_na1, 1L, test_vec_na2, 0L), as.integer(out_vec_na))
test(2127.37, fcase(test_vec_na1, 1, test_vec_na2, 0), out_vec_na)
test(2127.38, fcase(test_vec_na1, "1", test_vec_na2, "0"), as.character(out_vec_na))
test(2127.39, fcase(test_vec_na1, TRUE, test_vec_na2, FALSE), as.logical(out_vec_na))
test(2127.40, fcase(test_vec_na1, 1+0i, test_vec_na2, 0+0i), as.complex(out_vec_na))
test(2127.41, fcase(test_vec_na1, list(1), test_vec_na2, list(0)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0,NULL))
test(2127.42, fcase(c(TRUE,TRUE,TRUE,FALSE,FALSE),factor(NA,levels=letters[1:5]),c(FALSE,FALSE,FALSE,TRUE,TRUE),factor(letters[1:5])),factor(c(NA,NA,NA,"d","e"),levels=letters[1:5]))
test(2127.43, fcase(c(TRUE,TRUE,TRUE,FALSE,NA,FALSE),factor(NA,levels=letters[1:6]),c(FALSE,FALSE,FALSE,TRUE,NA,TRUE),factor(letters[1:6])),factor(c(NA,NA,NA,"d",NA,"f"),levels=letters[1:6]))
test(2127.44, fcase(c(TRUE,TRUE,TRUE,FALSE,NA,FALSE),factor(letters[1:6]),c(FALSE,FALSE,FALSE,TRUE,NA,TRUE),factor(NA,levels = letters[1:6])),factor(c("a","b","c",NA,NA,NA),levels=letters[1:6]))
test(2127.45, fcase(c(TRUE,NA,TRUE,FALSE,FALSE,FALSE),factor(NA),c(TRUE,TRUE,TRUE,FALSE,NA,FALSE),factor(NA)),factor(c(NA,NA,NA,NA,NA,NA)))
test(2127.46, fcase(TRUE, list(data.table(1:5)), FALSE, list(data.table(5:1))), list(data.table(1:5)))
test(2127.47, fcase(FALSE, list(data.table(1:5)), TRUE, list(data.table(5:1))), list(data.table(5:1)))
test(2127.48, fcase(TRUE, list(data.frame(1:5)), FALSE, list(data.frame(5:1))), list(data.frame(1:5)))
test(2127.49, fcase(FALSE, list(data.frame(1:5)), TRUE, list(data.frame(5:1))), list(data.frame(5:1)))
test(2127.50, fcase(1L,1L,TRUE,0L), error = "Argument #1 must be logical.")
test(2127.51, fcase(TRUE,1L,5L,0L), 1L)
test(2127.52, fcase(test_vec1, 1L, test_vec2, 0L, test_vec3, 2L), as.integer(out_vec_def))
test(2127.53, fcase(test_vec1, 1, test_vec2, 0, test_vec3, 2), out_vec_def)
test(2127.54, fcase(test_vec1, "1", test_vec2, "0", test_vec3, "2"), as.character(out_vec_def))
test(2127.55, fcase(test_vec1, TRUE, test_vec2, FALSE, test_vec3, TRUE), as.logical(out_vec_def))
test(2127.56, fcase(test_vec1, 1+0i, test_vec2, 0+0i, test_vec3, 2+0i), as.complex(out_vec_def))
test(2127.57, fcase(test_vec1, list(1), test_vec2, list(0), test_vec3, list(2)), list(1,1,1,1,1, 2, 0, 0, 0, 0, 0))
test(2127.58, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14"), test_vec3, as.Date("2019-10-15")), c(rep(as.Date("2019-10-11"),5),as.Date("2019-10-15"),rep(as.Date("2019-10-14"),5)))
test(2127.59, fcase(test_vec1, factor("a", levels=letters[1:3]), test_vec2, factor("b", levels=letters[1:3]), test_vec3, factor("c", levels=letters[1:3])), factor(c(rep("a",5),"c",rep("b",5)), levels=letters[1:3]))
test(2127.60, fcase(test_vec1, 1L), as.integer(out_vec_oc))
test(2127.61, fcase(test_vec1, 1), out_vec_oc)
test(2127.62, fcase(test_vec1, "1"), as.character(out_vec_oc))
test(2127.63, fcase(test_vec1, TRUE), as.logical(out_vec_oc))
test(2127.64, fcase(test_vec1, 1+0i), as.complex(out_vec_oc))
test(2127.65, fcase(test_vec1, list(1)), list(1,1,1,1,1, NULL, NULL, NULL, NULL, NULL, NULL))
test(2127.66, fcase(test_vec1, as.Date("2019-10-11")), c(rep(as.Date("2019-10-11"),5),rep(NA,6)))
test(2127.67, fcase(test_vec1, factor("a", levels=letters[1:3])), factor(c(rep("a",5),rep("NA",6)), levels=letters[1:3]))
test(2127.68, fcase(test_vec1, 1L, default = 1:2), error = "Length of 'default' must be 1.")
test(2127.69, fcase(test_vec1, 1L, test_vec_na1, 2L), error = "Argument #3 has a different length than argument #1. Please make sure all logical conditions have the same length.")
test(2127.70, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, 2), error = "Argument #4 has different class than argument #2, Please make sure all output values have the same class.")
test(2127.71, fcase(test_vec1, 1L, test_vec2, 2:3), error = "Length of output value #4 must either be 1 or length of logical condition.")
test(2127.72, fcase(TRUE, 1L, FALSE, stop("bang!")), 1L)
test(2127.73, fcase(test_vec1, 1L, test_vec2, 0:10), as.integer(c( 1, 1, 1, 1, 1, NA, 6, 7, 8, 9, 10)))
test(2127.74, fcase(test_vec1, 0:10, test_vec2, 0L), as.integer(c( 0, 1, 2, 3, 4, NA, 0, 0, 0, 0, 0)))
test(2127.75, fcase(test_vec1, 1, test_vec2, as.numeric(0:10)), as.numeric(c( 1, 1, 1, 1, 1, NA, 6, 7, 8, 9, 10)))
test(2127.76, fcase(test_vec1, as.numeric(0:10), test_vec2, 0), as.numeric(c( 0, 1, 2, 3, 4, NA, 0, 0, 0, 0, 0)))
test(2127.77, fcase(test_vec1, "1", test_vec2, as.character(0:10)), as.character(c( 1, 1, 1, 1, 1, NA, 6, 7, 8, 9, 10)))
test(2127.78, fcase(test_vec1, as.character(0:10), test_vec2, "0"), as.character(c( 0, 1, 2, 3, 4, NA, 0, 0, 0, 0, 0)))
test(2127.79, fcase(test_vec1, TRUE, test_vec2, rep(FALSE, 11L)), as.logical(out_vec))
test(2127.80, fcase(test_vec1, rep(TRUE, 11L), test_vec2, FALSE), as.logical(out_vec))
test(2127.81, fcase(test_vec1, 1+0i, test_vec2, rep(0+0i, 11L)), as.complex(out_vec))
test(2127.82, fcase(test_vec1, rep(1+0i, 11L), test_vec2, 0+0i), as.complex(out_vec))
test(2127.83, fcase(test_vec1, list(rep(1, 11L)), test_vec2, list(0)), list(rep(1, 11L),rep(1, 11L),rep(1, 11L),rep(1, 11L),rep(1, 11L), NULL, 0, 0, 0, 0, 0))
test(2127.84, fcase(test_vec1, list(1), test_vec2, list(rep(0,11L))), list(1,1,1,1,1, NULL, rep(0,11L), rep(0,11L), rep(0,11L), rep(0,11L), rep(0,11L)))
test(2127.85, fcase(test_vec1, list(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), test_vec2, list(0)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0))
test(2127.86, fcase(test_vec1, list(1), test_vec2, list(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0))
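# Illustrative sketch, not a numbered test: fcase() evaluates condition/value pairs left to right
# and returns, per element, the value of the first TRUE condition; elements matching no condition
# become NA unless default= is supplied, as exercised by tests 2127.* above.
local({
  x = c(-2, 0, 3)
  stopifnot(identical(fcase(x < 0, "negative", x > 0, "positive", default = "zero"),
                      c("negative", "zero", "positive")))
})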
# .SDcols accepts a function, #3950
DT = data.table(a=1:5, b=2, c='a')
test(2128.1, names(DT[, .SD, .SDcols=is.numeric]), c('a', 'b'))
test(2128.2, names(DT[, .SD, .SDcols=!is.numeric]), 'c')
test(2128.3, DT[, .SD, .SDcols=function(x) x==1], error='conditions were not met for: [a, b, c]')
test(2128.4, DT[, .SD, .SDcols=function(x) 2L], error='conditions were not met for: [a, b, c]')
test(2128.5, DT[, .SD, .SDcols=function(x) NA], error='conditions were not met for: [a, b, c]')
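# Illustrative sketch, not a numbered test: a function (or its negation with !) passed to .SDcols
# is applied to each column and must return a length-1 non-NA logical; TRUE columns are selected,
# as exercised by tests 2128.* above.
local({
  d = data.table(a = 1:3, b = letters[1:3], c = 3.14)
  stopifnot(identical(names(d[, .SD, .SDcols = is.numeric]), c("a", "c")),
            identical(names(d[, .SD, .SDcols = !is.numeric]), "b"))
})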
# expression columns in rbindlist, #546
A = data.table(c1 = 1, c2 = 'asd', c3 = expression(as.character(Sys.time())))
B = data.table(c1 = 3, c2 = 'qwe', c3 = expression(as.character(Sys.time()+5)))
test(2129, rbind(A,B)$c3, expression(as.character(Sys.time()), as.character(Sys.time()+5)))
# print dims in list-columns, #3671
DT = data.table(
x = 1:2,
y = list(data.table(x=1, y=1),
data.table(x=2, y=2)))
test(2130.01, print(DT), output=c(" x y", "1: 1 <data.table[1x2]>", "2: 2 <data.table[1x2]>"))
DT = data.table(
x = 1:2,
y = list(list(x=1, y=c("yes", "no")),
list(x=2, y=2)))
test(2130.02, print(DT), output=c(" x y", "1: 1 <list[2]>", "2: 2 <list[2]>"))
s4class = setClass("ex_class", slots = list(x="integer", y="character", z="numeric"))
DT = data.table(
x = 1:2,
y = list(s4class(x=1L, y=c("yes", "no"), z=2.5),
s4class(x=2L, y="yes", z=1)))
test(2130.03, print(DT), output=c(" x y", "1: 1 <ex_class[3]>", "2: 2 <ex_class[3]>"))
# .SD from grouping should be unlocked, part of #4159
x = data.table(a=1:3, b=4:6)
test(2131.1, lapply(x[ , list(dt = list(.SD)), by = a]$dt, attr, '.data.table.locked'),
list(NULL, NULL, NULL))
## truly recursive object (contains itself) can cause infinite recursion, #4173
f = function(data) {
x = new.env()
x$a = 2
x$b = x
x
}
dt = data.table(x = rep(1:3, each = 3), y = runif(9))
out = dt[, list(evaluated = list(f(copy(.SD)))), by = x]
test(2131.2, class(out$evaluated[[1L]]), 'environment')
# S4 objects not supported in fifelse and fcase, #4135
class2132 = setClass("class2132", slots=list(x="numeric"))
s1 = class2132(x=20191231)
s2 = class2132(x=20191230)
test(2132.1, fifelse(TRUE, s1, s2), error = "S4 class objects (except nanotime) are not supported.")
test(2132.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanotime) are not supported.")
test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see")
test(2132.4, fcase(FALSE, 1, TRUE, s1), error = "S4 class objects (except nanotime) are not supported. Please see")
rm(s1, s2, class2132)
if (test_xts) {
# keep.rownames in as.data.table.xts() supports a string, #4232
xts = xts::xts(1:10, structure(1:10, class = "Date"))
colnames(xts) = "VALUE"
DT = as.data.table(xts, keep.rownames = "DATE", key = "DATE")
test(2133.1, colnames(DT), c("DATE", "VALUE"))
test(2133.2, key(DT), "DATE")
test(2133.3, as.data.table(xts, keep.rownames = "VALUE"),
error = "Input xts object should not have 'VALUE' column because it would result in duplicate column names. Rename 'VALUE' column in xts or use `keep.rownames` to change the index column name.")
test(2133.4, as.data.table(xts, keep.rownames = character()),
error = "keep.rownames must be length 1")
test(2133.5, as.data.table(xts, keep.rownames = NA_character_),
error = "keep.rownames must not be NA")
}
# friendlier error for common mistake of using := in i instead of j, #4227
DT = data.table(a = 1)
test(2134.1, DT[b := 2], error="Operator := detected in i, the first argument inside DT[...], but is only valid in the second argument, j.")
test(2134.2, DT[DT[mpg_per_cl := mpg/cyl], on='ast'], error="Operator := detected in i, the first argument inside")
# new .NGRP symbol, #1206 #3060
DT = data.table(grp1 = rep(1:4, each = 2L), grp2 = rep(1:2, 4L))
test(2135.1, DT[ , .NGRP], 1L)
test(2135.2, DT[ , .NGRP, by = grp1]$NGRP, rep(4L, 4L))
test(2135.3, DT[ , .NGRP, by = grp2]$NGRP, rep(2L, 2L))
test(2135.4, DT[ , .NGRP, by = .(grp1, grp2)]$NGRP, rep(8L, 8L))
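# Illustrative sketch, not a numbered test: .GRP is the current group's index while .NGRP is the
# total number of groups, constant across groups, as exercised by tests 2135.* above.
local({
  d = data.table(g = c("a", "a", "b", "c"))
  res = d[, .(grp = .GRP, ngrp = .NGRP), by = g]
  stopifnot(identical(res$grp, 1:3), identical(res$ngrp, rep(3L, 3L)))
})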
# Use of (m)get does not re-order when using .SDcols, #4089
dt = data.table(x = 1L, y = 2L, z = 3L)
cols = c('x', 'y')
test(2136, dt[, (cols) := lapply(.SD[get("x") == 1],function(x){x + 2L}), .SDcols = cols ,by = z], data.table(x = 1L + 2L, y = 2L + 2L, z = 3L))
# round() and trunc() results should all be type 'integer' and have class 'ITime', #4207; see the sketch after these tests
start_time = as.POSIXct("2020-01-01 07:00:00", tz='UTC')
l = list(
hour31 = as.ITime(seq(start_time+40, by = "31 min", length.out = 9L)),
hour30 = as.ITime(seq(start_time, by = "30 min", length.out = 9L)),
minute31 = as.ITime(seq(start_time, by = "31 sec", length.out = 9L)),
minute30 = as.ITime(seq(start_time, by = "30 sec", length.out = 9L))
)
ans = list(
a = as.ITime(c("07:00", "08:00", "08:00", "09:00", "09:00", "10:00", "10:00", "11:00", "11:00")),
b = as.ITime(c("07:00", "07:01", "07:01", "07:02", "07:02", "07:03", "07:03", "07:04", "07:04")),
c = as.ITime(c("07:00", "07:00", "08:00", "08:00", "09:00", "09:00", "10:00", "10:00", "11:00")),
d = as.ITime(c("07:00", "07:00", "07:01", "07:01", "07:02", "07:02", "07:03", "07:03", "07:04"))
)
test(2137.01, all(sapply(l, inherits, "ITime")))
test(2137.02, all(sapply(l, typeof) == "integer"))
test(2137.03, which(round(l$hour30, "hours") != ans$a), c(4L, 8L))
test(2137.04, round(l$hour31, "hours"), ans$a)
test(2137.05, which(round(l$minute30, "minutes") != ans$b), c(2L, 6L))
test(2137.06, round(l$minute31, "minutes"), ans$b)
test(2137.07, trunc(l$hour30, "hours"), ans$c)
test(2137.08, trunc(l$hour31, "hours"), ans$c)
test(2137.09, trunc(l$minute30, "minutes"), ans$d)
test(2137.10, trunc(l$minute31, "minutes"), ans$d)
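# Illustrative sketch, not a numbered test: trunc() always moves an ITime down to the start of the
# unit while round() goes to the nearest unit boundary, both returning integer-backed ITime, as
# exercised by tests 2137.* above.
local({
  tt = as.ITime("07:31:45")
  stopifnot(format(round(tt, "hours")) == "08:00:00",
            format(trunc(tt, "hours")) == "07:00:00",
            inherits(round(tt, "hours"), "ITime"))
})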
# Complex to character conversion in rbindlist, #4202
A = data.table(A=complex(real = 1:3, imaginary=c(0, -1, 1)))
B = data.table(A=c('qwe', 'asd'))
test(2138.1, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NA))
test(2138.2, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN))
test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
A = data.table(A=as.complex(rep(NA, 5)))
test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
# all.equal with ignore.row.order=TRUE improperly handled NAs, #4422
d1 = data.table(a=1:2, b=c(1L,NA))
d2 = data.table(a=1:2, b=1:2)
test(2139, all.equal(d1, d2, ignore.row.order=TRUE), "Dataset 'current' has rows not present in 'target'")
# Set allow.cartesian = TRUE when non-equi, #4489
dt = data.table(time = 1:8, v = INT(5,7,6,1,8,4,2,3))
dt[time == 2L, v := 2L]
dt[time == 7L, v := 7L]
test(2140, dt[dt, on=.(time>time, v>v), .N, by=.EACHI], data.table(time=1:8, v=INT(5,2,6,1,8,4,7,3), N=INT(3,5,2,4,0,1,0,0)))
# repeat of test 450 for #4402
test(2141, .Call(Ctest_dt_win_snprintf), NULL)
DT = data.table(a=1:3,b=4:6)
test(2142, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing in item 1")
if (.Platform$OS.type=="windows") local({
x = list(
LC_COLLATE = "Chinese (Simplified)_China.936",
LC_CTYPE = "Chinese (Simplified)_China.936",
LC_MONETARY = "Chinese (Simplified)_China.936",
LC_NUMERIC = "C",
LC_TIME = "Chinese (Simplified)_China.936"
)
x_old = Map(Sys.getlocale, names(x))
invisible(Map(Sys.setlocale, names(x), x))
old = Sys.getenv('LANGUAGE')
Sys.setenv('LANGUAGE' = 'zh_CN')
on.exit({
if (nzchar(old))
Sys.setenv('LANGUAGE' = old)
else
Sys.unsetenv('LANGUAGE')
invisible(Map(Sys.setlocale, names(x_old), x_old))
}, add = TRUE)
# triggered segfault here in #4402, Windows-only under translation.
# test that the argument order changes correctly (the 'item 2' moves to the beginning of the message)
# since the argument order changes in this example (and that was the crash) we don't need to test
# the display of the Chinese characters here. Thanks to @shrektan for all his help on this.
test(2143, rbind(DT,list(c=4L,a=7L)), error="2.*1.*c.*1")
})
# test back to English (the argument order is back to 1,c,2,1)
test(2144, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing in item 1")
# Attempting to join on character(0) shouldn't crash R
A = data.table(A='a')
B = data.table(B='b')
test(2145.1, A[B, on=character(0)], error = "'on' argument should be a named atomic vector")
test(2145.2, merge(A, B, by=character(0) ), error = "non-empty vector of column names for `by` is required.")
test(2145.3, merge(A, B, by.x=character(0), by.y=character(0)), error = "non-empty vector of column names is required")
# Also shouldn't crash when using internal functions
test(2145.4, bmerge(A, B, integer(), integer(), 0, c(FALSE, TRUE), NA, 'all', integer(), FALSE), error = 'icols and xcols must be non-empty')
# nrow(i)==0 by-join, #4364 (broke in dev 1.12.9)
d0 = data.table(id=integer(), n=integer())
d2 = data.table(id=1:2)
test(2146, d2[d0, i.n, on="id", by=.EACHI], data.table(id=integer(), i.n=integer()))
# by=col1:col4 wrong result when key(DT)==c('col1','col4'), #4285
DT = data.table(col1=c(1,1,1), col2=c("a","b","a"), col3=c("A","B","A"), col4=c(2,2,2))
setkey(DT, col1, col4)
test(2147.1, DT[, .N, by = col1:col4], ans<-data.table(col1=1, col2=c("a","b"), col3=c("A","B"), col4=2, N=INT(2,1)))
test(2147.2, DT[, .N, by = c("col1", "col2", "col3", "col4")], ans)
# Result matrix of comparison operators could have its colnames changed by reference, #4323
A = data.table(x=1:2)
B = data.table(x=1:2)
X = A == B
A[, y := 3:4]
test(2148, colnames(X), c('x'))
# shallow() shouldn't take a deep copy of indices, #4311
dt <- data.table(a = c(3, 1))
setindex(dt, a)
dt2 <- shallow(dt)
test(2149.1, address(attr(attr(dt, 'index'), '__a')), address(attr(attr(dt2, 'index'), '__a')))
# Testing possible future regression. shallow() needs to copy the names of indices and keys.
setnames(dt2, 'a', 'A')
test(2149.2, indices(dt), 'a')
setkey(dt, a)
dt2 <- shallow(dt)
setnames(dt2, 'a', 'A')
test(2149.3, key(dt), 'a')
# native reading of [-]?[0-9]+[-][0-9]{2}[-][0-9]{2} dates and
# <date>[T ][0-9]{2}[:][0-9]{2}[:][0-9]{2}(?:[.][0-9]+)?(?:Z|[+-][0-9]{2}[:]?[0-9]{2})? timestamps
dates = as.IDate(c(9610, 19109, 19643, 20385, -1413, 9847, 4116, -11145, -2327, 1760))
times = .POSIXct(tz = 'UTC', c(
937402277.067304, -626563403.382897, -506636228.039861, -2066740882.02417,
-2398617863.28256, -1054008563.60793, 1535199547.55902, 2075410085.54399,
1201364458.72486, 939956943.690777
))
DT = data.table(dates, times)
tmp = tempfile()
## ISO8601 format (%FT%TZ) by default
fwrite(DT, tmp)
test(2150.01, fread(tmp), DT) # defaults for fwrite/fread simple and preserving
fwrite(DT, tmp, dateTimeAs='write.csv') # as write.csv, writes the UTC times as-is not local because the time column has tzone=="UTC", but without the Z marker
oldtz = Sys.getenv("TZ", unset=NA)
Sys.unsetenv("TZ")
test(2150.021, sapply(fread(tmp), typeof), c(dates="integer", times="character")) # as before v1.13.0, datetime with missing timezone read as character
test(2150.022, fread(tmp,tz="UTC"), DT) # user can tell fread to interpret the unmarked datetimes as UTC
Sys.setenv(TZ="UTC")
test(2150.023, fread(tmp), DT) # TZ environment variable is also recognized
if (.Platform$OS.type!="windows") {
Sys.setenv(TZ="") # on Windows this unsets TZ, see ?Sys.setenv
test(2150.024, fread(tmp), DT)
# blank TZ env variable on non-Windows is recognized as UTC consistent with C and R; but R's tz= argument is the opposite and uses "" for local
}
Sys.unsetenv("TZ")
tt = fread(tmp, colClasses=list(POSIXct="times"))
test(2150.025, attr(tt$times, "tzone"), "") # as.POSIXct puts "" on the result (testing the write.csv version here with missing tzone)
# the times will be different though here because as.POSIXct read them as local time.
if (is.na(oldtz)) Sys.unsetenv("TZ") else Sys.setenv(TZ=oldtz)
fwrite(copy(DT)[ , times := format(times, '%FT%T+00:00')], tmp)
test(2150.03, fread(tmp), DT)
fwrite(copy(DT)[ , times := format(times, '%FT%T+0000')], tmp)
test(2150.04, fread(tmp), DT)
fwrite(copy(DT)[ , times := format(times, '%FT%T+0115')], tmp)
test(2150.05, fread(tmp), copy(DT)[ , times := times - 4500])
fwrite(copy(DT)[ , times := format(times, '%FT%T+01')], tmp)
test(2150.06, fread(tmp), copy(DT)[ , times := times - 3600])
## invalid tz specifiers
fwrite(copy(DT)[ , times := format(times, '%FT%T+3600')], tmp)
test(2150.07, fread(tmp), copy(DT)[ , times := format(times, '%FT%T+3600')])
fwrite(copy(DT)[ , times := format(times, '%FT%T+36')], tmp)
test(2150.08, fread(tmp), copy(DT)[ , times := format(times, '%FT%T+36')])
fwrite(copy(DT)[ , times := format(times, '%FT%T+XXX')], tmp)
test(2150.09, fread(tmp), copy(DT)[ , times := format(times, '%FT%T+XXX')])
fwrite(copy(DT)[ , times := format(times, '%FT%T+00:XX')], tmp)
test(2150.10, fread(tmp), copy(DT)[ , times := format(times, '%FT%T+00:XX')])
# allow colClasses='POSIXct' to force YMD column to read as POSIXct
test(2150.11,fread("a,b\n2015-01-01,2015-01-01", colClasses="POSIXct"), # local time for backwards compatibility
data.table(a=as.POSIXct("2015-01-01"), b=as.POSIXct("2015-01-01")))
test(2150.12,fread("a,b\n2015-01-01,2015-01-01", select=c(a="Date",b="POSIXct")), # select colClasses form, for coverage
data.table(a=as.Date("2015-01-01"), b=as.POSIXct("2015-01-01")))
test(2150.13, fread("a,b\n2015-01-01,1.1\n2015-01-02 01:02:03,1.2"), # no Z so as character as before v1.13.0
if (TZnotUTC) data.table(a=c("2015-01-01","2015-01-02 01:02:03"), b=c(1.1, 1.2))
else data.table(a=setattr(c(as.POSIXct("2015-01-01",tz="UTC"), as.POSIXct("2015-01-02 01:02:03",tz="UTC")),"tzone","UTC"), b=c(1.1, 1.2)))
# some rows are date-only, some rows UTC-timestamp --> read the date-only in UTC too
test(2150.14, fread("a,b\n2015-01-01,1.1\n2015-01-02T01:02:03Z,1.2"),
data.table(a = .POSIXct(1420070400 + c(0, 90123), tz="UTC"), b = c(1.1, 1.2)))
old = options(datatable.old.fread.datetime.character=TRUE)
test(2150.15, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03T01:02:03Z"),
data.table(a="2015-01-01", b="2015-01-02", c="2015-01-03T01:02:03Z"))
test(2150.16, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")),
ans<-data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03")))
ans_print = capture.output(print(ans))
options(datatable.old.fread.datetime.character=NULL)
if (TZnotUTC) {
test(2150.17, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")),
ans, output=ans_print)
test(2150.18, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)),
data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print)
} else {
test(2150.19, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")),
ans<-data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03", tz="UTC")), output=ans_print)
test(2150.20, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)),
ans, output=ans_print)
}
options(old)
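# Illustrative sketch, not a numbered test: by default fwrite() writes POSIXct as ISO8601 UTC with
# a trailing Z (%FT%TZ) and fread() reads such Z-marked timestamps back as UTC POSIXct, so a plain
# write/read round trip is lossless, as exercised by tests 2150.* above.
local({
  f = tempfile(); on.exit(unlink(f))
  d = data.table(when = .POSIXct(1577836800, tz = "UTC"))  # 2020-01-01T00:00:00Z
  fwrite(d, f)
  back = fread(f)
  stopifnot(back$when == d$when, identical(attr(back$when, "tzone"), "UTC"))
})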
# 1 is treated as . in dcast formula, #4615
DT = data.table(a = c("s", "x"), survmean = 1:2)
test(2151, dcast(DT, 1 ~ a, value.var='survmean'), data.table('.'='.', s=1L, x=2L, key='.'))
# a list object with a [[ method that returns itself (e.g. person) led to an infinite loop in copy(), #4620
y = person(given='Joel', family='Mossong')
test(2152, copy(y), y)
# .N and .GRP special statics copied correctly when placed as a vector in a list column; part of PR#4655
# see comments in anySpecialStatic() at the top of dogroups.c
# .SD, .I and .BY are covered by previous tests
DT = data.table(x=c(1L,2L,2L), y=1:3)
test(2153.1, DT[, .(list(.N)), by=x], data.table(x=1:2, V1=as.list(1:2)))
test(2153.2, DT[, .(list(.GRP)), by=x], data.table(x=1:2, V1=as.list(1:2)))
test(2153.3, ans<-DT[, .(list(.NGRP)), by=x], data.table(x=1:2, V1=list(2L,2L)))
test(2153.4, address(ans$V1[[1L]]), address(ans$V1[[2L]])) # .NGRP doesn't change group to group so the same object can be referenced many times unlike .N and .GRP
test(2153.5, DT[, .(list(c(0L,.N,0L))), by=x], # c() here will create new object so this is ok anyway; i.e. address(.N) is not present in j's result
data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L))))
# segfault constructing the warning message when no column names are present, #4644
test(2154.1, fread("0.0\n", colClasses="integer"), data.table(V1=0.0),
warning="Attempt to override column 1 of inherent type 'float64' down to 'int32' ignored.*please")
test(2154.2, fread("A\n0.0\n", colClasses="integer"), data.table(A=0.0),
warning="Attempt to override column 1 <<A>> of inherent type 'float64' down to 'int32' ignored.*please")
# asan heap-use-after-free on list columns with attributes on each item, #4746
DT = data.table(A=INT(1,1,2,3,3,4,5,5,6,7),
B=lapply(1:10, function(x) structure(rnorm(90), foo=c(42,12,36))))
for (i in 0:4) test(2155+i/10,
{ gctorture2(step=20); ans=DT[, .(attr(B[[1L]],"foo")[1L]), by=A]; gctorture2(step=0); gc(); ans },
data.table(A=1:7, V1=42)
)