Join GitHub today
GitHub is home to over 50 million developers working together to host and review code, manage projects, and build software together.
Sign up
[R-Forge #5754] GForce functions and row- + col-wise operations on .SD #523
Comments
|
Benchmarks for min and max:
Data:
require(data.table)
set.seed(2L)
k = 1e4
n = 1e6
is_na = TRUE
dt <- setDT(lapply(1:100, function(x) sample(c(1:k, if(is_na) NA_integer_), n, TRUE)))
min, no na.rm
# with GForce (default)
options(datatable.optimize=2L)
system.time(ans1 <- dt[, lapply(.SD, min), by=V1])
# user system elapsed
# 0.533 0.012 0.547
# without
options(datatable.optimize=1L)
system.time(ans2 <- dt[, lapply(.SD, min), by=V1])
# user system elapsed
# 4.698 0.025 4.761
identical(ans1, ans2) # [1] TRUE
min, with na.rm
# with GForce (default)
options(datatable.optimize=2L)
system.time(ans1 <- dt[, lapply(.SD, min, na.rm=TRUE), by=V1])
# user system elapsed
# 0.481 0.016 0.568
# without
options(datatable.optimize=1L)
system.time(ans2 <- dt[, lapply(.SD, function(x) min(x, na.rm=TRUE)), by=V1])
# user system elapsed
# 5.623 0.023 5.791
identical(ans1, ans2) # [1] TRUE
max, no na.rm
# with GForce (default)
options(datatable.optimize=2L)
system.time(ans1 <- dt[, lapply(.SD, max), by=V1])
# user system elapsed
# 0.536 0.014 0.585
# without
options(datatable.optimize=1L)
system.time(ans2 <- dt[, lapply(.SD, max), by=V1])
# user system elapsed
# 5.069 0.029 5.351
identical(ans1, ans2) # [1] TRUE
max, with na.rm
# with GForce (default)
options(datatable.optimize=2L)
system.time(ans1 <- dt[, lapply(.SD, max, na.rm=TRUE), by=V1])
# user system elapsed
# 0.517 0.011 0.546
# without
options(datatable.optimize=1L)
system.time(ans2 <- dt[, lapply(.SD, function(x) max(x, na.rm=TRUE)), by=V1])
# user system elapsed
# 5.862 0.025 6.064
identical(ans1, ans2) # [1] TRUE
And here's a comparison putting everything together:
options(datatable.optimize=2L)
system.time(ans1 <- dt[, c(lapply(.SD, sum), lapply(.SD, mean),
lapply(.SD, min), lapply(.SD, max), .N), by=V1])
# user system elapsed
# 2.463 0.018 2.575
options(datatable.optimize=1L)
system.time(ans2 <- dt[, c(lapply(.SD, sum), lapply(.SD, mean),
lapply(.SD, min), lapply(.SD, max), .N), by=V1])
# user system elapsed
# 11.840 0.034 11.987
identical(ans1, ans2) # [1] TRUE |
|
Ideally, quantile, cov & corr would be great. |
|
|
|
without
|
|
Benchmarks for head(.SD, 1):
options(datatable.optimize=Inf)
system.time(ans1 <- dt[, head(.SD, 1), by=V1]) # gforce optimised
# 0.03 seconds
options(datatable.optimize=1)
system.time(ans2 <- dt[, head(.SD, 1), by=V1]) # level-1 optimisation
# 10 seconds
options(datatable.optimize=0)
system.time(ans3 <- dt[, head(.SD, 1), by=V1]) # no optimisation
# 45 seconds
# restore optimisation
options(datatable.optimize=Inf)
works with subsets in |
|
Benchmark for .SD[2]:
options(datatable.optimize=Inf)
system.time(ans1 <- dt[, .SD[2], by=V1]) # gforce optimised
# 0.03 seconds
options(datatable.optimize=1L)
system.time(ans2 <- dt[, .SD[2], by=V1]) # level-1 optimisation
# 1.75 seconds
options(datatable.optimize=0L)
system.time(ans3 <- dt[, .SD[2], by=V1]) # no optimisation
# 41 seconds
# restore optimisation
options(datatable.optimize=Inf)
works with subsets in |
|
Any plans for optimization of |
|
var
# with GForce (default)
system.time(ans1 <- dt[, lapply(.SD, var, na.rm=TRUE), by=V1])
# user system elapsed
# 1.273 0.010 1.294
# without
system.time(ans2 <- dt[, lapply(.SD, stats::var, na.rm=TRUE), by=V1])
# user system elapsed
# 27.106 0.369 27.635
all.equal(ans1, ans2) # [1] TRUE
sd
# with GForce (default)
system.time(ans1 <- dt[, lapply(.SD, sd, na.rm=TRUE), by=V1])
# user system elapsed
# 1.227 0.007 1.242
# without
system.time(ans2 <- dt[, lapply(.SD, stats::sd, na.rm=TRUE), by=V1])
# user system elapsed
# 28.428 0.406 29.172
all.equal(ans1, ans2) # [1] TRUE |
|
Could |
|
This may be a bad fit for GForce, but it would be nice to have an optimized version of |
|
How about
Seems |
|
Any idea if
|
|
@kdkavanagh Fyi, for that you can create a column with |
Submitted by: Arun ; Assigned to: Nobody; R-Forge link
For GForce
`[` for length-1 subsets
When GForce is upgraded to work with `:=`:
Utility function
It should return a list. That is,