Do's and Don'ts

Matt Dowle edited this page Aug 26, 2014 · 3 revisions

1. Don't subset .SD, use the columns directly or use .SDcols

NB : Timings updated with v1.8.2 (improves 3rd case dramatically)

n=10000
DT = data.table(grp=1:n,name=as.character(as.hexmode(1:n)),x=rnorm(10*n),y=rnorm(10*n))
setkey(DT,grp)
system.time(ans1<-DT[,lapply(.SD[,list(x,y)],sum),by=grp])   
#   user  system elapsed 
# 31.130   0.088  31.288    # bad
system.time(ans2<-DT[,lapply(list(x,y),sum),by=grp])         
#   user  system elapsed 
#  0.284   0.004   0.291    # good
setnames(ans2,names(ans1))
identical(ans1,ans2)
# [1] TRUE
system.time(ans3<-DT[,lapply(.SD,sum),by=grp,.SDcols=c("x","y")])
#   user  system elapsed
#  0.080   0.004   0.085    # even better (prior to v1.8.2 was slower and not recommended, no longer)
identical(ans1,ans3)
# [1] TRUE
tables()
#      NAME    NROW MB COLS         KEY
# [1,] ans1  10,000 1  grp,x,y      grp
# [2,] ans2  10,000 1  grp,x,y      grp
# [3,] ans3  10,000 1  grp,x,y      grp
# [4,] DT   100,000 3  grp,name,x,y grp
# Total: 6MB

NB: The motivation for lapply through a subset of .SD is when there are non-group column(s) in DT (such as name above). Otherwise you could just do DT[,lapply(.SD,sum),by=grp]. Subsetting .SD (as in ans1) is so much slower because a new copy of .SD for those columns is created for each and every group.

2. For speed use := by group, don't transform() by group or cbind() afterwards

As from v1.8.2, := by group has been implemented.

library(data.table)
n=10000
DT = data.table(grp=1:n,x=rnorm(10*n),y=rnorm(10*n))
setkey(DT,grp)
system.time(ans1 <- DT[,transform(.SD,x2=x/sum(x),y2=y/sum(y)),by=grp])
# user  system elapsed 
# 5.46    0.00    5.48  # slow
head(ans1,3)
#    grp          x           y        x2         y2
# 1:   1 -0.5848814 -0.41560829 0.6241268  0.5695575
# 2:   1 -0.6314059 -0.49076645 0.6737731  0.6725557
# 3:   1 -1.7694071  0.08860505 1.8881340 -0.1214260
system.time(tt <- DT[,list(x2=x/sum(x),y2=y/sum(y)),by=grp])
# user  system elapsed 
# 0.02    0.00    0.02 (274 times faster!!!)

head(tt,3)
#    grp        x2         y2
# 1:   1 0.6241268  0.5695575
# 2:   1 0.6737731  0.6725557
# 3:   1 1.8881340 -0.1214260
system.time(ans2 <- cbind(DT,tt[,list(x2,y2)]))
# user  system elapsed 
# 0.05    0.00    0.05    # very fast to add afterwards in bulk
head(ans2,3)
#     grp          x           y        x2         y2
# 1:   1 -0.5848814 -0.41560829 0.6241268  0.5695575
# 2:   1 -0.6314059 -0.49076645 0.6737731  0.6725557
# 3:   1 -1.7694071  0.08860505 1.8881340 -0.1214260
setkey(ans2,grp)
identical(ans1,ans2)
[1] TRUE
system.time(DT[, c('x2', 'y2') := list(x / sum(x), y / sum(y)), by = grp])
# user  system elapsed 
# 0.07    0.00    0.07 # equivalent to cbind afterwards approach, but more memory efficient

# now DT has been updated 
identical(ans1, DT)
# [1] TRUE
# remove new columns to show different approach
DT[, c('x2', 'y2') := NULL]

system.time(DT[, `:=`(x2=x / sum(x),y2= y / sum(y)), by = grp])
# user  system elapsed 
#0.04    0.00    0.05 # this is slightly faster 
identical(ans1, DT)
# [1] TRUE

3. Method dispatch takes time.

When called repeatedly, it may be surprising to learn that mean() is inefficient compared to sum(). mean() is S3 generic which means it takes time to dispatch to one of methods(mean), usually mean.default(). Also, mean.default() contains a few lines of R before finally calling .Internal(mean(x)). sum() on the other hand is a primitive function.

For this reason data.table will optimize calls to mean() automatically.
TO DO. Revision needed here for v1.9.2

library(data.table)
n<-100000
DT<-data.table(grp1=sample(1:750, n, replace=TRUE),
                 grp2=sample(1:750, n, replace=TRUE),
                 x=rnorm(n),
                 y=rnorm(n))
system.time(ans1<-DT[,list(mean(x),mean(y)),by=list(grp1,grp2), verbose = TRUE])
# Detected that j uses these columns: x,y 
# Optimized j from 'list(mean(x), mean(y))' to 'list(.External(Cfastmean, x, FALSE), .External(Cfastmean, y, FALSE))'
# Starting dogroups ... done dogroups in 0.2 secs
# user  system elapsed 
# 0.25    0.00    0.25 

# without optimization
options(datatable.optimize = 0)
system.time(ans2<-DT[,list(mean(x),mean(y)),by=list(grp1,grp2),verbose = TRUE])
# Finding groups (bysameorder=FALSE) ... done in 0.04secs. bysameorder=FALSE and o__ is length 100000
# Detected that j uses these columns: x,y 
# Starting dogroups ... done dogroups in 1.61 secs
# user  system elapsed 
# 1.64    0.00    1.65    # 6.6 times faster
identical(ans1,ans2)
# [1] TRUE

median() is another S3 generic with overhead that can be sped up a lot using a similar technique. However, sum(), length(), var() and many other functions don't suffer the same performance issue :

system.time(DT[,list(sum(x),sum(y)),by=list(grp1,grp2)])
#   user  system elapsed 
#   0.19    0.00    0.19    # not much slower than optimized mean

4. Don't coerce j to list, use list() directly

> n=10000
> DT = data.table(grp=1:n,x=rnorm(10*n),y=rnorm(10*n))
> setkey(DT,grp)
> system.time(ans1<-DT[,as.list(cbind(sum(x),sum(y))),by=grp])
   user  system elapsed 
  1.472   0.016   1.494    # bad
> system.time(ans2<-DT[,data.table(sum(x),sum(y)),by=grp])
   user  system elapsed 
 18.494   0.308  18.914    # worse
> system.time(ans3<-DT[,data.frame(sum(x),sum(y)),by=grp])
   user  system elapsed 
 40.502   1.136  41.807    # terrible
> colnames(ans3)=colnames(ans1)
> system.time(ans4<-DT[,list(sum(x),sum(y)),by=grp])
   user  system elapsed 
  0.176   0.004   0.181    # best
> identical(ans1,ans2)
[1] TRUE
> identical(ans1,ans3)
[1] TRUE
> identical(ans1,ans4)
[1] TRUE

5. Don't use colSums, use lapply on .SD

> n=10000
> DT = data.table(grp=1:n,x=rnorm(10*n),y=rnorm(10*n))
> setkey(DT,grp)
> system.time(ans1<-DT[,as.list(colSums(.SD)),by=grp])
    user  system elapsed 
  1.296   0.000   1.298 #bad
> system.time(ans2<-DT[,lapply(.SD,sum),by=grp])
  user  system elapsed 
  0.020   0.000   0.021 #best
> identical(ans1,ans2)
[1] TRUE
> system.time(ans3<-DT[,list(x=sum(x),y=sum(y)),by=grp])
    user  system elapsed 
  0.020   0.000   0.022  #similar but longer to write
> identical(ans1,ans3)
[1] TRUE