Closes #1878 and #3185 -- .SDcols accepts patterns

Rdatatable · Dec 5, 2018 · b24f490 · b24f490
1 parent a3dafe3
commit b24f490
Show file tree

Hide file tree

Showing 6 changed files with 86 additions and 24 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -12,6 +12,8 @@
 
 4. `NA` in `between`'s `lower` and `upper` are now taken as missing bounds and return `TRUE` rather than `NA`. This is now documented.
 
+5. `.SDcols` in `[.data.table` now accepts `patterns`, similar to the existing usage in `melt.data.table`, for filtering columns according to a pattern, concisely and dynamically, [#1878](https://github.com/Rdatatable/data.table/issues/1878). Thanks to many people for pushing for this and @MichaelChirico for ultimately filing the PR. See `?data.table` for full details and examples. 
+
 #### BUG FIXES
 
 1. Providing an `i` subset expression when attempting to delete a column correctly failed with helpful error, but when the column was missing too created a new column full of `NULL` values, [#3089](https://github.com/Rdatatable/data.table/issues/3089). Thanks to Michael Chirico for reporting.

diff --git a/R/data.table.R b/R/data.table.R
@@ -1016,8 +1016,14 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
             # .SDcols is of the format a:b
             .SDcols = eval(colsub, setattr(as.list(seq_along(x)), 'names', names(x)), parent.frame())
           } else {
-            .SDcols = eval(colsub, parent.frame(), parent.frame())
+            if (is.call(colsub) && colsub[[1L]] == "patterns") {
+              # each pattern gives a new filter condition, intersect the end result
+              .SDcols = Reduce(intersect, do_patterns(colsub, names(x)))
+            } else {
+              .SDcols = eval(colsub, parent.frame(), parent.frame())
+            }
           }
+          if (!length(.SDcols)) return(null.data.table())
           if (anyNA(.SDcols))
             stop(".SDcols missing at the following indices: ", brackify(which(is.na(.SDcols))))
           if (is.logical(.SDcols)) {

diff --git a/R/fmelt.R b/R/fmelt.R
@@ -34,18 +34,7 @@ melt.data.table <- function(data, id.vars, measure.vars, variable.name = "variab
   if (missing(measure.vars)) measure.vars = NULL
   measure.sub = substitute(measure.vars)
   if (is.call(measure.sub) && measure.sub[[1L]] == "patterns") {
-    measure.sub = as.list(measure.sub)[-1L]
-    idx = which(names(measure.sub) == "cols")
-    if (length(idx)) {
-      cols = eval(measure.sub[["cols"]], parent.frame())
-      measure.sub = measure.sub[-idx]
-    } else cols = names(data)
-    pats = lapply(measure.sub, eval, parent.frame())
-    measure.vars = patterns(pats, cols=cols)
-    # replace with lengths when R 3.2.0 dependency arrives
-    if (length(idx <- which(sapply(measure.vars, length) == 0L)))
-      stop('Pattern', if (length(idx) > 1L) 's', ' not found: [',
-           paste(pats[idx], collapse = ', '), ']')
+    measure.vars = do_patterns(measure.sub, names(data))
   }
   if (is.list(measure.vars) && length(measure.vars) > 1L) {
     meas.nm = names(measure.vars)

diff --git a/R/utils.R b/R/utils.R
@@ -93,3 +93,23 @@ brackify = function(x) {
   if (length(x) > 10L) x = c(x[1:10], '...')
   sprintf('[%s]', paste(x, collapse = ', '))
 }
+
+# Shared NSE handling of patterns(...) for melt.data.table (measure.vars) and .SDcols in `[.data.table`
+do_patterns = function(pat_sub, all_cols) {
+  # pat_sub is received unevaluated, as substitute(patterns(...)); drop the function name and keep only its arguments
+  pat_sub = as.list(pat_sub)[-1L]
+  # identify cols = argument if present; when given it replaces all_cols as the cols= value handed to patterns()
+  idx = which(names(pat_sub) == "cols")
+  if (length(idx)) {
+    cols = eval(pat_sub[["cols"]], parent.frame(2L))  # two frames up, i.e. the caller of the function that called do_patterns
+    pat_sub = pat_sub[-idx]
+  } else cols = all_cols
+  pats = lapply(pat_sub, eval, parent.frame(2L))
+  matched = patterns(pats, cols=cols)
+  # stop if any element of matched is empty, reporting the matching entries of pats; replace sapply(., length) with lengths when R 3.2.0 dependency arrives
+  if (length(idx <- which(sapply(matched, length) == 0L)))
+    stop('Pattern', if (length(idx) > 1L) 's', ' not found: [',
+         paste(pats[idx], collapse = ', '), ']')
+
+  return(matched)
+}
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -12468,6 +12468,43 @@ gs = groupingsets(d, j = sum(val), by = c("a", "b", "c"),
                            character()), id=TRUE)
 test(1961, cb, gs)
 
+# #3185 -- .SDcols = integer(0L) completes gracefully
+DT = data.table(a = 1:10)
+test(1962.1, DT[ , .SD, .SDcols = integer(0L)], data.table(NULL))
+test(1962.2, DT[ , .SD, .SDcols = character(0L)], data.table(NULL))
+
+# #1878 -- patterns API in .SDcols
+library(data.table)
+DT = data.table(
+  i = 1:10,
+  c = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"),
+  V1 = c(0.4, -0.1, -1.1, -2.6, -0.1, -1.3, 0.3, -2.1, -0.6, 0.9),
+  V2 = c(-0.1, -2.5, -1, -0.1, -0.5, -0.7, -1, -2.1, 2.7, -1.2),
+  V3 = c(1.1, -1.6, 0.7, 1.6, -1.4, 1, -0.6, 1.2, -0.8, 0.1),
+  V4 = c(1.3, -0.8, 2.3, -0.7, 0.5, 0.5, 0.2, 0.7, -1.4, 0.8),
+  V5 = c(-0.1, -0.5, 1.5, -0.5, 1.9, 0.2, -0.1, -0.7, -1.7, -0.9),
+  V6 = c(0.8, -1.3, -0.7, -0.3, 1.4, 0.7, 0.4, 0.3, -1.6, -1.3),
+  V7 = c(-0.1, 0.8, 0.7, -0.2, -2, 0.5, 0.4, -0.2, -1.2, -0.7),
+  V8 = c(0.7, -1, 1.3, 0.5, 0.2, 0.8, 0.6, -1.4, -2, -0.1),
+  V9 = c(0.2, -0.1, 1.2, -0.5, 1.4, 1, 0.2, 0.7, 0.4, 1.6),
+  V10 = c(0.8, 0.7, -1.2, -0.9, -0.6, 0.4, -2.3, 2.2, 0.5, -1.4)
+)
+
+test(1963.1, DT[ , lapply(.SD, sum), .SDcols = patterns('^V')],
+     data.table(V1 = -6.3, V2 = -6.5, V3 = 1.3, V4 = 3.4, V5 = -0.9,
+                V6 = -1.6, V7 = -2, V8 = -0.4, V9 = 6.1, V10 = -1.8))
+# multiple patterns --> intersection of patterns
+test(1963.2, DT[ , lapply(.SD, sum), .SDcols = patterns('^V[0-4]', '^V[5-9]')],
+     data.table(NULL))
+test(1963.3, DT[ , lapply(.SD, sum), .SDcols = patterns('^V[02468]', '^V[48]')],
+     data.table(V4 = 3.4, V8 = -0.4))
+
+# also with !/- inversion
+test(1963.4, DT[ , lapply(.SD, sum), .SDcols = !patterns('^c|i')],
+     data.table(V1 = -6.3, V2 = -6.5, V3 = 1.3, V4 = 3.4, V5 = -0.9,
+                V6 = -1.6, V7 = -2, V8 = -0.4, V9 = 6.1, V10 = -1.8))
+
+
 
 ###################################
 #  Add new tests above this line  #

diff --git a/man/data.table.Rd b/man/data.table.Rd
@@ -141,7 +141,13 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac
 
     \item{.SDcols}{ Specifies the columns of \code{x} to be included in the special symbol \code{\link{.SD}} which stands for \code{Subset of data.table}. May be character column names or numeric positions. This is useful for speed when applying a function through a subset of (possibly very many) columns; e.g., \code{DT[, lapply(.SD, sum), by="x,y", .SDcols=301:350]}.
 
-    For convenient interactive use, the form \code{startcol:endcol} is also allowed (as in \code{by}), e.g., \code{DT[, lapply(.SD, sum), by=x:y, .SDcols=a:f]}
+    For convenient interactive use, the form \code{startcol:endcol} is also allowed (as in \code{by}), e.g., \code{DT[, lapply(.SD, sum), by=x:y, .SDcols=a:f]}.
+
+    Inversion (column dropping instead of keeping) can be accomplished by prepending the argument with \code{!} or \code{-} (there's no difference between these), e.g. \code{.SDcols = !c('x', 'y')}.
+
+    Finally, you can filter columns to include in \code{.SD} according to regular expressions via \code{.SDcols=patterns(regex1, regex2, ...)}. The included columns will be the \emph{intersection} of the columns identified by each pattern; pattern unions can easily be specified with \code{|} in a regex. You can also invert a pattern as usual with \code{.SDcols = !patterns(...)}.
+
+    Empty \code{.SDcols} will return an empty \code{data.table}.
 }
   \item{verbose}{ \code{TRUE} turns on status and information messages to the console. Turn this on by default using \code{options(datatable.verbose=TRUE)}. The quantity and types of verbosity may be expanded in future.
 
@@ -357,16 +363,18 @@ kDT[!.("a")]                          # not join
 kDT[!"a"]                             # same
 
 # more on special symbols, see also ?"special-symbols"
-DT[.N]                                # last row
-DT[, .N]                              # total number of rows in DT
-DT[, .N, by=x]                        # number of rows in each group
-DT[, .SD, .SDcols=x:y]                # select columns 'x' and 'y'
-DT[, .SD[1]]                          # first row of all columns
-DT[, .SD[1], by=x]                    # first row of 'y' and 'v' for each group in 'x'
-DT[, c(.N, lapply(.SD, sum)), by=x]   # get rows *and* sum columns 'v' and 'y' by group
-DT[, .I[1], by=x]                     # row number in DT corresponding to each group
-DT[, grp := .GRP, by=x]               # add a group counter column
-X[, DT[.BY, y, on="x"], by=x]         # join within each group
+DT[.N]                                  # last row
+DT[, .N]                                # total number of rows in DT
+DT[, .N, by=x]                          # number of rows in each group
+DT[, .SD, .SDcols=x:y]                  # select columns 'x' through 'y'
+DT[ , .SD, .SDcols = !x:y]              # drop columns 'x' through 'y'
+DT[ , .SD, .SDcols = patterns('^[xv]')] # select columns matching '^x' or '^v'
+DT[, .SD[1]]                            # first row of all columns
+DT[, .SD[1], by=x]                      # first row of 'y' and 'v' for each group in 'x'
+DT[, c(.N, lapply(.SD, sum)), by=x]     # get rows *and* sum columns 'v' and 'y' by group
+DT[, .I[1], by=x]                       # row number in DT corresponding to each group
+DT[, grp := .GRP, by=x]                 # add a group counter column
+X[, DT[.BY, y, on="x"], by=x]           # join within each group
 
 # add/update/delete by reference (see ?assign)
 print(DT[, z:=42L])                   # add new column by reference