Skip to content

Commit

Permalink
Closes #1878 and #3185 -- .SDcols accepts patterns
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Chirico committed Dec 5, 2018
1 parent a3dafe3 commit b24f490
Show file tree
Hide file tree
Showing 6 changed files with 86 additions and 24 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

4. `NA` in `between`'s `lower` and `upper` are now taken as missing bounds and return `TRUE` rather than `NA`. This is now documented.

5. `.SDcols` in `[.data.table` now accepts `patterns`, similar to the existing usage in `melt.data.table`, for filtering columns according to a pattern, concisely and dynamically, [#1878](https://github.com/Rdatatable/data.table/issues/1878). Thanks to many people for pushing for this and @MichaelChirico for ultimately filing the PR. See `?data.table` for full details and examples.

#### BUG FIXES

1. Providing an `i` subset expression when attempting to delete a column correctly failed with helpful error, but when the column was missing too created a new column full of `NULL` values, [#3089](https://github.com/Rdatatable/data.table/issues/3089). Thanks to Michael Chirico for reporting.
Expand Down
8 changes: 7 additions & 1 deletion R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -1016,8 +1016,14 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
# .SDcols is of the format a:b
.SDcols = eval(colsub, setattr(as.list(seq_along(x)), 'names', names(x)), parent.frame())
} else {
.SDcols = eval(colsub, parent.frame(), parent.frame())
if (is.call(colsub) && colsub[[1L]] == "patterns") {
# each pattern gives a new filter condition, intersect the end result
.SDcols = Reduce(intersect, do_patterns(colsub, names(x)))
} else {
.SDcols = eval(colsub, parent.frame(), parent.frame())
}
}
if (!length(.SDcols)) return(null.data.table())
if (anyNA(.SDcols))
stop(".SDcols missing at the following indices: ", brackify(which(is.na(.SDcols))))
if (is.logical(.SDcols)) {
Expand Down
13 changes: 1 addition & 12 deletions R/fmelt.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,7 @@ melt.data.table <- function(data, id.vars, measure.vars, variable.name = "variab
if (missing(measure.vars)) measure.vars = NULL
measure.sub = substitute(measure.vars)
if (is.call(measure.sub) && measure.sub[[1L]] == "patterns") {
measure.sub = as.list(measure.sub)[-1L]
idx = which(names(measure.sub) == "cols")
if (length(idx)) {
cols = eval(measure.sub[["cols"]], parent.frame())
measure.sub = measure.sub[-idx]
} else cols = names(data)
pats = lapply(measure.sub, eval, parent.frame())
measure.vars = patterns(pats, cols=cols)
# replace with lengths when R 3.2.0 dependency arrives
if (length(idx <- which(sapply(measure.vars, length) == 0L)))
stop('Pattern', if (length(idx) > 1L) 's', ' not found: [',
paste(pats[idx], collapse = ', '), ']')
measure.vars = do_patterns(measure.sub, names(data))
}
if (is.list(measure.vars) && length(measure.vars) > 1L) {
meas.nm = names(measure.vars)
Expand Down
20 changes: 20 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,23 @@ brackify = function(x) {
if (length(x) > 10L) x = c(x[1:10], '...')
sprintf('[%s]', paste(x, collapse = ', '))
}

# Resolve a captured `patterns(...)` call into its matches.
# Shared by the NSE handling in melt.data.table (measure.vars) and in
# `[.data.table` (.SDcols).
#
# pat_sub:  the unevaluated call, i.e. substitute(patterns(...)); it may carry
#           a named `cols=` argument giving the names to match against.
# all_cols: names to match against when no `cols=` argument was supplied.
#
# Returns the result of patterns() on the evaluated arguments (treated here as
# one element per pattern); stops with an error if any element is empty.
#
# NOTE: the user's expressions are evaluated in parent.frame(2L), i.e. the
# caller of whichever function called do_patterns. It must therefore be called
# directly from the function that captured the user's expression.
do_patterns = function(pat_sub, all_cols) {
  # frame in which the user wrote patterns(...); looked up once and reused
  user_env = parent.frame(2L)
  # received as substitute(patterns(...)); drop the leading function name
  pat_sub = as.list(pat_sub)[-1L]
  # identify cols = argument if present
  cols_idx = which(names(pat_sub) == "cols")
  if (length(cols_idx)) {
    cols = eval(pat_sub[["cols"]], user_env)
    pat_sub = pat_sub[-cols_idx]
  } else cols = all_cols
  pats = lapply(pat_sub, eval, user_env)
  matched = patterns(pats, cols=cols)
  # vapply (not sapply) so the counts are always an integer vector, even for
  # empty input; replace with lengths() when R 3.2.0 dependency arrives
  not_found = which(vapply(matched, length, 0L) == 0L)
  if (length(not_found))
    stop('Pattern', if (length(not_found) > 1L) 's', ' not found: [',
         paste(pats[not_found], collapse = ', '), ']')

  matched
}
37 changes: 37 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -12468,6 +12468,43 @@ gs = groupingsets(d, j = sum(val), by = c("a", "b", "c"),
character()), id=TRUE)
test(1961, cb, gs)

# #3185 -- .SDcols = integer(0L) completes gracefully
DT = data.table(a = 1:10)
test(1962.1, DT[ , .SD, .SDcols = integer(0L)], data.table(NULL))
test(1962.2, DT[ , .SD, .SDcols = character(0L)], data.table(NULL))

# #1878 -- patterns API in .SDcols
# (no library(data.table) call here: data.table() is already used by the tests
#  above, so the package is available without attaching it again mid-file)
DT = data.table(
  i = 1:10,
  c = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"),
  V1 = c(0.4, -0.1, -1.1, -2.6, -0.1, -1.3, 0.3, -2.1, -0.6, 0.9),
  V2 = c(-0.1, -2.5, -1, -0.1, -0.5, -0.7, -1, -2.1, 2.7, -1.2),
  V3 = c(1.1, -1.6, 0.7, 1.6, -1.4, 1, -0.6, 1.2, -0.8, 0.1),
  V4 = c(1.3, -0.8, 2.3, -0.7, 0.5, 0.5, 0.2, 0.7, -1.4, 0.8),
  V5 = c(-0.1, -0.5, 1.5, -0.5, 1.9, 0.2, -0.1, -0.7, -1.7, -0.9),
  V6 = c(0.8, -1.3, -0.7, -0.3, 1.4, 0.7, 0.4, 0.3, -1.6, -1.3),
  V7 = c(-0.1, 0.8, 0.7, -0.2, -2, 0.5, 0.4, -0.2, -1.2, -0.7),
  V8 = c(0.7, -1, 1.3, 0.5, 0.2, 0.8, 0.6, -1.4, -2, -0.1),
  V9 = c(0.2, -0.1, 1.2, -0.5, 1.4, 1, 0.2, 0.7, 0.4, 1.6),
  V10 = c(0.8, 0.7, -1.2, -0.9, -0.6, 0.4, -2.3, 2.2, 0.5, -1.4)
)

# a single pattern selects every column it matches
test(1963.1, DT[ , lapply(.SD, sum), .SDcols = patterns('^V')],
     data.table(V1 = -6.3, V2 = -6.5, V3 = 1.3, V4 = 3.4, V5 = -0.9,
                V6 = -1.6, V7 = -2, V8 = -0.4, V9 = 6.1, V10 = -1.8))
# multiple patterns --> intersection of patterns
# disjoint patterns leave no columns, giving an empty data.table
test(1963.2, DT[ , lapply(.SD, sum), .SDcols = patterns('^V[0-4]', '^V[5-9]')],
     data.table(NULL))
# only V4 and V8 match both patterns
test(1963.3, DT[ , lapply(.SD, sum), .SDcols = patterns('^V[02468]', '^V[48]')],
     data.table(V4 = 3.4, V8 = -0.4))

# also with !/- inversion
# dropping columns 'c' and 'i' leaves V1..V10
test(1963.4, DT[ , lapply(.SD, sum), .SDcols = !patterns('^c|i')],
     data.table(V1 = -6.3, V2 = -6.5, V3 = 1.3, V4 = 3.4, V5 = -0.9,
                V6 = -1.6, V7 = -2, V8 = -0.4, V9 = 6.1, V10 = -1.8))



###################################
# Add new tests above this line #
Expand Down
30 changes: 19 additions & 11 deletions man/data.table.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,13 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac
\item{.SDcols}{ Specifies the columns of \code{x} to be included in the special symbol \code{\link{.SD}} which stands for \code{Subset of data.table}. May be character column names or numeric positions. This is useful for speed when applying a function through a subset of (possibly very many) columns; e.g., \code{DT[, lapply(.SD, sum), by="x,y", .SDcols=301:350]}.
For convenient interactive use, the form \code{startcol:endcol} is also allowed (as in \code{by}), e.g., \code{DT[, lapply(.SD, sum), by=x:y, .SDcols=a:f]}
For convenient interactive use, the form \code{startcol:endcol} is also allowed (as in \code{by}), e.g., \code{DT[, lapply(.SD, sum), by=x:y, .SDcols=a:f]}.
Inversion (column dropping instead of keeping) can be accomplished by prepending the argument with \code{!} or \code{-} (there's no difference between these), e.g. \code{.SDcols = !c('x', 'y')}.

Finally, you can filter columns to include in \code{.SD} according to regular expressions via \code{.SDcols=patterns(regex1, regex2, ...)}. The included columns will be the \emph{intersection} of the columns identified by each pattern; pattern unions can easily be specified with \code{|} in a regex. You can also invert a pattern as usual with \code{.SDcols = !patterns(...)}.

Empty \code{.SDcols} will return an empty \code{data.table}.
}
\item{verbose}{ \code{TRUE} turns on status and information messages to the console. Turn this on by default using \code{options(datatable.verbose=TRUE)}. The quantity and types of verbosity may be expanded in future.

Expand Down Expand Up @@ -357,16 +363,18 @@ kDT[!.("a")] # not join
kDT[!"a"] # same

# more on special symbols, see also ?"special-symbols"
DT[.N] # last row
DT[, .N] # total number of rows in DT
DT[, .N, by=x] # number of rows in each group
DT[, .SD, .SDcols=x:y] # select columns 'x' and 'y'
DT[, .SD[1]] # first row of all columns
DT[, .SD[1], by=x] # first row of 'y' and 'v' for each group in 'x'
DT[, c(.N, lapply(.SD, sum)), by=x] # get rows *and* sum columns 'v' and 'y' by group
DT[, .I[1], by=x] # row number in DT corresponding to each group
DT[, grp := .GRP, by=x] # add a group counter column
X[, DT[.BY, y, on="x"], by=x] # join within each group
DT[.N] # last row
DT[, .N] # total number of rows in DT
DT[, .N, by=x] # number of rows in each group
DT[, .SD, .SDcols=x:y] # select columns 'x' through 'y'
DT[ , .SD, .SDcols = !x:y] # drop columns 'x' through 'y'
DT[ , .SD, .SDcols = patterns('^[xv]')] # select columns matching '^x' or '^v'
DT[, .SD[1]] # first row of all columns
DT[, .SD[1], by=x] # first row of 'y' and 'v' for each group in 'x'
DT[, c(.N, lapply(.SD, sum)), by=x] # get rows *and* sum columns 'v' and 'y' by group
DT[, .I[1], by=x] # row number in DT corresponding to each group
DT[, grp := .GRP, by=x] # add a group counter column
X[, DT[.BY, y, on="x"], by=x] # join within each group

# add/update/delete by reference (see ?assign)
print(DT[, z:=42L]) # add new column by reference
Expand Down

0 comments on commit b24f490

Please sign in to comment.