-
Notifications
You must be signed in to change notification settings - Fork 969
/
fcast.R
255 lines (247 loc) · 12.5 KB
/
fcast.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
guess = function(x) {
if ("value" %chin% names(x))
return("value")
if ("(all)" %chin% names(x))
return("(all)")
var = names(x)[ncol(x)]
messagef("Using '%s' as value column. Use 'value.var' to override", var)
return(var)
}
dcast <- function(
data, formula, fun.aggregate = NULL, ..., margins = NULL,
subset = NULL, fill = NULL, value.var = guess(data)
) {
if (is.data.table(data)) UseMethod("dcast", data)
# nocov start
else {
data_name = deparse(substitute(data))
ns = tryCatch(getNamespace("reshape2"), error=function(e)
stopf("The %1$s generic in data.table has been passed a %2$s, but data.table::%1$s currently only has a method for data.tables. Please confirm your input is a data.table, with setDT(%3$s) or as.data.table(%3$s). If you intend to use a method from reshape2, try installing that package first, but do note that reshape2 is superseded and is no longer actively developed.", "dcast", class(data)[1L], data_name))
warningf("The %1$s generic in data.table has been passed a %2$s and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is superseded and is no longer actively developed, and this redirection is now deprecated. Please do this redirection yourself like reshape2::%1$s(%3$s). In the next version, this warning will become an error.", "dcast", class(data)[1L], data_name)
ns$dcast(data, formula, fun.aggregate = fun.aggregate, ..., margins = margins,
subset = subset, fill = fill, value.var = value.var)
}
# nocov end
}
check_formula = function(formula, varnames, valnames, value.var.in.LHSdots, value.var.in.RHSdots) {
if (is.character(formula)) formula = as.formula(formula)
if (!inherits(formula, "formula") || length(formula) != 3L)
stopf("Invalid formula. Cast formula should be of the form LHS ~ RHS, for e.g., a + b ~ c.") # nocov; couldn't find a way to construct a test formula with length!=3L
vars = all.vars(formula)
vars = vars[!vars %chin% c(".", "...")]
allvars = c(vars, valnames)
vars = setdiff(vars, valnames)
allvarsBL = list(vars, allvars)
if (any(allvars %chin% varnames[duplicated(varnames)]))
stopf('data.table to cast must have unique column names')
if (value.var.in.LHSdots == value.var.in.RHSdots && isFALSE(value.var.in.LHSdots)) {
deparse_formula(as.list(formula)[-1L], varnames, allvars)
}
else {
split_deparsing(as.list(formula)[-1L], varnames, allvarsBL[[2L - value.var.in.LHSdots]], allvarsBL[[2L - value.var.in.RHSdots]])
}
}
split_deparsing = function(expr, varnames, LHSallvars, RHSallvars) {
expr = list(list(expr[[1L]], LHSallvars), list(expr[[2L]], c(RHSallvars))) # assume expr[[1L]] is LHS and expr[[2L]] is RHS
lvars = lapply(expr, function(thisList) {
this = thisList[[1L]]
allvars = thisList[[2L]]
if (!is.language(this)) return(NULL)
if (this %iscall% '+') return(unlist(deparse_formula(this[-1L], varnames, allvars)))
if (is.name(this) && this == quote(`...`)) {
subvars = setdiff(varnames, allvars)
return(lapply(subvars, as.name))
}
this
})
lvars = lapply(lvars, function(x) if (length(x) && !is.list(x)) list(x) else tryCatch(unique(x), error = function(e) x))
}
deparse_formula = function(expr, varnames, allvars) {
lvars = lapply(expr, function(this) {
if (!is.language(this)) return(NULL)
if (this %iscall% '+') return(unlist(deparse_formula(this[-1L], varnames, allvars)))
if (is.name(this) && this == quote(`...`)) {
subvars = setdiff(varnames, allvars)
return(lapply(subvars, as.name))
}
this
})
lvars = lapply(lvars, function(x) if (length(x) && !is.list(x)) list(x) else x)
}
value_vars = function(value.var, varnames) {
if (is.character(value.var))
value.var = list(value.var)
value.var = lapply(value.var, unique)
valnames = unique(unlist(value.var))
iswrong = which(!valnames %chin% varnames)
if (length(iswrong))
stopf("value.var values %s are not found in 'data'.", brackify(value.var[iswrong]))
value.var
}
aggregate_funs = function(funs, vals, sep="_", ...) {
if (funs %iscall% 'eval')
funs = eval(funs[[2L]], parent.frame(2L), parent.frame(2L))
if (funs %iscall% c('c', 'list')) {
funs = lapply(as.list(funs)[-1L], function(x) {
if (x %iscall% c('c', 'list')) as.list(x)[-1L] else x
})
} else funs = eval(funs, parent.frame(2L), parent.frame(2L))
if(is.function(funs)) funs = list(funs) # needed for cases as shown in test#1700.1
if (length(funs) != length(vals)) {
if (length(vals) == 1L)
vals = replicate(length(funs), vals)
else stopf("When 'fun.aggregate' and 'value.var' are both lists, 'value.var' must be either of length =1 or =length(fun.aggregate).")
}
only_one_fun = length(unlist(funs)) == 1L
dots = list(...)
construct_funs = function(fun, nm, val) {
ans = vector("list", length(fun)*length(val))
nms = vector("character", length(ans))
k = 1L
for (i in fun) {
for (j in val) {
expr = list(i, as.name(j))
if (length(dots))
expr = c(expr, dots)
ans[[k]] = as.call(expr)
# changed order of arguments here, #1153
nms[k] = if (only_one_fun) j else paste(j, nm, sep=sep)
k = k+1L;
}
}
setattr(ans, 'names', nms)
}
ans = lapply(seq_along(funs), function(i) {
nm = names(funs[i])
if (is.null(nm) || !nzchar(nm)) {
nm = all.names(funs[[i]], max.names=1L, functions=TRUE)
}
if (!length(nm)) nm <- paste0("fun", i)
construct_funs(funs[i], nm, vals[[i]])
})
as.call(c(quote(list), unlist(ans)))
}
dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value.var = guess(data), verbose = getOption("datatable.verbose"), value.var.in.dots = FALSE, value.var.in.LHSdots = value.var.in.dots, value.var.in.RHSdots = value.var.in.dots) {
if (!is.data.table(data)) stopf("'data' must be a data.table.")
drop = as.logical(rep(drop, length.out=2L))
if (anyNA(drop)) stopf("'drop' must be logical TRUE/FALSE")
if (!isTRUEorFALSE(value.var.in.dots))
stopf("Argument 'value.var.in.dots' should be logical TRUE/FALSE")
if (!isTRUEorFALSE(value.var.in.LHSdots) || !isTRUEorFALSE(value.var.in.RHSdots))
stopf("Arguments 'value.var.in.LHSdots', 'value.var.in.RHSdots' should be logical TRUE/FALSE")
# #2980 if explicitly providing fun.aggregate=length but not a value.var,
# just use the last column (as guess(data) would do) because length will be
# the same on all columns
if (missing(value.var) && !missing(fun.aggregate) && identical(fun.aggregate, length))
value.var = names(data)[ncol(data)]
lvals = value_vars(value.var, names(data))
valnames = unique(unlist(lvals))
lvars = check_formula(formula, names(data), valnames, value.var.in.LHSdots, value.var.in.RHSdots)
lvars = lapply(lvars, function(x) if (length(x)) x else quote(`.`))
# tired of lapply and the way it handles environments!
allcols = c(unlist(lvars), lapply(valnames, as.name))
dat = vector("list", length(allcols))
for (i in seq_along(allcols)) {
x = allcols[[i]]
dat[[i]] = if (identical(x, quote(`.`))) rep(".", nrow(data)) else eval(x, data, parent.frame())
if (is.function(dat[[i]]))
stopf("Column [%s] not found or of unknown type.", deparse(x))
}
setattr(lvars, 'names', c("lhs", "rhs"))
# Have to take care of duplicate names, and provide names for expression columns properly.
varnames = make.unique(vapply_1c(unlist(lvars), all.vars, max.names=1L), sep=sep)
dupidx = which(valnames %chin% varnames)
if (length(dupidx)) {
dups = valnames[dupidx]
valnames = tail(make.unique(c(varnames, valnames)), -length(varnames))
lvals = lapply(lvals, function(x) { x[x %chin% dups] = valnames[dupidx]; x })
}
lhsnames = head(varnames, length(lvars$lhs))
rhsnames = tail(varnames, -length(lvars$lhs))
setattr(dat, 'names', c(varnames, valnames))
if (any(vapply_1b(dat[varnames], is.list))) {
stopf("Columns specified in formula can not be of type list")
}
setDT(dat)
m = as.list(match.call()[-1L])
subset = m[["subset"]][[2L]]
if (!is.null(subset)) {
if (is.name(subset)) subset = as.call(list(quote(`(`), subset))
idx = which(eval(subset, data, parent.frame())) # any advantage thro' secondary keys?
dat = .Call(CsubsetDT, dat, idx, seq_along(dat))
}
fun.call = m[["fun.aggregate"]]
if (is.null(fun.call)) {
oo = forderv(dat, by=varnames, retGrp=TRUE)
if (attr(oo, 'maxgrpn', exact=TRUE) > 1L) {
messagef("'fun.aggregate' is NULL, but found duplicate row/column combinations, so defaulting to length(). That is, the variables %s used in 'formula' do not uniquely identify rows in the input 'data'. In such cases, 'fun.aggregate' is used to derive a single representative value for each combination in the output data.table, for example by summing or averaging (fun.aggregate=sum or fun.aggregate=mean, respectively). Check the resulting table for values larger than 1 to see which combinations were not unique. See ?dcast.data.table for more details.", brackify(varnames))
fun.call = quote(length)
}
}
dat_for_default_fill = dat
run_agg_funs = !is.null(fun.call)
if (run_agg_funs) {
fun.call = aggregate_funs(fun.call, lvals, sep, ...)
maybe_err = function(list.of.columns) {
if (any(lengths(list.of.columns) != 1L)) stopf("Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.")
list.of.columns
}
dat = dat[, maybe_err(eval(fun.call)), by=c(varnames)]
}
order_ = function(x) {
o = forderv(x, retGrp=TRUE, sort=TRUE)
idx = attr(o, 'starts', exact=TRUE)
if (!length(o)) o = seq_along(x)
o[idx] # subsetVector retains attributes, using R's subset for now
}
cj_uniq = function(DT) {
do.call("CJ", lapply(DT, function(x)
if (is.factor(x)) {
xint = seq_along(levels(x))
setattr(xint, 'levels', levels(x))
setattr(xint, 'class', class(x))
} else .Call(CsubsetVector, x, order_(x))
))}
valnames = setdiff(names(dat), varnames)
# 'dat' != 'data'? then setkey to speed things up (slightly), else ad-hoc (for now). Still very fast!
if (!is.null(fun.call) || !is.null(subset))
setkeyv(dat, varnames)
if (length(rhsnames)) {
lhs = shallow(dat, lhsnames); rhs = shallow(dat, rhsnames); val = shallow(dat, valnames)
# handle drop=TRUE/FALSE - Update: Logic moved to R, AND faster than previous version. Take that... old me :-).
if (all(drop)) {
map = setDT(lapply(list(lhsnames, rhsnames), function(cols) frankv(dat, cols=cols, ties.method="dense", na.last=FALSE))) # #2202 fix
maporder = lapply(map, order_)
mapunique = lapply(seq_along(map), function(i) .Call(CsubsetVector, map[[i]], maporder[[i]]))
lhs = .Call(CsubsetDT, lhs, maporder[[1L]], seq_along(lhs))
rhs = .Call(CsubsetDT, rhs, maporder[[2L]], seq_along(rhs))
} else {
lhs_ = if (!drop[1L]) cj_uniq(lhs) else setkey(unique(lhs, by=names(lhs)))
rhs_ = if (!drop[2L]) cj_uniq(rhs) else setkey(unique(rhs, by=names(rhs)))
map = vector("list", 2L)
.Call(Csetlistelt, map, 1L, lhs_[lhs, which=TRUE])
.Call(Csetlistelt, map, 2L, rhs_[rhs, which=TRUE])
setDT(map)
mapunique = vector("list", 2L)
.Call(Csetlistelt, mapunique, 1L, seq_len(nrow(lhs_)))
.Call(Csetlistelt, mapunique, 2L, seq_len(nrow(rhs_)))
lhs = lhs_; rhs = rhs_
}
maplen = lengths(mapunique)
idx = do.call("CJ", mapunique)[map, 'I' := .I][["I"]] # TO DO: move this to C and avoid materialising the Cross Join.
some_fill = anyNA(idx)
fill.default = if (run_agg_funs && is.null(fill) && some_fill) dat_for_default_fill[, maybe_err(eval(fun.call))]
if (run_agg_funs && is.null(fill) && some_fill) {
fill.default = dat_for_default_fill[0L][, maybe_err(eval(fun.call))]
}
ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call), some_fill)
allcols = do.call("paste", c(rhs, sep=sep))
if (length(valnames) > 1L)
allcols = do.call("paste", if (identical(".", allcols)) list(valnames, sep=sep)
else c(CJ(valnames, allcols, sorted=FALSE), sep=sep))
# removed 'setcolorder()' here, #1153
setattr(ans, 'names', c(lhsnames, allcols))
setDT(ans); setattr(ans, 'sorted', lhsnames)
} else stopf("Internal error -- empty rhsnames in dcast; please report") # nocov
return (ans)
}