-
Notifications
You must be signed in to change notification settings - Fork 969
/
merge.R
134 lines (128 loc) · 5.97 KB
/
merge.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# merge method for data.table objects.
#
# Arguments:
#   x, y            tables to merge; `y` is coerced with as.data.table() when it is not
#                   already a data.table.
#   by              character vector of join columns present in both `x` and `y`.
#   by.x, by.y      character vectors (same length) of join columns in `x` and `y`; when
#                   `by.x` is supplied these take precedence over `by`.
#   all, all.x,     all.x=TRUE keeps rows of `x` without a match in `y`; all.y=TRUE appends
#   all.y           rows of `y` without a match in `x`. Both default to `all`.
#   sort            single TRUE/FALSE; when TRUE (and the result has rows) the result is
#                   keyed by `by.x` via setkeyv().
#   suffixes        two suffixes appended to non-join column names that occur in both
#                   `x` (first suffix) and `y` (second suffix).
#   no.dups         single TRUE/FALSE; when TRUE, non-join columns of `y` whose name equals
#                   one of `by.x` also receive the second suffix.
#   allow.cartesian passed through to the `[` joins.
#   incomparables   values which, when found in any join column of a row, cause that row
#                   to be dropped from `x` / `y` before joining. NULL disables this.
#   ...             not used; any argument landing here only triggers a warning.
#
# Resolution of join columns when neither `by.x` nor `by` is given:
#   intersect(key(x), key(y)), then key(x), then intersect(names(x), names(y)).
#   (When `y` had to be coerced and neither `by` nor `by.x` was supplied, key(x) is tried first.)
#
# Returns the merged table with columns ordered as: join columns (named per `by.x`),
# remaining columns of `x`, remaining columns of `y`; its class attribute is set to class(x).
merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all,
               all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, ...) {
  # Check the length before the %in% test: a zero-length or multi-element value would
  # otherwise make the `if` condition itself fail ("argument is of length zero" /
  # "the condition has length > 1" in R >= 4.2.0) instead of raising the message below.
  if (length(sort) != 1L || !sort %in% c(TRUE, FALSE))
    stopf("Argument 'sort' should be logical TRUE/FALSE")
  if (length(no.dups) != 1L || !no.dups %in% c(TRUE, FALSE))
    stopf("Argument 'no.dups' should be logical TRUE/FALSE")
  # remembered now so it can be restored on the result at the very end
  class_x = class(x)
  if (!is.data.table(y)) {
    y = as.data.table(y)
    # must run before anything else assigns to `by`, since it relies on missing(by)
    if (missing(by) && missing(by.x)) {
      by = key(x)
    }
  }
  # warn (but carry on) when either input has zero columns
  x0 = length(x)==0L
  y0 = length(y)==0L
  if (x0 || y0) {
    if (x0 && y0)
      warningf("Neither of the input data.tables to join have columns.")
    else if (x0)
      warningf("Input data.table '%s' has no columns.", "x")
    else
      warningf("Input data.table '%s' has no columns.", "y")
  }
  nm_x = names(x)
  nm_y = names(y)
  if (anyDuplicated(nm_x)) stopf("%s has some duplicated column name(s): %s. Please remove or rename the duplicate(s) and try again.", "x", brackify(nm_x[duplicated(nm_x)]))
  if (anyDuplicated(nm_y)) stopf("%s has some duplicated column name(s): %s. Please remove or rename the duplicate(s) and try again.", "y", brackify(nm_y[duplicated(nm_y)]))
  ## set up 'by'/'by.x'/'by.y'
  if ( (!is.null(by.x) || !is.null(by.y)) && length(by.x)!=length(by.y) )
    stopf("`by.x` and `by.y` must be of same length.")
  if (!missing(by) && !missing(by.x))
    warningf("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.")
  if (!is.null(by.x)) {
    if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y))
      stopf("A non-empty vector of column names is required for `by.x` and `by.y`.")
    if (!all(by.x %chin% nm_x))
      stopf("Elements listed in `by.x` must be valid column names in x.")
    if (!all(by.y %chin% nm_y))
      stopf("Elements listed in `by.y` must be valid column names in y.")
    # `by` becomes a named vector: names are y's columns, values are x's columns;
    # it is passed as `on=` to the y[x] joins below
    by = by.x
    names(by) = by.y
  } else {
    # fall back step by step: common key columns, then x's key, then common column names
    if (is.null(by))
      by = intersect(key(x), key(y))
    if (!length(by))   # was is.null() before PR#5183 changed to !length()
      by = key(x)
    if (!length(by))
      by = intersect(nm_x, nm_y)
    if (length(by) == 0L || !is.character(by))
      stopf("A non-empty vector of column names for `by` is required.")
    if (!all(by %chin% intersect(nm_x, nm_y)))
      stopf("Elements listed in `by` must be valid column names in x and y")
    by = unname(by)
    by.x = by.y = by
  }
  # warn about unused arguments #2587
  if (length(list(...))) {
    # substitute() keeps the arguments unevaluated; only their names/count are needed
    ell = as.list(substitute(list(...)))[-1L]
    for (n in setdiff(names(ell), "")) warningf("Unknown argument '%s' has been passed.", n)
    unnamed_n = length(ell) - sum(names(ell) != "")
    if (unnamed_n)
      warningf("Passed %d unknown and unnamed arguments.", unnamed_n)
  }
  # with i. prefix in v1.9.3, this goes away. Left here for now ...
  ## sidestep the auto-increment column number feature-leading-to-bug by
  ## ensuring no names end in ".1", see unit test
  ## "merge and auto-increment columns in y[x]" in test-data.frame.like.R
  # `start` / `end` hold the final names of the non-join columns of x / y respectively
  start = setdiff(nm_x, by.x)
  end = setdiff(nm_y, by.y)
  dupnames = intersect(start, end)
  if (length(dupnames)) {
    start[chmatch(dupnames, start, 0L)] = paste0(dupnames, suffixes[1L])
    end[chmatch(dupnames, end, 0L)] = paste0(dupnames, suffixes[2L])
  }
  # If no.dups = TRUE we also need to add the suffix to columns in y
  # that share a name with by.x
  dupkeyx = intersect(by.x, end)
  if (no.dups && length(dupkeyx)) {
    end[chmatch(dupkeyx, end, 0L)] = paste0(dupkeyx, suffixes[2L])
  }
  # implement incomparables argument #2587
  if (!is.null(incomparables)) {
    # %fin% to be replaced when #5232 is implemented/closed
    "%fin%" = function(x, table) if (is.character(x) && is.character(table)) x %chin% table else x %in% table
    # a row is kept only when none of its join columns holds an incomparable value,
    # i.e. the per-row count of "comparable" join columns equals the number of join columns
    xind = rowSums(x[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.x]) == length(by)
    yind = rowSums(y[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.y]) == length(by)
    # subset both so later steps still work
    x = x[xind]
    y = y[yind]
  }
  # nomatch=NA keeps unmatched rows of x (all.x=TRUE); nomatch=NULL drops them
  dt = y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian]   # includes JIS columns (with a i. prefix if conflict with x names)
  if (all.y && nrow(y)) {  # If y does not have any rows, no need to proceed
    # Perhaps not very commonly used, so not a huge deal that the join is redone here.
    missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian]
    # TO DO: replace by following once #5446 is merged
    # if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE)
    if (length(missingyidx)) {
      yy = y[missingyidx]
      othercolsx = setdiff(nm_x, by)
      if (length(othercolsx)) {
        # create NA rectangle with correct types and attributes of x to cbind to y
        tmp = rep.int(NA_integer_, length(missingyidx))
        # TO DO: use set() here instead..
        yy = cbind(yy, x[tmp, othercolsx, with = FALSE])
      }
      # empty data.tables (nrow =0, ncol>0) doesn't skip names anymore in new rbindlist
      # takes care of #24 without having to save names. This is how it should be, IMHO.
      # rows are bound by position (use.names=FALSE); final names are assigned further down
      dt = rbind(dt, yy, use.names=FALSE)
    }
  }
  # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i.
  newend = setdiff(nm_y, by.y)
  # fix for #1290, make sure by.y order is set properly before naming
  setcolorder(dt, c(by.y, setdiff(names(dt), c(by.y, newend)), newend))
  setnames(dt, c(by.x, start, end))
  if (nrow(dt) > 0L) {
    # sort=TRUE keys the result by the join columns; sort=FALSE passes NULL as the key
    setkeyv(dt, if (sort) by.x else NULL)
  }
  # Throw warning if there are duplicate column names in 'dt' (i.e. if
  # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame)
  resultdupnames = names(dt)[duplicated(names(dt))]
  if (length(resultdupnames)) {
    warningf("column names %s are duplicated in the result", brackify(resultdupnames))
  }
  # retain custom classes of first argument that resulted in dispatch to this method, #1378
  setattr(dt, "class", class_x)
  dt
}