-
Notifications
You must be signed in to change notification settings - Fork 1
/
cat2cat_agg.R
164 lines (149 loc) · 5.34 KB
/
cat2cat_agg.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#' Manual mapping for an aggregated panel dataset
#' @description Manual mapping of an inconsistently coded categorical variable
#' according to the user provided mappings (equations).
#' @param data list with 5 named fields
#' `old`, `new`, `cat_var`, `time_var`, `freq_var`.
#' @param ... mapping equations where direction is set with any of,
#' `>`, `<`, `\%>\%`, `\%<\%`.
#' @return `named list` with 2 fields old and new - 2 data.frames.
#' There will be added additional columns to each.
#' The new columns are added instead of the additional metadata as
#' we are working with new datasets
#' where observations could be replicated.
#' For the transparency the probability and number of replications are part of
#' each observation in the `data.frame`.
#' @details data argument - list with fields
#' \describe{
#' \item{"old"}{ data.frame older time point in the panel}
#' \item{"new"}{ data.frame more recent time point in the panel}
#' \item{"cat_var"}{
#' character - deprecated - name of the categorical variable
#' }
#' \item{"cat_var_old"}{
#' character name of the categorical variable in the old period
#' }
#' \item{"cat_var_new"}{
#' character name of the categorical variable in the new period
#' }
#' \item{"time_var"}{ character name of time variable}
#' \item{"freq_var"}{ character name of frequency variable}
#' }
#' @note All mapping equations have to be valid ones.
#' @export
#' @examples
#' data("verticals", package = "cat2cat")
#' agg_old <- verticals[verticals$v_date == "2020-04-01", ]
#' agg_new <- verticals[verticals$v_date == "2020-05-01", ]
#'
#' # cat2cat_agg - can map in both directions at once
#' # although usually we want to have the old or the new representation
#'
#' agg <- cat2cat_agg(
#' data = list(
#' old = agg_old,
#' new = agg_new,
#' cat_var_old = "vertical",
#' cat_var_new = "vertical",
#' time_var = "v_date",
#' freq_var = "counts"
#' ),
#' Automotive %<% c(Automotive1, Automotive2),
#' c(Kids1, Kids2) %>% c(Kids),
#' Home %>% c(Home, Supermarket)
#' )
#'
#' ## possible processing
#' library("dplyr")
#' agg %>%
#' bind_rows() %>%
#' group_by(v_date, vertical) %>%
#' summarise(
#' sales = sum(sales * prop_c2c),
#' counts = sum(counts * prop_c2c),
#' v_date = first(v_date)
#' )
cat2cat_agg <- function(data = list(
old = NULL,
new = NULL,
cat_var_old = NULL,
cat_var_new = NULL,
time_var = NULL,
freq_var = NULL
), ...) {
if (!is.null(data$cat_var)) {
data$cat_var_old <- data$cat_var
data$cat_var_new <- data$cat_var
}
stopifnot(
is.list(data) &&
(length(data) >= 5 || length(data) <= 6) &&
all(vapply(data, Negate(is.null), logical(1))) &&
inherits(data$old, "data.frame") &&
inherits(data$new, "data.frame") &&
all(
c(
"old", "new", "cat_var_old", "cat_var_new",
"time_var", "freq_var"
) %in% names(data)
) &&
all(c(data$cat_var, data$freq_var) %in% colnames(data$old)) &&
all(c(data$cat_var, data$freq_var) %in% colnames(data$new))
)
d_old <- length(unique(data$old[[data$time_var]]))
d_new <- length(unique(data$new[[data$time_var]]))
stopifnot((d_old == 1) && (d_new == 1))
stopifnot(all(table(data$old[[data$cat_var_old]]) == 1) &&
all(table(data$new[[data$cat_var_new]]) == 1))
t <- as.list(substitute(list(...))[-1])
trans <- do.call(read_eq, t)
trans_map <- lapply(trans, format_trans)
old_cats <- unlist(lapply(trans_map, function(x) x[["old"]]))
new_cats <- unlist(lapply(trans_map, function(x) x[["new"]]))
stopifnot(all(old_cats %in% unique(data[["old"]][[data$cat_var_old]])))
stopifnot(all(new_cats %in% unique(data[["new"]][[data$cat_var_new]])))
df_old <- data$old
df_old$prop_c2c <- 1
col_df_old <- colnames(df_old)
df_new <- data$new
df_new$prop_c2c <- 1
col_df_new <- colnames(df_new)
stopifnot(
identical(
setdiff(col_df_old, data$cat_var_old),
setdiff(col_df_new, data$cat_var_new)
)
)
for (i in trans_map) {
stopifnot(((length(i[[2]]) == 1) || (length(i[[3]]) == 1)))
if (i$direction == "forward") {
base <- df_new[!(df_new[, data$cat_var_new] %in% i[[3]]), ]
base_rm <- df_new[df_new[, data$cat_var_new] %in% i[[3]], ]
if (length(i[[2]]) > 1) {
base_rm <- base_rm[rep(1, length(i[[2]])), ]
base_rm[, data$cat_var_new] <- i[[2]]
base_rm$prop_c2c <- prop.table(
df_old[df_old[, data$cat_var_old] %in% i[[2]], data$freq_var]
)
} else {
base_rm[, data$cat_var_new] <- i[[2]]
base_rm$prop_c2c <- 1
}
df_new <- rbind(base, base_rm)
} else if (i$direction == "backward") {
base <- df_old[!(df_old[, data$cat_var_old] %in% i[[2]]), ]
base_rm <- df_old[df_old[, data$cat_var_old] %in% i[[2]], ]
if (length(i[[3]]) > 1) {
base_rm <- base_rm[rep(1, length(i[[3]])), ]
base_rm[, data$cat_var_old] <- i[[3]]
base_rm$prop_c2c <- prop.table(
df_new[df_new[, data$cat_var_new] %in% i[[3]], data$freq_var]
)
} else {
base_rm[, data$cat_var_old] <- i[[3]]
base_rm$prop_c2c <- 1
}
df_old <- rbind(base, base_rm)
}
}
list(old = df_old, new = df_new)
}