/
tab_descriptive.R
363 lines (349 loc) · 12.2 KB
/
tab_descriptive.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
#' Tabulate counts and proportions
#'
#' @param x a [data.frame()] or [tbl_svy][srvyr::as_survey_design] object
#'
#' @param ... categorical variables to tabulate
#'
#' @param strata a stratifier to split the data
#'
#' @param keep a character vector specifying which values to retain in the
#' tabulation. Defaults to `TRUE`, which keeps all the values.
#'
#' @param drop a character vector specifying which values to drop in the
#' tabulation. Defaults to `NULL`, which keeps all values.
#'
#' @param na.rm When `TRUE` (default), missing (NA) values present in `var`
#' will be removed from the data set with a warning, causing a change in
#' denominator for the tabulations. Setting this to `FALSE` creates an
#' explicit missing value called "(Missing)".
#'
#' @param prop_total if `TRUE` and `strata` is not `NULL`, then the totals of the
#' rows will be reported as proportions of the total data set, otherwise, they
#' will be proportions within the stratum (default).
#'
#' @param row_total create a new column with the total counts for each row of
#' stratified data.
#'
#' @param col_total create a new row with the total counts for each column of
#' stratified data.
#'
#' @param wide if `TRUE` (default) and strata is defined, then the results are
#' presented in a wide table with each stratification counts and estimates in
#' separate columns. If `FALSE`, then the data will be presented in a long
#' format where the counts and estimates are presented in single columns. This
#' has no effect if strata is not defined.
#'
#' @param transpose if `wide = TRUE`, then this will transpose the columns to
#' the rows, which is useful when you stratify by age group. Default is
#' `NULL`, which will not transpose anything. You have three options for
#' transpose:
#' - `transpose = "variable"`: uses the variable column, (dropping values if strata exists).
#' Use this if you know that your values are all identical or at least
#' identifiable by the variable name.
#' - `transpose = "value"` : uses the value column, (dropping variables if strata exists).
#' Use this if your values are important and the variable names are
#' generic placeholders.
#' - `transpose = "both"` : combines the variable and value columns.
#' Use this if both the variables and values are important.
#'
#' @param pretty (survey only) if `TRUE`, default, the proportion and CI are merged
#'
#' @param digits (survey only) if `pretty = FALSE`, this indicates the number
#' of digits used for proportion and CI
#'
#' @param method (survey only) a method from [survey::svyciprop()] to calculate
#' the confidence interval. Defaults to "logit".
#'
#' @param deff a logical indicating if the design effect should be reported.
#' Defaults to `TRUE`.
#'
#' @return a [tibble::tibble()] with a column for variables, a column for values,
#' and counts and proportions. If `strata` is not `NULL` and `wide = TRUE`,
#' then there will be separate columns for each strata for the counts and
#' proportions. Survey data will report confidence intervals.
#'
#' @rdname tab_functions
#' @export
#'
#' @examples
#' have_packages <- require("matchmaker") & require("epidict")
#'
#' if (have_packages) {
#' withAutoprint({
#'
#' # Simulating linelist data
#'
#' linelist <- epidict::gen_data("Measles", numcases = 1000, org = "MSF")
#' measles_dict <- epidict::msf_dict("Measles", compact = FALSE)
#'
#' # Cleaning linelist data
#' linelist_clean <- matchmaker::match_df(
#' x = linelist,
#' dictionary = measles_dict,
#' from = "option_code",
#' to = "option_name",
#' by = "data_element_shortname",
#' order = "option_order_in_set"
#' )
#'
#' # get a descriptive table by sex
#' tab_linelist(linelist_clean, sex)
#'
#' # describe prenancy statistics, but remove missing data from the tally
#' tab_linelist(linelist_clean, trimester, na.rm = TRUE)
#'
#' # describe by symptom
#'
#' tab_linelist(linelist_clean,
#' cough, nasal_discharge, severe_oral_lesions,
#' transpose = "value"
#' )
#' # describe prenancy statistics, stratifying by vitamin A perscription
#' tab_linelist(linelist_clean, trimester, sex,
#' strata = prescribed_vitamin_a,
#' na.rm = TRUE, row_total = TRUE
#' )
#' })
#' }
#'
#' have_survey_packages <- require("survey") && require("srvyr")
#' if (have_survey_packages) {
#' withAutoprint({
#' data(api)
#'
#' # stratified sample
#' surv <- apistrat %>%
#' as_survey_design(strata = stype, weights = pw)
#'
#' s <- surv %>%
#' tab_survey(awards, strata = stype, col_total = TRUE, row_total = TRUE, deff = TRUE)
#' s
#'
#' # making things pretty
#' s %>%
#' # wrap all "n" variables in braces (note space before n).
#' epikit::augment_redundant(" (n)" = " n") %>%
#' # relabel all columns containing "prop" to "% (95% CI)"
#' epikit::rename_redundant(
#' "% (95% CI)" = ci,
#' "Design Effect" = deff
#' )
#'
#' # long data
#' surv %>%
#' tab_survey(awards, strata = stype, wide = FALSE)
#'
#' # tabulate binary variables
#' surv %>%
#' tab_survey(yr.rnd, sch.wide, awards, keep = "Yes")
#'
#' # stratify the binary variables
#' surv %>%
#' tab_survey(yr.rnd, sch.wide, awards,
#' strata = stype,
#' keep = "Yes"
#' )
#'
#' # invert the tabulation
#' surv %>%
#' tab_survey(yr.rnd, sch.wide, awards,
#' strata = stype,
#' drop = "Yes",
#' deff = TRUE,
#' row_total = TRUE
#' )
#' })
#' }
tab_linelist <- function(x,
...,
strata = NULL,
keep = TRUE,
drop = NULL,
na.rm = TRUE,
prop_total = FALSE,
row_total = FALSE,
col_total = FALSE,
wide = TRUE,
transpose = NULL,
digits = 1,
pretty = TRUE) {
tab_general(x,
...,
strata = !!rlang::enquo(strata),
keep = keep,
drop = drop,
na.rm = na.rm,
prop_total = prop_total,
row_total = row_total,
col_total = col_total,
wide = wide,
transpose = transpose,
digits = digits,
pretty = pretty
)
}
#' @rdname tab_functions
#' @export
tab_survey <- function(x,
...,
strata = NULL,
keep = TRUE,
drop = NULL,
na.rm = TRUE,
prop_total = FALSE,
row_total = FALSE,
col_total = FALSE,
wide = TRUE,
transpose = NULL,
digits = 1,
method = "logit",
deff = FALSE,
pretty = TRUE) {
tab_general(x,
...,
strata = !!rlang::enquo(strata),
keep = keep,
drop = drop,
na.rm = na.rm,
prop_total = prop_total,
row_total = row_total,
col_total = col_total,
wide = wide,
transpose = transpose,
digits = digits,
method = method,
deff = deff,
pretty = pretty
)
}
tab_general <- function(x,
...,
strata = NULL,
keep = TRUE,
drop = NULL,
na.rm = TRUE,
prop_total = FALSE,
row_total = FALSE,
col_total = FALSE,
wide = TRUE,
transpose = NULL,
digits = 1,
method = "logit",
deff = FALSE,
pretty = TRUE) {
is_survey <- inherits(x, "tbl_svy")
stopifnot(is_survey || is.data.frame(x))
# We try to match the user-supplied variables to the colnames. If the user
# supplied a tidyselect verb (e.g. `starts_with("CHOICE")`, then it should
# filter properly.
xnames <- colnames(x)
names(xnames) <- xnames
# 2020-02-10
#
# tidyselect has updated when I was on vacation and changed its behavior. It
# used to return nothing if one of the columns did not exist, which we could
# fix by wrapping the call in one_of and report which columns were not found,
# but now it's going to take some rethinking about how to handle this
# properly, so at the moment, we are sliently ignoring columns that don't
# match.
vars <- tidyselect::eval_select(rlang::expr(c(...)), data = xnames, strict = FALSE)
vars <- xnames[vars]
if (length(vars) == 0) {
stop("No columns matched the data", call. = FALSE)
}
stra <- rlang::enquo(strata)
flip_it <- wide && !is.null(transpose)
if (flip_it) {
transpose <- match.arg(tolower(transpose), c("variable", "value", "both"))
}
# Create list for results to go into that will eventually be bound together
res <- vector(mode = "list", length = length(vars))
names(res) <- vars
# loop over each name in the list and tabulate the survey for that variable
for (i in names(res)) {
i <- rlang::ensym(i)
if (is_survey) {
res[[i]] <- tabulate_survey(x,
var = !!i,
strata = !!stra,
proptotal = prop_total,
coltotals = col_total,
rowtotals = row_total,
pretty = pretty,
digits = digits,
method = method,
wide = wide,
na.rm = na.rm,
deff = deff
)
} else {
res[[i]] <- descriptive(x,
counter = !!i,
grouper = !!stra,
proptotal = prop_total,
coltotals = col_total,
rowtotals = row_total,
digits = digits,
explicit_missing = !na.rm
)
}
# The ouptut columns will have the value as whatever i was, so we should
# rename this to "value" to make it consistent
names(res[[i]])[names(res[[i]]) == i] <- "value"
res[[i]][["value"]] <- as.character(res[[i]][["value"]])
}
# Combine the results into one table
suppressWarnings(res <- dplyr::bind_rows(res, .id = "variable"))
# return the results with only the selected values
if (!isTRUE(keep) && !is.null(drop)) {
stop("you can only choose to keep values or drop values. Specifying both is not allowed", call. = FALSE)
}
strata_exists <- tidyselect::vars_select(colnames(x), !!stra)
strata_exists <- length(strata_exists) > 0
if (!isTRUE(keep)) {
res <- res[res$value %in% keep, , drop = FALSE]
} else if (!is.null(drop)) {
res <- res[!res$value %in% drop, , drop = FALSE]
} else if (flip_it && !strata_exists && transpose != "both") {
flip_it <- FALSE
# This is the situation where the user doesn't have a stratafying variable,
# but they want to transpose either the variable or value.
the_column <- if (transpose == "variable") "value" else "variable"
res[[the_column]] <- forcats::fct_inorder(res[[the_column]])
res[[transpose]] <- forcats::fct_inorder(res[[transpose]])
res <- widen_tabulation(res,
!!rlang::sym(the_column),
!!rlang::sym(transpose),
pretty = if (is_survey) pretty else FALSE,
digits = digits
)
if (col_total && the_column == "value") {
# prevent Total from appearing as one of the middle rows
res[["value"]] <- forcats::fct_relevel(res[["value"]], "Total", after = Inf)
res <- res[order(res[["value"]]), ]
}
if (col_total && the_column == "variable") {
# prevent Total from appearing as one of the middle columns
good_order <- c(
grep("Total", names(res), invert = TRUE),
grep("Total", names(res))
)
res <- res[good_order]
}
} else {
if (flip_it) {
warning("Cannot transpose data that hasn't been filtered with keep or drop", call. = FALSE)
}
flip_it <- FALSE
}
# If the user wants to transpose the data, then we need to do this for each
# level of data available into separate tables, combine the columns, and then
# rearrange them so that they are grouped by variable/value
if (flip_it) {
res <- flipper(if (is_survey) x$variables else x,
res, transpose,
pretty = pretty, stra = stra
)
}
res
}