/
add_weights_cluster.R
263 lines (226 loc) · 9.77 KB
/
add_weights_cluster.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
#' Add a column of cluster survey weights to a data frame.
#'
#' For use in surveys where you took a sample population out of a larger
#' source population, with a cluster survey design.
#'
#' Will multiply the inverse chances of a cluster being selected, a household
#' being selected within a cluster, and an individual being selected within a
#' household.
#'
#' As follows:
#'
#' ```
#' ((clusters available) / (clusters surveyed)) *
#' ((households in each cluster) / (households surveyed in each cluster)) *
#' ((individuals eligible in each household) / (individuals interviewed))
#' ```
#'
#' In the case where both ignore_cluster and ignore_household are TRUE, this
#' will simply be:
#'
#' ```
#' 1 * 1 * (individuals eligible in each household) / (individuals interviewed)
#' ```
#'
#' @param x a data frame of survey data
#'
#' @param cl a data frame containing a list of clusters and the number of
#' households in each.
#'
#' @param eligible the column in `x` which specifies the number of people
#' eligible for being interviewed in that household. (e.g. the total number of
#' children)
#'
#' @param interviewed the column in `x` which specifies the number of people
#' actually interviewed in that household.
#'
#' @param cluster_cl the column in `cl` that lists all possible clusters.
#' Ignored if `ignore_cluster` is TRUE.
#'
#' @param cluster_x the column in `x` that indicates which cluster rows belong
#' to. Ignored if `ignore_cluster` is TRUE.
#'
#' @param household_cl the column in `cl` that lists the number of households
#' per cluster. Ignored if `ignore_household` is TRUE.
#'
#' @param household_x the column in `x` that indicates a unique household
#' identifier. Ignored if `ignore_household` is TRUE.
#'
#' @param ignore_cluster If TRUE (default), set the weight for clusters to be 1.
#' This assumes that your sample was taken in a way which is a close
#' approximation of a simple random sample. Ignores inputs from `cluster_cl`
#' as well as `cluster_x`.
#'
#' @param ignore_household If TRUE (default), set the weight for households to
#' be 1. This assumes that your sample of households was takenin a way which
#' is a close approximation of a simple random sample. Ignores inputs from
#' `household_cl` and `household_x`.
#'
#' @param surv_weight the name of the new column to store the weights. Defaults
#' to "surv_weight".
#'
#' @param surv_weight_ID the name of the new ID column to be created. Defaults
#' to "surv_weight_ID"
#'
#' @author Alex Spina, Zhian N. Kamvar, Lukas Richter
#' @export
#' @examples
#'
#'
#' # define a fake dataset of survey data
#' # including household and individual information
#' x <- data.frame(stringsAsFactors=FALSE,
#' cluster = c("Village A", "Village A", "Village A", "Village A",
#' "Village A", "Village B", "Village B", "Village B"),
#' household_id = c(1, 1, 1, 1, 2, 2, 2, 2),
#' eligible_n = c(6, 6, 6, 6, 6, 3, 3, 3),
#' surveyed_n = c(4, 4, 4, 4, 4, 3, 3, 3),
#' individual_id = c(1, 2, 3, 4, 4, 1, 2, 3),
#' age_grp = c("0-10", "20-30", "30-40", "50-60", "50-60", "20-30",
#' "50-60", "30-40"),
#' sex = c("Male", "Female", "Male", "Female", "Female", "Male",
#' "Female", "Female"),
#' outcome = c("Y", "Y", "N", "N", "N", "N", "N", "Y")
#' )
#'
#' # define a fake dataset of cluster listings
#' # including cluster names and number of households
#' cl <- tibble::tribble(
#' ~cluster, ~n_houses,
#' "Village A", 23,
#' "Village B", 42,
#' "Village C", 56,
#' "Village D", 38
#' )
#'
#'
#' # add weights to a cluster sample
#' # include weights for cluster, household and individual levels
#' add_weights_cluster(x, cl = cl,
#' eligible = eligible_n,
#' interviewed = surveyed_n,
#' cluster_cl = cluster, household_cl = n_houses,
#' cluster_x = cluster, household_x = household_id,
#' ignore_cluster = FALSE, ignore_household = FALSE)
#'
#'
#' # add weights to a cluster sample
#' # ignore weights for cluster and household level (set equal to 1)
#' # only include weights at individual level
#' add_weights_cluster(x, cl = cl,
#' eligible = eligible_n,
#' interviewed = surveyed_n,
#' cluster_cl = cluster, household_cl = n_houses,
#' cluster_x = cluster, household_x = household_id,
#' ignore_cluster = TRUE, ignore_household = TRUE)
#'
add_weights_cluster <- function(x, cl,
eligible, interviewed,
cluster_x = NULL, cluster_cl = NULL,
household_x = NULL, household_cl = NULL,
ignore_cluster = TRUE, ignore_household = TRUE,
surv_weight = "surv_weight",
surv_weight_ID = "surv_weight_ID") {
# define vars
clus_id_cl <- tidyselect::vars_select(colnames(cl), {{cluster_cl}}, .strict = FALSE)
hh_id_cl <- tidyselect::vars_select(colnames(cl), {{household_cl}}, .strict = FALSE)
clus_id_x <- tidyselect::vars_select(colnames(x), {{cluster_x}}, .strict = FALSE)
hh_id_x <- tidyselect::vars_select(colnames(x), {{household_x}}, .strict = FALSE)
indiv_eli <- tidyselect::vars_select(colnames(x), {{eligible}}, .strict = FALSE)
indiv_surv <- tidyselect::vars_select(colnames(x), {{interviewed}}, .strict = FALSE)
# pull the environment from which this was called (i.e. the function)
cll <- match.call()
## checks (errors / warnings)
# check if variables are in datasets (spelling) - throws error
# for cluster dataset
if (length(clus_id_cl) == 0) {
# put the variable of interest in quotations
ppltn <- rlang::as_name(rlang::enquo(cluster_cl))
# throw error saying which variable not in which dataset
stop(glue::glue("{ppltn} is not one of the columns in {cll[['cl']]}, check spelling"))
}
if (length(hh_id_cl) == 0) {
ppltn <- rlang::as_name(rlang::enquo(household_cl))
stop(glue::glue("{ppltn} is not one of the columns in {cll[['cl']]}, check spelling"))
}
# for study dataset
if (length(clus_id_x) == 0) {
ppltn <- rlang::as_name(rlang::enquo(cluster_x))
stop(glue::glue("{ppltn} is not one of the columns in {cll[['x']]}, check spelling"))
}
if (length(hh_id_x) == 0) {
ppltn <- rlang::as_name(rlang::enquo(household_x))
stop(glue::glue("{ppltn} is not one of the columns in {cll[['x']]}, check spelling"))
}
if (length(indiv_eli) == 0) {
ppltn <- rlang::as_name(rlang::enquo(eligible))
stop(glue::glue("{ppltn} is not one of the columns in {cll[['x']]}, check spelling"))
}
if (length(indiv_surv) == 0) {
ppltn <- rlang::as_name(rlang::enquo(interviewed))
stop(glue::glue("{ppltn} is not one of the columns in {cll[['x']]}, check spelling"))
}
# check if there are duplicate cluster names (i.e. counted twice) - throws error
if (any(duplicated(cl[[clus_id_cl]]))) {
stop(glue::glue("Cluster names are duplicated in {clus_id_cl} variable of {cll[['cl']]} dataset"))
}
# check how many cluster names in the study data match cluster counts data
non_matching_clusters <- !x[[clus_id_x]] %in% cl[[clus_id_cl]]
if (any(non_matching_clusters)) {
nm_cluster_names <- unique(x[[clus_id_x]][non_matching_clusters])
warning(glue::glue("The following cluster names in the {clus_id_x} variable are not in the {cll[['cl']]} dataset: ",
glue::glue_collapse(nm_cluster_names, sep = ",")))
}
## start function
if (ignore_cluster) {
cluster_chance <- 1
} else {
## chance of a cluster being selected
# the total number of clusters in the area
n_clusters_total <- length(unique(cl[[clus_id_cl]]))
# the number of clusters that were surveyed
n_clusters_chosen <- length(unique(x[[clus_id_x]]))
# inverse chance of a cluster being chosen
cluster_chance <- n_clusters_total / n_clusters_chosen
}
if (ignore_household) {
cl$hh_chance <- 1
} else {
## chance of a household being selected in each cluster
# number of households chosen in each cluster
n_households_chosen <- dplyr::summarise(
dplyr::group_by(x, {{cluster_x}}),
n_hh = sum(!duplicated({{household_x}}))
)
# join counts of households surveyed to list of clusters
cl <- dplyr::left_join(cl, n_households_chosen, by = setNames(clus_id_x, clus_id_cl))
# inverse chance of a household being chosen in each cluster
cl[["hh_chance"]] <- cl[[hh_id_cl]] / cl[["n_hh"]]
}
# multiply cluster chance and household chance
cl$clus_hh_chance <- cl$hh_chance * cluster_chance
## chance of an individual being selected in their household
# create a subset - so we dont merge too many vars in later
temp <- dplyr::select(x,
tidyselect::all_of(clus_id_x),
tidyselect::all_of(hh_id_x),
tidyselect::all_of(indiv_eli),
tidyselect::all_of(indiv_surv)
)
# inverse chance of individual being chosen in their household
temp[["indiv_chance"]] <- temp[[indiv_eli]] / temp[[indiv_surv]]
# add in the cluster_household chance
temp <- dplyr::left_join(temp, cl, by = setNames(clus_id_cl, clus_id_x))
# create final weight by multiplying indivdual chance by cluster_household chance
temp[[surv_weight]] <- temp$indiv_chance * temp$clus_hh_chance
# create a survey weight id by merging cluster and household id
temp[[surv_weight_ID]] <- paste0(temp[[clus_id_x]], "_", temp[[hh_id_x]])
temp[[clus_id_x]] <- NULL
temp[[hh_id_x]] <- NULL
# only keep the weight and the weight id
temp <- dplyr::select(temp, {{surv_weight}}, {{surv_weight_ID}})
# bind weights on to original dataset
d <- dplyr::bind_cols(x, temp)
# return new dataset with weights
d
}