/
sp_balance.R
281 lines (250 loc) · 10.9 KB
/
sp_balance.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
###############################################################################
# Function: sp_balance (exported)
# Programmer: Michael Dumelle
# Date: December 03, 2020
# Last Revised: December 03, 2020
#
#' Calculate spatial balance metrics
#'
#' This function measures the spatial balance (with respect to the
#' sampling frame) of design sites using
#' Voronoi polygons (Dirichlet tessellations).
#'
#' @param object An \code{sf} object containing some design sites.
#'
#' @param sframe The sampling frame as an \code{sf} object. The coordinate
#' system for \code{sframe} must be one where distance for coordinates is meaningful.
#'
#' @param stratum_var The name of the stratum variable in \code{object}
#' and \code{sframe}. If \code{NULL} (the default), no strata is assumed.
#' If a single character vector is provided, it is assumed this is the
#' name of the stratum variable in \code{object} and \code{sframe}. If
#' a two-dimensional character vector is provided, one element must be
#' named "object" and corresponds to the name of the stratum variable
#' in \code{object}, while the other element must be named "sframe" and
#' corresponds to the name of the stratum variable in \code{sframe}.
#'
#' @param ip Inclusion probabilities associated with each row of \code{sframe}.
#' If these are not provided, an equal probability design is assumed (within
#' strata).
#'
#' @param metrics A character vector of spatial balance metrics:
#' \describe{
#' \item{\code{pielou}}{ Pielou's Evenness Index (the default). This statistic
#' can take on a value between zero and one.}
#' \item{\code{simpsons}}{ Simpsons Evenness Index. This statistic
#' can take on a value between zero and logarithm of the sample size.}
#' \item{\code{rmse}}{ Root-Mean-Squared Error. This statistic
#' can take on a value between zero and infinity.}
#' \item{\code{mse}}{ Mean-Squared Error. This statistic
#' can take on a value between zero and infinity.}
#' \item{\code{mae}}{ Median-Absolute Error. This statistic
#' can take on a value between zero and infinity.}
#' \item{\code{medae}}{ Mean-Absolute Error. This statistic
#' can take on a value between zero and infinity.}
#' \item{\code{chisq}}{ Chi-Squared Loss. This statistic
#' can take on a value between zero and infinity.}
#' }
#' All spatial balance metrics have a lower bound of zero, which indicates perfect
#' spatial balance. As the metric value increases, the spatial balance decreases.
#'
#' @param extents Should the extent (total units) within each Voronoi polygon
#' be returned? Defaults to \code{FALSE}.
#'
#' @return A data frame with columns providing the stratum (\code{stratum}),
#' spatial balance metric (\code{metric}), and spatial balance (\code{value}).
#'
#' @importFrom stats median
#'
#' @author Michael Dumelle \email{Dumelle.Michael@@epa.gov}
#'
#' @export
#'
#' @examples
#' \dontrun{
#' sample <- grts(NE_Lakes, 30)
#' sp_balance(sample$sites_base, NE_Lakes)
#' strata_n <- c(low = 25, high = 30)
#' sample_strat <- grts(NE_Lakes, n_base = strata_n, stratum_var = "ELEV_CAT")
#' sp_balance(sample_strat$sites_base, NE_Lakes, stratum_var = "ELEV_CAT", metric = "rmse")
#' }
sp_balance <- function(object, sframe, stratum_var = NULL, ip = NULL, metrics = "pielou", extents = FALSE) {
if (inherits(object, "spdesign")) {
stop("object must be an sf object. If object is output from grts() or irs(), instead 1) use object$sites_legacy, object$sites_base, object$sites_over, or object$sites_near; or 2) use sp_rbind().")
}
# find system info
on_solaris <- Sys.info()[["sysname"]] == "SunOS"
if (on_solaris) {
stop("sp_balance() is not supported on Solaris.")
}
if (is.null(stratum_var)) {
object$stratum_var <- "None"
sframe$stratum_var <- "None"
stratum_var <- "stratum_var"
}
if (length(stratum_var) == 1) {
stratum_var <- c(object = stratum_var, sframe = stratum_var)
} else if (length(stratum_var) == 2) {
if (!all(names(stratum_var) %in% c("object", "sframe"))) {
stop("names of stratum_var must be object and sframe")
}
} else {
stop("length of stratum_var must be zero, one, or two")
}
object_levels <- sort(unique(object[[stratum_var["object"]]]))
sframe_levels <- sort(unique(sframe[[stratum_var["sframe"]]]))
if (!all.equal(object_levels, sframe_levels)) {
stop("stratum_var levels for object and sframe must be the same")
}
object_split <- split(object, object[[stratum_var["object"]]])
sframe_split <- split(sframe, sframe[[stratum_var["sframe"]]])
output <- lapply(object_levels, function(x) calculate_sp_balance(object_split[[x]], sframe_split[[x]], ip, metrics, extents))
names(output) <- object_levels
metrics_dfs <- lapply(object_levels, function(x) cbind(stratum = x, data.frame(metric = metrics, value = output[[x]]$values)))
metrics <- do.call("rbind", metrics_dfs)
metrics <- metrics[order(metrics$metric), , drop = FALSE]
row.names(metrics) <- NULL
if (extents) {
extent_sf_split <- lapply(names(output), function(x) {
object_split[[x]]$extent <- output[[x]]$extent
object_split[[x]][c(stratum_var["object"], "extent")]
})
extents <- do.call("rbind", extent_sf_split)
names(extents)[[which(names(extents) == stratum_var["object"])]] <- "stratum"
return(list(metrics = metrics, extents = extents))
} else {
return(metrics)
}
}
calculate_sp_balance <- function(object_split, sframe_split, ip, metrics, extents) {
# need to calculate the density of each row in sframe_split
if (all(st_geometry_type(sframe_split) %in% c("POINT", "MULTIPOINT"))) {
sframe_split$dens <- 1
} else if (all(st_geometry_type(sframe_split) %in% c("LINESTRING", "MULTILINESTRING"))) {
sframe_split$dens <- as.numeric(st_length(sframe_split))
} else if (all(st_geometry_type(sframe_split) %in% c("POLYGON", "MULTIPOLYGON"))) {
sframe_split$dens <- as.numeric(st_area(sframe_split))
}
# these inclusion probabilities must be provided for the entire sampling frame
if (is.null(ip)) {
sframe_split$ip <- nrow(object_split) * (sframe_split$dens / sum(sframe_split$dens))
} else {
sframe_split$ip <- sframe_split[[ip]]
}
# finding the sframe bounding box, this needs to be reordered to
# use in the Dirichlet Tesselation
pop_bbox <- st_bbox(sframe_split)[c("xmin", "xmax", "ymin", "ymax")]
# working with the object sites
## take the object sites coordinates
samp_coords <- st_coordinates(object_split)
## name them X and Y
colnames(samp_coords) <- c("X", "Y")
## isolate X
samp_xcoord <- samp_coords[, "X"]
## isolate Y
samp_ycoord <- samp_coords[, "Y"]
## recover the object size
n <- nrow(object_split)
# spatial balance with respect to the sframe
tiles <- tile.list(deldir(x = samp_xcoord, y = samp_ycoord, rw = pop_bbox))
## using lapply instead of a loop
sftess <- lapply(tiles, get_sftess)
sftess <- st_sfc(sftess, crs = st_crs(sframe_split))
sftess <- st_sf(poly = 1:n, geometry = sftess)
# next part required because st_intersection won't include points if they are on the
# exact boundary of a polygon (Voronoi polygon has this occur when a site is along the
# bounding box)
buffer_dist <- (pop_bbox["xmax"] - pop_bbox["xmin"]) * (pop_bbox["ymax"] - pop_bbox["ymin"]) / 1e12
sftess <- suppressWarnings(st_intersection(st_make_valid(sframe_split), st_buffer(sftess, buffer_dist)))
# scaling the inclusion probabilities by the amount of the geometry
# in the respective tesselation -- polydens can be at most one -- if the
# row of sframe is contained fully in the respective row of sframe
# polydens can be less than one -- when the row of sframe is not contained
# fully in the respective row of sframe
# for points, sframe geometry must be point or multipoint
if (all(st_geometry_type(sframe_split) %in% c("POINT", "MULTIPOINT"))) {
## storing a dummy variable to index counts by
sftess$polydens <- 1
sftess$adjip <- sftess$ip
} else if (all(st_geometry_type(sframe_split) %in% c("LINESTRING", "MULTILINESTRING"))) {
sftess$polydens <- as.numeric(st_length(sftess))
sftess$adjip <- sftess$ip * sftess$polydens / sftess$dens
} else if (all(st_geometry_type(sframe_split) %in% c("POLYGON", "MULTIPOLYGON"))) {
sftess$polydens <- as.numeric(st_area(sftess))
sftess$adjip <- sftess$ip * sftess$polydens / sftess$dens
}
## summing over each polygon
propextent <- with(sftess, tapply(adjip, poly, sum))
## setting NA's equal to zero
propextent[is.na(propextent)] <- 0
# making proportions and expected quantities
proportions <- propextent / sum(propextent) # which equals n
expected_proportions <- 1 / n
# metrics
values <- vapply(metrics, calculate_metric, double(1), proportions, expected_proportions)
# returning the results when stored by the user
output <- list(values = values)
if (extents) {
output$extent <- with(sftess, tapply(polydens, poly, sum))
}
output
}
get_sftess <- function(tile) {
## finding the number of points in the bounding polygon
npol <- length(tile$x)
## creating and returning the appropriate polygon after binding coords
st_polygon(list(cbind(
c(tile$x[1], tile$x[npol:1]),
c(tile$y[1], tile$y[npol:1])
)))
}
calculate_metric <- function(metric, proportions, expected_proportions) {
switch(metric,
pielou = calculate_pielou(proportions, expected_proportions),
simpsons = calculate_simpsons(proportions, expected_proportions),
rmse = calculate_rmse(proportions, expected_proportions),
mse = calculate_mse(proportions, expected_proportions),
mae = calculate_mae(proportions, expected_proportions),
medae = calculate_medae(proportions, expected_proportions),
chisq = calculate_chisq(proportions, expected_proportions),
stop("an invalid metric was provided")
)
}
calculate_pielou <- function(proportions, expected_proportions) {
pielou <- 1 + sum(proportions * log(proportions)) / log(1 / expected_proportions) # 1/E(p) = n$
names(pielou) <- "pielou"
pielou
}
calculate_simpsons <- function(proportions, expected_proportions) {
simpsons <- sum(proportions^2) - expected_proportions
names(simpsons) <- "simpsons"
simpsons
}
calculate_rmse <- function(proportions, expected_proportions) {
sqr_dev <- (proportions - expected_proportions)^2
mse <- sum(sqr_dev) / length(sqr_dev)
rmse <- sqrt(mse)
names(rmse) <- "rmse"
rmse
}
calculate_mse <- function(proportions, expected_proportions) {
sqr_dev <- (proportions - expected_proportions)^2
mse <- sum(sqr_dev) / length(sqr_dev)
names(mse) <- "mse"
mse
}
calculate_mae <- function(proportions, expected_proportions) {
mae <- mean(abs(proportions - expected_proportions) / expected_proportions)
names(mae) <- "mae"
mae
}
calculate_medae <- function(proportions, expected_proportions) {
medae <- median(abs(proportions - expected_proportions) / expected_proportions)
names(medae) <- "medae"
medae
}
calculate_chisq <- function(proportions, expected_proportions) {
chisq <- sum((proportions - expected_proportions)^2 / expected_proportions)
names(chisq) <- "chisq"
chisq
}