-
Notifications
You must be signed in to change notification settings - Fork 4
/
calculate_adi.R
663 lines (591 loc) · 29.5 KB
/
calculate_adi.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
#' Calculate ADI and ADI-3 from census data.
#'
#' Calculate the Area Deprivation Index and Berg Indices (ADI-3) using decennial
#' US census or American Community Survey (ACS) variables.
#'
#' The function [get_adi()] calls this function by default as its final step,
#' but some users may want to calculate ADI and ADI-3 values for different
#' combinations of areas in a given data set. [`get_adi`]`(raw_data_only =
#' TRUE)` returns the raw census data used to calculate ADI and ADI-3. Users may
#' select subsets of such a data set and pipe them into `calculate_adi()`.
#'
#' This function discerns what kind of census data `data_raw` contains (ACS, or
#' one of the decennial censuses) by checking for the existence of key variables
#' unique to each kind of data set.
#'
#' Areas listed as having zero households are excluded from ADI and ADI-3
#' calculation. Their resulting ADIs and ADI-3s will be `NA`. Areas whose
#' household count is itself missing (`NA`) are treated the same way.
#'
#' If calling this function directly (i.e., not via [get_adi()]) on a data set
#' that contains median household income (B19013_001) and does not contain
#' median family income (B19113_001), median household income will be used in
#' place of median family income, with a `warning()`. See the "Missingness and
#' imputation" section of [get_adi()].
#'
#' @seealso For more information, see [get_adi()], especially the sections
#'   titled **ADI and ADI-3 factor loadings** and **Missingness and
#'   imputation**.
#'
#' @param data_raw A data frame, [`tibble`][tibble::tibble], or [`sf`][sf::sf]
#'   object ultimately obtained via [tidycensus::get_acs()] or
#'   [tidycensus::get_decennial()], having the data necessary to compute the
#'   indicators of the ADI and ADI-3.
#'
#'   The columns of this data frame must be named according to the elements of
#'   the `variable` column in [`acs_vars`] and/or [`decennial_vars`].
#'
#'   The easiest way to obtain data like this is to run
#'   [`get_adi`]`(raw_data_only = TRUE)`.
#' @param keep_indicators Logical indicating whether or not to keep the
#'   component indicators of the ADI and ADI-3 as well as the original census
#'   variables used to calculate them. Defaults to `FALSE`.
#'
#'   See [`acs_vars`] and [`decennial_vars`] for basic descriptions of the raw
#'   census variables.
#'
#' @param seed Passed to the `seed` argument of [mice::mice()] when imputation
#'   is needed.
#'
#' @return A [`tibble`][tibble::tibble] (or [`sf`][sf::sf]) with the same number
#'   of rows as `data_raw`. Columns include `GEOID`, `NAME`, `ADI`,
#'   `Financial_Strength`, `Economic_Hardship_and_Inequality`, and
#'   `Educational_Attainment`. Further columns containing the indicators and raw
#'   values will also be present if `keep_indicators = TRUE`.
#'
#'   The result additionally has `"adi"` prepended to its class and carries a
#'   `"loadings"` attribute taken from the `"loadings"` attribute of its `ADI`
#'   column.
#'
#' @examples
#' \dontrun{
#' # Wrapped in \dontrun{} because these examples require a Census API key.
#'
#' raw_census <- get_adi("state", year = 2017, raw_data_only = TRUE)
#'
#' calculate_adi(raw_census)
#'
#' calculate_adi(raw_census, keep_indicators = TRUE)
#' }
#' @importFrom rlang .data
#' @export
calculate_adi <- function(data_raw, keep_indicators = FALSE, seed = NA) {

  if (!is.data.frame(data_raw)) {
    # Note the spelling `call.`: a misspelled argument name would be swallowed
    # by `...` and pasted onto the end of the error message.
    stop(
      "data must be a tibble, sf tibble, or data-frame-like object",
      call. = FALSE
    )
  }

  # Returns the name of the column in data_raw that contains data on total
  # number of households per area.
  total_hh_colname <- get_total_hh_colname(data_raw)

  # Logical vector indicating whether or not each area has a nonzero number of
  # households. A missing household count is mapped to FALSE rather than NA,
  # because this vector is used below as a subscript in assignments, where NA
  # subscripts are an error.
  total_hh <- data_raw[[total_hh_colname]]
  nonzero_hh_lgl <- !is.na(total_hh) & total_hh != 0

  # Calculate the 15 ADI indicators from the raw census data.
  indicators <- calculate_indicators(data_raw)

  # Filter the data to only include areas with more than zero households.
  indicators_hh_only <- indicators %>% dplyr::filter(!!nonzero_hh_lgl)

  if (nrow(indicators_hh_only) < 30L) {
    warning(
      "\nCalculating ADI and ADI-3 values from fewer than 30 locations.",
      "\nIt is recommended to add more in order to obtain trustworthy results."
    )
  }

  # Performs single imputation if there is any missingness in the data.
  if (anyNA(indicators_hh_only)) {

    indicators_hh_only <-
      tryCatch(
        indicators_hh_only %>%
          mice::mice(
            m = 1L,
            maxit = 50L,
            method = "pmm",
            seed = seed,
            printFlag = FALSE
          ) %>%
          mice::complete(),
        error = function(e) {
          # Re-signal as a classed error that carries both the (unimputed)
          # indicator data and the raw data, so the user can inspect them.
          rlang::abort(
            paste0(
              "Imputation unsuccessful. ",
              "Neither ADIs nor ADI-3s were calculated.",
              "\n\nRun rlang::last_error()$adi_indicators to access the ",
              "indicator data",
              "\nwhose missingness could not be imputed. These data exclude ",
              "areas with zero households.",
              "\n\nRun rlang::last_error()$adi_raw_data to access the raw ",
              "census data,\n",
              "which includes areas with zero households (identified by the ",
              'column named "', total_hh_colname, '").'
            ),
            class = "imputation_unsuccessful",
            adi_indicators =
              # 15 ADI indicators data, accessible via
              # rlang::last_error()$adi_indicators
              data_raw %>%
                dplyr::as_tibble() %>%
                dplyr::select("GEOID", dplyr::starts_with("NAME")) %>%
                dplyr::filter(!!nonzero_hh_lgl) %>%
                dplyr::bind_cols(indicators_hh_only),
            adi_raw_data =
              # Raw census data, which includes the areas with zero households,
              # accessible via rlang::last_error()$adi_raw_data
              data_raw %>%
                dplyr::select(
                  "GEOID",
                  dplyr::starts_with("NAME"),
                  !!total_hh_colname,
                  dplyr::everything()
                )
          )
        }
      )

    # Write the imputed values back into the full-length indicator table so
    # that keep_indicators = TRUE returns the values actually used.
    indicators[nonzero_hh_lgl, ] <- indicators_hh_only

    message("\nSingle imputation performed")
  }

  # Name of whichever median income indicator is present
  # (medianFamilyIncome/medianHouseholdIncome); the indicator-building helpers
  # make sure it is the first column name in indicators_hh_only.
  income_colname <- names(indicators_hh_only)[1L]

  # Iterate over four different sets of variables to create four different
  # indices. The first is the traditional ADI, the rest are the traditional ADI
  # indicators split into three meaningful categories: the Berg Indices, or
  # ADI-3. Each named vector gives the expected sign of each indicator's
  # loading, which calc_adi_col() uses to orient the resulting scores.
  adi <-
    purrr::map_dfc(
      list(
        ADI =
          unlist(
            rlang::list2(
              !!income_colname := -1,
              "medianMortgage" = -1,
              "medianRent" = -1,
              "medianHouseValue" = -1,
              "pctFamiliesInPoverty" = +1,
              "pctOwnerOccupiedHousing" = -1,
              "ratioThoseMakingUnder10kToThoseMakingOver50k" = +1,
              "pctPeopleLivingBelow150PctFederalPovertyLevel" = +1,
              "pctHouseholdsWithChildrenThatAreSingleParent" = +1,
              "pctHouseholdsWithNoVehicle" = +1,
              "pctPeopleWithWhiteCollarJobs" = -1,
              "pctPeopleUnemployed" = +1,
              "pctPeopleWithAtLeastHSEducation" = -1,
              "pctPeopleWithLessThan9thGradeEducation" = +1,
              "pctHouseholdsWithOverOnePersonPerRoom" = +1
            )
          ),
        Financial_Strength =
          unlist(
            rlang::list2(
              !!income_colname := +1,
              "medianMortgage" = +1,
              "medianRent" = +1,
              "medianHouseValue" = +1,
              "pctPeopleWithWhiteCollarJobs" = +1
            )
          ),
        Economic_Hardship_and_Inequality =
          c("pctFamiliesInPoverty" = +1,
            "pctOwnerOccupiedHousing" = -1,
            "ratioThoseMakingUnder10kToThoseMakingOver50k" = +1,
            "pctPeopleLivingBelow150PctFederalPovertyLevel" = +1,
            "pctHouseholdsWithChildrenThatAreSingleParent" = +1,
            "pctHouseholdsWithNoVehicle" = +1,
            "pctPeopleUnemployed" = +1),
        Educational_Attainment =
          c("pctPeopleWithAtLeastHSEducation" = +1,
            "pctPeopleWithLessThan9thGradeEducation" = -1,
            "pctHouseholdsWithOverOnePersonPerRoom" = -1)
      ),
      calc_adi_col,
      # Template result: one NA per input row; only rows with households are
      # filled in by calc_adi_col().
      result_vec = rep_len(NA_real_, length.out = length(nonzero_hh_lgl)),
      indicators_hh_only = indicators_hh_only,
      nonzero_hh_lgl = nonzero_hh_lgl
    )

  out <-
    if (keep_indicators) {
      dplyr::select(
        dplyr::bind_cols(data_raw, adi, indicators),
        1L,
        2L,
        !!colnames(adi),
        !!colnames(indicators),
        !!total_hh_colname,
        dplyr::everything()
      )
    } else {
      dplyr::bind_cols(data_raw[1L:2L], adi)
    }

  attr(out, "loadings") <- attr(out$ADI, "loadings")

  class(out) <- c("adi", class(out))

  out
}
# Computes one index column (the ADI or one of the ADI-3 indices).
#
# expected_signs:     named numeric vector of +1/-1; the names pick the
#                     indicator columns to analyze and the values give the
#                     direction each loading is expected to have.
# result_vec:         template vector, one element per area; only the
#                     positions flagged by nonzero_hh_lgl are overwritten.
# indicators_hh_only: indicator data for the areas that have households.
# nonzero_hh_lgl:     logical vector marking which elements of result_vec
#                     correspond to the rows of indicators_hh_only.
#
# Returns result_vec with the scores filled in and a "loadings" attribute (a
# tibble with columns `factor` and `loading`).
calc_adi_col <- function(expected_signs,
                         result_vec,
                         indicators_hh_only,
                         nonzero_hh_lgl) {

  # Principal-components analysis of the selected indicators, run with
  # psych::principal()'s default settings.
  pca <- psych::principal(indicators_hh_only[names(expected_signs)])
  pca_loadings <- pca$loadings

  # The PCA can come out mirrored, i.e. assigning low scores to deprived
  # areas. To detect this, each loading's sign is multiplied by its expected
  # sign: +1 means "agrees", -1 means "disagrees". The sign of the sum is then
  # a majority vote: +1 keeps the scores as they are, -1 flips them. Every
  # sign vector supplied by calculate_adi() has odd length, so the vote is not
  # expected to tie at 0.
  sign_agreement <- sign(pca_loadings) * expected_signs
  orientation <- sign(sum(sign_agreement))

  # Orient the raw scores, then rescale them (x 20, + 100) so that the index
  # is centered on 100 with a spread of 20.
  rescaled_scores <- as.numeric(pca$scores * orientation * 20 + 100)
  result_vec[nonzero_hh_lgl] <- rescaled_scores

  # Attach the loadings table so callers can report it alongside the index.
  loadings_tbl <-
    dplyr::tibble(
      factor = row.names(pca_loadings),
      loading = as.double(pca_loadings)
    )
  attr(result_vec, "loadings") <- loadings_tbl

  result_vec
}
# Finds the column of data_raw that holds the total number of households per
# area.
#
# data_raw: an object whose colnames() are census variable names.
#
# Returns the single matching column name. Signals an error unless exactly one
# of the recognized household-count variables is present.
get_total_hh_colname <- function(data_raw) {

  # The household-count variables recognized across the supported data sets.
  hh_variables <- c("P018001", "P015001", "P0030001", "B11005_001")

  matches <- intersect(x = colnames(data_raw), y = hh_variables)

  if (length(matches) == 1L) {
    return(matches)
  }

  # Zero matches or more than one match: the household count is ambiguous.
  stop(
    "As of sociome 1.1.0, the data set must have exactly one of the",
    "\nfollowing columns, so that zero-houshold areas may be removed:",
    "\n\nP018001, P015001, P0030001, B11005_001"
  )
}
# Builds the table of 15 ADI indicators from raw census data.
#
# The kind of data set (ACS, 2000 decennial census, or 1990 decennial census)
# is recognized by the presence of one marker variable per kind, checked in
# that order, and the matching factors_from_*() helper does the work.
#
# data: raw census data with census variable names as column names.
#
# Returns the indicator table with every non-finite value replaced by NA.
# Signals an error if none of the marker variables is present.
calculate_indicators <- function(data) {

  col_names <- colnames(data)
  has_column <- function(variable) any(col_names == variable)

  if (has_column("B17010_001")) {
    indicators <- factors_from_acs(data, col_names)
  } else if (has_column("P077001")) {
    indicators <- factors_from_2000_decennial(data)
  } else if (has_column("P107A001")) {
    indicators <- factors_from_1990_decennial(data)
  } else {
    stop("Data missing at least one variable necessary to calculate ADI.")
  }

  # Ratios with a zero denominator yield Inf or NaN; treat those as missing.
  not_finite <- !is.finite(as.matrix.data.frame(indicators))
  indicators[not_finite] <- NA

  indicators
}
# Selects the relevant variables from the tidycensus::get_acs() output, then
# wrangles them into a data frame that contains the specific measures that are
# used to calculate ADI and ADI-3.
#
# data_raw: raw census data whose columns are named after census variables.
# colnames: the column names of data_raw as they were when this function was
#           called. This vector is NOT updated after the renames performed
#           below, so every presence check refers to the original names.
#
# Returns a plain data frame with 15 indicator columns. The first column is
# named "medianFamilyIncome" or "medianHouseholdIncome", depending on which
# median income variable was available; calculate_adi() relies on it being
# first.
#' @importFrom rlang .data
factors_from_acs <- function(data_raw, colnames) {

  # In case data is an sf tibble, this causes the geometry column to become
  # "unsticky", allowing it to be removed by the subsequent dplyr::select()
  # command so that it doesn't interfere with the imputation that may follow.
  data_indicators <- as.data.frame(data_raw)

  # Before 2010, C24010_040 contained the "civilian female age 16+ in
  # white-collar occupations" data, but starting in 2010 C24010_039 contained
  # these data. If C24010_040 is in data_raw and C24010_039 is not, the former
  # is renamed to C24010_039. If they are both in the data a warning() is
  # thrown, C24010_040 is ignored, and only C24010_039 will be used. Since
  # get_adi() never passes both of these variables to calculate_adi(), this will
  # only be a problem if the user uses calculate_adi() manually.
  if (any(colnames == "C24010_040")) {
    if (any(colnames == "C24010_039")) {
      warning(
        "\nThe variables C24010_039 and C24010_040 are both present.",
        '\nC24010_039 will be used for "civilian females age 16+ in',
        '\nwhite-collar occupations", which is incorrect for pre-2010 data.',
        "\nIf seeking pre-2010 estimates, remove C24010_039 from dataset.",
        call. = FALSE,
        immediate. = TRUE
      )
    } else {
      data_indicators <- data_indicators %>%
        dplyr::rename("C24010_039" = "C24010_040")
    }
  }

  # B19113_001 (median family income) is unavailable at the block group level
  # for 2015 or 2016 data, so B19013_001 (median household income) is used
  # instead. If B19013_001 is in the data and B19113_001 is not, B19013_001 will
  # be renamed to B19113_001, and a warning will be thrown if get_adi()'s
  # version of the warning was not detected in "warnings()". The special
  # variable median_income_name is created so that the eventual output properly
  # identifies which kind of median income was used.
  if (any(colnames == "B19013_001") && !any(colnames == "B19113_001")) {
    # NOTE(review): warnings() reports the warnings stored by R from an
    # earlier top-level call, so this check may not see a warning raised by
    # get_adi() during the current call, and may see a stale one -- confirm
    # that the de-duplication behaves as intended.
    if (!any(grepl("B19113_001", names(warnings())))) {
      warning(
        "\nMedian household income (B19013_001) is being used in place of ",
        "\nmedian family income (B19113_001).",
        "\n\nThis is desirable only for 2015 or 2016 block group-level data.",
        '\n\nSee the "Missingness and imputation" section of ?get_adi, ',
        "as well as:",
        "\nhttps://www.census.gov/programs-surveys/acs/",
        "technical-documentation/user-notes/2016-01.html",
        call. = FALSE,
        immediate. = TRUE
      )
    }
    data_indicators <- data_indicators %>%
      dplyr::rename("B19113_001" = "B19013_001")
    median_income_name <- "medianHouseholdIncome"
  } else {
    median_income_name <- "medianFamilyIncome"
  }

  # The absence of B25003_001 (occupied housing units) is assumed to indicate
  # that ADI and ADI-3 calculated using 2010 decennial census data was
  # requested. This request results in mostly 2010 5-year ACS estimates being
  # used, except for the handful of 2010 decennial census variables that are
  # applicable to ADI and ADI-3 indicator calculation. This is handled by simply
  # renaming the decennial census variables to their ACS counterparts.
  if (!any(colnames == "B25003_001")) {
    data_indicators <- data_indicators %>%
      dplyr::rename(
        "B25003_001" = "H003002",
        "B25003_002" = "H014002",
        "B11005_002" = "P020002",
        "B11005_005" = "P020008"
      )
  }

  # Below, the presence or absence of various census variables are used to
  # discern which year of data is being used, and the calculation of the
  # indicators is adjusted accordingly.

  # When B23025_005 is absent, B23025_005 and B23025_003 (the numerator and
  # denominator of unemployedPct below) are rebuilt by summing the listed
  # B23001 cells.
  if (!any(colnames == "B23025_005")) {
    data_indicators <- data_indicators %>%
      dplyr::mutate(
        B23025_005 = .data$B23001_008 + .data$B23001_015 +
          .data$B23001_022 + .data$B23001_029 + .data$B23001_036 +
          .data$B23001_043 + .data$B23001_050 + .data$B23001_057 +
          .data$B23001_064 + .data$B23001_071 + .data$B23001_076 +
          .data$B23001_081 + .data$B23001_086 + .data$B23001_094 +
          .data$B23001_101 + .data$B23001_108 + .data$B23001_115 +
          .data$B23001_122 + .data$B23001_129 + .data$B23001_136 +
          .data$B23001_143 + .data$B23001_150 + .data$B23001_157 +
          .data$B23001_162 + .data$B23001_167 + .data$B23001_172,
        B23025_003 = .data$B23001_006 + .data$B23001_013 +
          .data$B23001_020 + .data$B23001_027 + .data$B23001_034 +
          .data$B23001_041 + .data$B23001_048 + .data$B23001_055 +
          .data$B23001_062 + .data$B23001_069 + .data$B23001_074 +
          .data$B23001_079 + .data$B23001_084 + .data$B23001_092 +
          .data$B23001_099 + .data$B23001_106 + .data$B23001_113 +
          .data$B23001_120 + .data$B23001_127 + .data$B23001_134 +
          .data$B23001_141 + .data$B23001_148 + .data$B23001_155 +
          .data$B23001_160 + .data$B23001_165 + .data$B23001_170
      )
  }

  # The education counts come from table B15002 when B15002_001 is present and
  # from table B15003 otherwise. In the B15002 case, B15003_001 is created as a
  # copy of B15002_001 so that the shared denominator used further down exists
  # in both cases.
  if (any(colnames == "B15002_001")) {
    data_indicators <- data_indicators %>%
      dplyr::mutate(
        Nless9thgrade = .data$B15002_003 + .data$B15002_020 +
          .data$B15002_004 + .data$B15002_021 + .data$B15002_005 +
          .data$B15002_022 + .data$B15002_006 + .data$B15002_023,
        Nhighschoolup = .data$B15002_011 + .data$B15002_028 +
          .data$B15002_012 + .data$B15002_029 + .data$B15002_013 +
          .data$B15002_030 + .data$B15002_014 + .data$B15002_031 +
          .data$B15002_015 + .data$B15002_032 + .data$B15002_016 +
          .data$B15002_033 + .data$B15002_017 + .data$B15002_034 +
          .data$B15002_018 + .data$B15002_035,
        B15003_001 = .data$B15002_001
      )
  } else {
    data_indicators <- data_indicators %>%
      dplyr::mutate(
        Nless9thgrade = .data$B15003_002 + .data$B15003_003 +
          .data$B15003_004 + .data$B15003_005 + .data$B15003_006 +
          .data$B15003_007 + .data$B15003_008 + .data$B15003_009 +
          .data$B15003_010 + .data$B15003_011 + .data$B15003_012,
        Nhighschoolup = .data$B15003_017 + .data$B15003_018 +
          .data$B15003_019 + .data$B15003_020 + .data$B15003_021 +
          .data$B15003_022 + .data$B15003_023 + .data$B15003_024 +
          .data$B15003_025
      )
  }

  # Compute the derived indicators. Later expressions in this mutate() refer to
  # columns created by earlier ones, so the order matters. Divisions by zero
  # are not guarded here; calculate_indicators() converts the resulting
  # non-finite values to NA afterwards.
  data_indicators %>%
    dplyr::mutate(
      Fpoverty = .data$B17010_002 / .data$B17010_001,
      OwnerOcc = .data$B25003_002 / .data$B25003_001,
      incomegreater50 = .data$B19001_011 + .data$B19001_012 +
        .data$B19001_013 + .data$B19001_014 + .data$B19001_015 +
        .data$B19001_016 + .data$B19001_017,
      IncomeDisparity = log(100 * (.data$B19001_002 / .data$incomegreater50)),
      less150poverty = .data$C17002_002 + .data$C17002_003 +
        .data$C17002_004 + .data$C17002_005,
      less150FPL = .data$less150poverty / .data$C17002_001,
      singlePchildren = .data$B11005_005 / .data$B11005_002,
      novehiclesum = .data$B25044_003 + .data$B25044_010,
      pnovehicle = .data$novehiclesum / .data$B25044_001,
      sumprofs = .data$C24010_003 + .data$C24010_039,
      whitecollar = .data$sumprofs / .data$C24010_001,
      unemployedPct = .data$B23025_005 / .data$B23025_003,
      Phighschoolup = .data$Nhighschoolup / .data$B15003_001,
      Pless9grade = .data$Nless9thgrade / .data$B15003_001,
      SUMcrowded = .data$B25014_005 + .data$B25014_006 +
        .data$B25014_007 + .data$B25014_011 + .data$B25014_012 +
        .data$B25014_013,
      Pcrowded = .data$SUMcrowded / .data$B25014_001
    ) %>%
    # Keep only the 15 indicators, under their descriptive names.
    dplyr::select(
      # Make sure to keep median_income_name as the first one so that the
      # medianFamilyIncome/medianHouseholdIncome check can be properly performed
      # later on.
      !!median_income_name := "B19113_001",
      "medianMortgage" = "B25088_002",
      "medianRent" = "B25064_001",
      "medianHouseValue" = "B25077_001" ,
      "pctFamiliesInPoverty" = "Fpoverty",
      "pctOwnerOccupiedHousing" = "OwnerOcc",
      "ratioThoseMakingUnder10kToThoseMakingOver50k" = "IncomeDisparity",
      "pctPeopleLivingBelow150PctFederalPovertyLevel" = "less150FPL",
      "pctHouseholdsWithChildrenThatAreSingleParent" = "singlePchildren",
      "pctHouseholdsWithNoVehicle" = "pnovehicle",
      "pctPeopleWithWhiteCollarJobs" = "whitecollar",
      "pctPeopleUnemployed" = "unemployedPct",
      "pctPeopleWithAtLeastHSEducation" = "Phighschoolup",
      "pctPeopleWithLessThan9thGradeEducation" = "Pless9grade",
      "pctHouseholdsWithOverOnePersonPerRoom" = "Pcrowded"
    )
}
# Builds the 15 ADI indicators from 2000 decennial census variables
# (calculate_indicators() routes data here when it contains P077001).
#
# data_raw: raw census data whose columns are named after census variables.
#
# Returns a plain data frame with 15 indicator columns, "medianFamilyIncome"
# first.
#' @importFrom rlang .data
factors_from_2000_decennial <- function(data_raw) {

  # Selects the relevant variables from the tidycensus::get_decennial() output,
  # then wrangles them into a data frame that contains the specific measures
  # that are used to calculate ADI and ADI-3
  data_raw %>%
    as.data.frame() %>%
    # In case data is an sf tibble, this causes the geometry column to become
    # "unsticky", allowing it to be removed by the subsequent dplyr::select()
    # command so that it doesn't interfere with the imputation that may follow.
    #
    # Later expressions in this mutate() refer to columns created by earlier
    # ones, so the order matters. Divisions by zero are not guarded here;
    # calculate_indicators() converts the resulting non-finite values to NA.
    dplyr::mutate(
      Fpoverty = .data$P090002 / .data$P090001,
      OwnerOcc = .data$H004002 / .data$H004001,
      incomegreater50 = .data$P052011 + .data$P052012 + .data$P052013 +
        .data$P052014 + .data$P052015 + .data$P052016 + .data$P052017,
      IncomeDisparity = log(100 * (.data$P052002 / .data$incomegreater50)),
      less150poverty = .data$P088002 + .data$P088003 + .data$P088004 +
        .data$P088005 + .data$P088006,
      less150FPL = .data$less150poverty / .data$P088001,
      singlePchildren = .data$P019005 / .data$P019002,
      novehiclesum = .data$H044003 + .data$H044010,
      pnovehicle = .data$novehiclesum / .data$H044001,
      sumprofs = .data$P050003 + .data$P050050,
      whitecollar = .data$sumprofs / .data$P050001,
      unemployedLabor = .data$P043007 + .data$P043014,
      allLabor = .data$P043005 + .data$P043012,
      unemployedPct = .data$unemployedLabor / .data$allLabor,
      Nhighschoolup = .data$P037011 + .data$P037028 + .data$P037012 +
        .data$P037029 + .data$P037013 + .data$P037030 + .data$P037014 +
        .data$P037031 + .data$P037015 + .data$P037032 + .data$P037016 +
        .data$P037033 + .data$P037017 + .data$P037034 + .data$P037018 +
        .data$P037035,
      Phighschoolup = .data$Nhighschoolup / .data$P037001,
      Nless9thgrade = .data$P037003 + .data$P037020 + .data$P037004 +
        .data$P037021 + .data$P037005 + .data$P037022 +
        .data$P037006 + .data$P037023,
      Pless9grade = .data$Nless9thgrade / .data$P037001,
      SUMcrowded = .data$H020005 + .data$H020006 + .data$H020007 +
        .data$H020011 + .data$H020012 + .data$H020013,
      Pcrowded = .data$SUMcrowded / .data$H020001
    ) %>%
    # Keep only the 15 indicators, under their descriptive names.
    dplyr::select(
      # Make sure to keep "medianFamilyIncome" as the first one so that the
      # medianFamilyIncome/medianHouseholdIncome check can be properly performed
      # later on.
      "medianFamilyIncome" = "P077001",
      "medianMortgage" = "H091001",
      "medianRent" = "H063001",
      "medianHouseValue" = "H085001" ,
      "pctFamiliesInPoverty" = "Fpoverty",
      "pctOwnerOccupiedHousing" = "OwnerOcc",
      "ratioThoseMakingUnder10kToThoseMakingOver50k" = "IncomeDisparity",
      "pctPeopleLivingBelow150PctFederalPovertyLevel" = "less150FPL",
      "pctHouseholdsWithChildrenThatAreSingleParent" = "singlePchildren",
      "pctHouseholdsWithNoVehicle" = "pnovehicle",
      "pctPeopleWithWhiteCollarJobs" = "whitecollar",
      "pctPeopleUnemployed" = "unemployedPct",
      "pctPeopleWithAtLeastHSEducation" = "Phighschoolup",
      "pctPeopleWithLessThan9thGradeEducation" = "Pless9grade",
      "pctHouseholdsWithOverOnePersonPerRoom" = "Pcrowded"
    )
}
# Builds the 15 ADI indicators from 1990 decennial census variables
# (calculate_indicators() routes data here when it contains P107A001).
#
# Unlike the ACS and 2000 versions, most denominators here are not read from a
# single "total" variable; they are assembled by adding the numerator to the
# remaining cells of the same table (e.g. personspovertydetermined,
# vehiclesdetermined, people25andover, crowdingdetermined).
#
# data_raw: raw census data whose columns are named after census variables.
#
# Returns a plain data frame with 15 indicator columns, "medianFamilyIncome"
# first.
#' @importFrom rlang .data
factors_from_1990_decennial <- function(data_raw) {

  data_raw %>%
    as.data.frame() %>%
    # In case data is an sf tibble, this causes the geometry column to become
    # "unsticky", allowing it to be removed by the subsequent dplyr::select()
    # command so that it doesn't interfere with the imputation that may follow.
    #
    # Later expressions in this mutate() refer to columns created by earlier
    # ones, so the order matters. Divisions by zero are not guarded here;
    # calculate_indicators() converts the resulting non-finite values to NA.
    dplyr::mutate(
      familybelowpoverty = .data$P1230013 + .data$P1230014 + .data$P1230015 +
        .data$P1230016 + .data$P1230017 + .data$P1230018 + .data$P1230019 +
        .data$P1230020 + .data$P1230021 + .data$P1230022 + .data$P1230023 +
        .data$P1230024,
      Fpoverty = .data$familybelowpoverty / .data$P0040001,
      OwnerOcc = .data$H0030001 / .data$H0020001,
      incomeunder10 = .data$P0800001 + .data$P0800002,
      incomeover50 = .data$P0800019 + .data$P0800020 + .data$P0800021 +
        .data$P0800022 + .data$P0800023 + .data$P0800024 + .data$P0800025,
      IncomeDisparity = log(100 * .data$incomeunder10 / .data$incomeover50),
      less150poverty = .data$P1210001 + .data$P1210002 + .data$P1210003 +
        .data$P1210004 + .data$P1210005,
      personspovertydetermined = .data$less150poverty + .data$P1210006 +
        .data$P1210007 + .data$P1210008 + .data$P1210009,
      less150FPL = .data$less150poverty / .data$personspovertydetermined,
      oneparent = .data$P0180002 + .data$P0180003,
      allchildren = .data$oneparent + .data$P0180001 + .data$P0180004 +
        .data$P0180005,
      singlePchildren = .data$oneparent / .data$allchildren,
      novehiclesum = .data$H0410001 + .data$H0410003,
      vehiclesdetermined = .data$novehiclesum + .data$H0410002 + .data$H0410004,
      pnovehicle = .data$novehiclesum / .data$vehiclesdetermined,
      sumprofs = .data$P0780001 + .data$P0780002,
      allcivilianemployed = .data$P0700002 + .data$P0700006,
      whitecollar = .data$sumprofs / .data$allcivilianemployed,
      unemployedlabor = .data$P0700003 + .data$P0700007,
      allLabor = .data$unemployedlabor + .data$allcivilianemployed,
      unemployedPct = .data$unemployedlabor / .data$allLabor,
      Nhighschoolup = .data$P0570003 + .data$P0570004 + .data$P0570005 +
        .data$P0570006 + .data$P0570007,
      people25andover = .data$Nhighschoolup + .data$P0570001 + .data$P0570002,
      Phighschoolup = .data$Nhighschoolup / .data$people25andover,
      Pless9grade = .data$P0570001 / .data$people25andover,
      SUMcrowded = .data$H0210003 + .data$H0210004 + .data$H0210005,
      crowdingdetermined = .data$SUMcrowded + .data$H0210001 + .data$H0210002,
      Pcrowded = .data$SUMcrowded / .data$crowdingdetermined
    ) %>%
    # Keep only the 15 indicators, under their descriptive names.
    dplyr::select(
      # Make sure to keep "medianFamilyIncome" as the first one so that the
      # medianFamilyIncome/medianHouseholdIncome check can be properly performed
      # later on.
      "medianFamilyIncome" = "P107A001",
      "medianMortgage" = "H052A001",
      "medianRent" = "H043A001",
      "medianHouseValue" = "H023B001",
      "pctFamiliesInPoverty" = "Fpoverty",
      "pctOwnerOccupiedHousing" = "OwnerOcc",
      "ratioThoseMakingUnder10kToThoseMakingOver50k" = "IncomeDisparity",
      "pctPeopleLivingBelow150PctFederalPovertyLevel" = "less150FPL",
      "pctHouseholdsWithChildrenThatAreSingleParent" = "singlePchildren",
      "pctHouseholdsWithNoVehicle" = "pnovehicle",
      "pctPeopleWithWhiteCollarJobs" = "whitecollar",
      "pctPeopleUnemployed" = "unemployedPct",
      "pctPeopleWithAtLeastHSEducation" = "Phighschoolup",
      "pctPeopleWithLessThan9thGradeEducation" = "Pless9grade",
      "pctHouseholdsWithOverOnePersonPerRoom" = "Pcrowded"
    )
}