/
calculate-gfr.R
169 lines (147 loc) · 7.33 KB
/
calculate-gfr.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
rm(list = ls(all.names = TRUE)) # Clear the memory of variables from previous run. This is not called by knitr, because it's above the first chunk.
# ---- load-sources ------------------------------------------------------------
# Call `base::source()` on any repo file that defines functions needed below. Ideally, no real operations are performed.
# ---- load-packages -----------------------------------------------------------
# Attach these packages so their functions don't need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path
# Import only certain functions of a package into the search path.
# import::from("magrittr", "%>%")
# Verify these packages are available on the machine, but their functions need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path
requireNamespace("readr" )
# requireNamespace("tidyr" )
requireNamespace("dplyr" )
requireNamespace("testit" )
requireNamespace("checkmate" ) # For asserting conditions meet expected patterns/conditions. # remotes::install_github("mllg/checkmate")
# requireNamespace("OuhscMunge" ) # remotes::install_github(repo="OuhscBbmc/OuhscMunge")
# ---- declare-globals ---------------------------------------------------------
input_path_birth_count_county_month <- "datasets/raw/birth-count-county.csv"
input_path_census_county_month <- "datasets/derived/census-county-month.csv"
output_path_birth_count_county_month_csv_2014 <- "datasets/derived/county-month-birth-rate-2014-version.csv"
output_path_birth_count_county_month_csv_2005 <- "datasets/derived/county-month-birth-rate-2005-version.csv"
output_path_birth_count_county_month_rda_2014 <- "data/county_month_birth_rate_2014_version.rda"
output_path_birth_count_county_month_rda_2005 <- "data/county_month_birth_rate_2005_version.rda"
change_month <- as.Date("1996-02-15")
# OuhscMunge::readr_spec_aligned(input_path_census_county_month)
col_types_census <-
readr::cols_only(
`fips` = readr::col_character(),
`county_name` = readr::col_character(),
`year` = readr::col_integer(),
`month` = readr::col_integer(),
`fecund_population` = readr::col_integer()
)
# OuhscMunge::readr_spec_aligned(input_path_birth_count_county_month)
col_types_birth_count <- readr::cols_only(
`county_name` = readr::col_character(),
`year` = readr::col_integer(),
`month` = readr::col_integer(),
`birth_count` = readr::col_integer()
)
# ---- load-data ---------------------------------------------------------------
ds_census <- readr::read_csv(input_path_census_county_month , col_types = col_types_census)
ds_birth_count <- readr::read_csv(input_path_birth_count_county_month, col_types = col_types_birth_count)
# ---- tweak-data --------------------------------------------------------------
# OuhscMunge::column_rename_headstart(ds_census) # Help write `dplyr::select()` call.
ds_census <-
ds_census |>
dplyr::select( # `dplyr::select()` drops columns not included.
fips,
county_name,
year,
month,
fecund_population,
)
# ---- groom-birth-count -------------------------------------------------------
# OuhscMunge::column_rename_headstart(ds_birth_count) # Help write `dplyr::select()` call.
ds_birth_count <-
ds_birth_count |>
dplyr::select( # `dplyr::select()` drops columns not included.
county_name,
year,
month,
birth_count,
) |>
dplyr::mutate(
dummy = TRUE,
year = 1900L + year,
county_name =
dplyr::case_match(
county_name,
"canadian" ~ "canadian",
"clevelan" ~ "cleveland",
"comanche" ~ "comanche",
"creek" ~ "creek",
"logan" ~ "logan",
"mcclain" ~ "mcclain",
"oklahoma" ~ "oklahoma",
"osage" ~ "osage",
"pottawat" ~ "pottawatomie",
"rogers" ~ "rogers",
"tulsa" ~ "tulsa",
"wagoner" ~ "wagoner"
),
)
# ---- join --------------------------------------------------------------------
ds_county_month <-
ds_census |>
dplyr::left_join(
ds_birth_count,
by = c("county_name", "year", "month")
) |>
dplyr::mutate(
date = as.Date(ISOdate(year, month, 15L)),
days_in_month = lubridate::days_in_month(date),
days_in_year = as.integer(365L + lubridate::leap_year(date)),
stage_id = dplyr::if_else(date < change_month, 1L, 2L), # Define pre/post bombing stages (+9 months)
)
testit::assert("All left records should find a right record", all(ds_county_month$dummy))
ds_county_month$dummy <- NULL
rm(ds_census, ds_birth_count, input_path_census_county_month, input_path_birth_count_county_month)
# ---- calculate-gfr -----------------------------------------------------------
# Calculate GFR for the 2005 and the 2014 Versions
ds_county_month_2014 <- ds_county_month # This is what fertility researchers should use.
ds_county_month_2005 <- ds_county_month # This is better for 2014 article, and recreates the 2005 article.
#The 2014 version uses the interpolated
ds_county_month_2014 <-
ds_county_month_2014 |>
dplyr::mutate(
# Adjust for months of unequal days. Each monthly record is multiplied by about 12.
birth_rate_monthly = birth_count / fecund_population * 1000L,
birth_rate = birth_rate_monthly * days_in_year / days_in_month,
) |>
dplyr::mutate(
birth_rate_monthly = round(birth_rate_monthly , 2),
birth_rate = round(birth_rate , 2),
)
# To recreate the 2005 paper, use only the 1990 estimate.
ds_county_month_2005 <-
ds_county_month_2005 |>
dplyr::group_by(fips) |>
dplyr::mutate(
birth_rate_monthly = (birth_count / fecund_population[1] * 1000L),
birth_rate = birth_rate_monthly * days_in_year / days_in_month,
) |>
dplyr::ungroup() |>
dplyr::mutate(
birth_rate_monthly = round(birth_rate_monthly , 2),
birth_rate = round(birth_rate , 2),
)
# library(ggplot2)
# ggplot(ds_county_month, aes(x=date, y=birth_rate, color=factor(fips))) + geom_line() + labs(title="Distributions of County Fertility")
# ggplot(ds_county_month, aes(x=birth_rate, color=factor(fips))) + geom_density()
# filePathOutcomes <- file.path(devtools::inst(name="Wats"), "extdata", "BirthRatesOk.txt")
# dsOld <- read.table(file=filePathOutcomes, header=TRUE, sep="\t", stringsAsFactors=FALSE)
# dsOld$date <- as.Date(dsOld$date) + days(15)
#
# ggplot(ds_county_month[ds_county_month$fips==40109, ], aes(x=date, color=factor(fips))) +
# geom_line(aes(y=birth_rate), color="tomato") +
# geom_line(aes(y=BirthRateUnadjustedFrom1990), color="blue") +
# geom_line(mapping=aes(y=birth_rate), data=dsOld, color="green")
###################
# Write to disk
###################
county_month_birth_rate_2014_version <- ds_county_month_2014
county_month_birth_rate_2005_version <- ds_county_month_2005
readr::write_csv(county_month_birth_rate_2014_version, file = output_path_birth_count_county_month_csv_2014)
readr::write_csv(county_month_birth_rate_2005_version, file = output_path_birth_count_county_month_csv_2005)
base::save( county_month_birth_rate_2014_version, file = output_path_birth_count_county_month_rda_2014, compress="xz")
base::save( county_month_birth_rate_2005_version, file = output_path_birth_count_county_month_rda_2005, compress="xz")