-
Notifications
You must be signed in to change notification settings - Fork 1
/
find_date_cause.R
188 lines (164 loc) · 6.6 KB
/
find_date_cause.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#' Find the first date beyond a cutoff in several columns
#'
#' This function will find the first date in an orderd series of columns that
#' is either before or after a cutoff date, inclusive.
#'
#' @param x a data frame
#' @param ... an ordered series of date columns (i.e. the most important date
#' to be considered first).
#' @param datecol the name of the new column to contain the dates
#' @param datereason the name of the column to contain the name of the column
#' from which the date came.
#' @param period_start,period_end for the find_ functions, this should be the
#' name of a column in `x` that contains the start/end of the recall period.
#' For `constrain_dates`, this should be a vector of dates.
#' @param na_fill one of either "before" or "after" indicating that the new
#' column should only contain dates before or after the cutoff date.
#' @export
#'
#' @examples
#' d <- data.frame(
#' s1 = c(as.Date("2013-01-01") + 0:10, as.Date(c("2012-01-01", "2014-01-01"))),
#' s2 = c(as.Date("2013-02-01") + 0:10, as.Date(c("2012-01-01", "2014-01-01"))),
#' s3 = c(as.Date("2013-01-10") - 0:10, as.Date(c("2012-01-01", "2014-01-01"))),
#' ps = as.Date("2012-12-31"),
#' pe = as.Date("2013-01-09")
#' )
#' print(dd <- find_date_cause(d, s1, s2, s3, period_start = ps, period_end = pe))
#' print(bb <- find_date_cause(d, s1, s2, s3, period_start = ps, period_end = pe,
#' na_fill = "end",
#' datecol = "enddate",
#' datereason = "endcause"))
#' find_date_cause(d, s3, s2, s1, period_start = ps, period_end = pe)
#'
#' # works
#' assert_positive_timespan(dd, start_date, pe)
#'
#' # returns a warning because the last date isn't later than the start_date
#' assert_positive_timespan(dd, start_date, s2)
#'
#'
#' with(d, constrain_dates(s1, ps, pe))
#' with(d, constrain_dates(s2, ps, pe))
#' with(d, constrain_dates(s3, ps, pe))
#'
find_date_cause <- function(x,
...,
period_start = NULL,
period_end = NULL,
datecol = "start_date",
datereason = "start_date_reason",
na_fill = "start") {
na_fill <- match.arg(tolower(na_fill), c("start", "end"))
.dots <- tidyselect::vars_select(colnames(x), ...)
period_start <- rlang::enquo(period_start)
period_end <- rlang::enquo(period_end)
period_start <- tidyselect::vars_select(colnames(x), !! period_start)
period_end <- tidyselect::vars_select(colnames(x), !! period_end)
are_dates <- vapply(x[c(.dots, period_start, period_end)], inherits, logical(1), "Date")
if (!all(are_dates)) {
stop("All columns in ..., period_start, and period_end must be dates")
}
y <- x[c(.dots, period_start, period_end)]
# removing dates that don't conform
y <- dplyr::mutate_at(.tbl = y,
.vars = .dots,
.funs = constrain_dates,
period_start = y[[period_start]],
period_end = y[[period_end]],
boundary = na_fill)
y <- choose_first_good_date(y[.dots])
tibble::add_column(!! rlang::sym(datecol) := y[[1]],
!! rlang::sym(datereason) := y[[2]],
.data = x,
.before = .dots[[1]])
}
#' @rdname find_date_cause
#' @export
find_start_date <- function(x, ...,
period_start = NULL,
period_end = NULL,
datecol = "start_date",
datereason = "start_date_reason"
) {
find_date_cause(x, ...,
period_start = !! rlang::enquo(period_start),
period_end = !! rlang::enquo(period_end),
datecol = datecol,
datereason = datereason,
na_fill = "start")
}
#' @rdname find_date_cause
#' @export
find_end_date <- function(x, ...,
period_start = NULL,
period_end = NULL,
datecol = "end_date",
datereason = "end_date_reason"
) {
find_date_cause(x, ...,
period_start = !! rlang::enquo(period_start),
period_end = !! rlang::enquo(period_end),
datecol = datecol,
datereason = datereason,
na_fill = "end")
}
#' @rdname find_date_cause
#' @param i a vector of dates
#' @param boundary one of "both", "start", or "end". Dates outside of the
#' boundary will be set to NA.
#' @export
constrain_dates <- function(i, period_start, period_end, boundary = "both") {
boundary <- match.arg(boundary, c("both", "start", "end"))
nna <- !is.na(i)
too_early <- nna & i < period_start
too_late <- nna & i > period_end
if (boundary != "both") {
at_the_beginning <- boundary == "start"
trim <- if (at_the_beginning) too_early else too_late
repl <- if (at_the_beginning) period_start else period_end
i[trim] <- repl[trim]
}
i[nna & (i < period_start | i > period_end)] <- NA
i
}
#' Choose the first non-missing date from a data frame of dates
#'
#' @param date_a_frame a data frame where each column contains a different
#' parsing of the same date vector
#' @keywords internal
#' @noRd
#' @note: This function was written and modified by Zhian N. Kamvar and comes
#' from the linelist package,
choose_first_good_date <- function(date_a_frame) {
n <- nrow(date_a_frame)
res <- data.frame(the_date = rep(as.Date(NA), length = n),
the_col = character(n),
stringsAsFactors = FALSE
)
for (i in seq_len(n)) {
tmp <- date_a_frame[i, ]
suppressWarnings(nona <- min(which(!is.na(tmp))))
if (!is.finite(nona)) {
next
}
res$the_date[i] <- tmp[[nona]]
res$the_col[i] <- names(date_a_frame)[nona]
}
res
}
#' @param date_start,date_end column name of a date vector
#' @rdname find_date_cause
#' @export
assert_positive_timespan <- function(x, date_start, date_end) {
ds <- tidyselect::vars_select(colnames(x), !! rlang::enquo(date_start))
de <- tidyselect::vars_select(colnames(x), !! rlang::enquo(date_end ))
res <- x[[de]] - x[[ds]]
all_right <- all(res >= 0, na.rm = TRUE)
if (!all_right) {
y <- x[res < 0, , drop = FALSE]
warning(sprintf("%d rows had negative timespans", nrow(y)), immediate. = TRUE)
return(y)
}
return(invisible(NULL))
}