/
na_mean.R
253 lines (216 loc) · 7.85 KB
/
na_mean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
#' @title Missing Value Imputation by Mean Value
#'
#' @description Missing value replacement by mean values. Different means
#' like median, mean, mode possible.
#'
#' @param x Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object in which missing values shall be replaced
#'
#' @param option Algorithm to be used. Accepts the following input:
#' \itemize{
#' \item{"mean" - take the mean for imputation (default choice)}
#' \item{"median" - take the median for imputation}
#' \item{"mode" - take the mode for imputation}
#' \item{"harmonic" - take the harmonic mean}
#' \item{"geometric" - take the geometric mean}
#' }
#'
#' @param maxgap Maximum number of successive NAs to still perform imputation on.
#' Default setting is to replace all NAs without restrictions. With this
#' option set, consecutive NAs runs, that are longer than 'maxgap' will
#' be left NA. This option mostly makes sense if you want to
#' treat long runs of NA afterwards separately.
#'
#' @return Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object (dependent on given input at parameter x)
#'
#' @details Missing values get replaced by overall mean values. The function
#' calculates the mean, median, mode, harmonic or geometric mean over all the non-NA
#' values and replaces all NAs with this value. Option 'mode' replaces NAs with
#' the most frequent value in the time series. If two or more values occur equally frequent,
#' the function imputes the lower value. Due to their calculation formula geometric and harmonic
#' mean are not well defined for negative values or zero values in the input series.
#'
#' In general using the mean for imputation imputation is mostly a suboptimal choice and should
#' be handled with great caution.
#'
#' @author Steffen Moritz
#'
#' @seealso \code{\link[imputeTS]{na_interpolation}},
#' \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}},
#' \code{\link[imputeTS]{na_ma}},
#' \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}},
#' \code{\link[imputeTS]{na_seadec}}, \code{\link[imputeTS]{na_seasplit}}
#'
#' @examples
#' # Prerequisite: Create Time series with missing values
#' x <- ts(c(2, 3, 4, 5, 6, NA, 7, 8))
#'
#' # Example 1: Perform imputation with the overall mean
#' na_mean(x)
#'
#' # Example 2: Perform imputation with overall median
#' na_mean(x, option = "median")
#'
#' # Example 3: Same as example 1, just written with pipe operator
#' x %>% na_mean()
#' @importFrom magrittr %>%
#' @importFrom stats median ts
#' @export
#'
na_mean <- function(x, option = "mean", maxgap = Inf) {
# Variable 'data' is used for all transformations to the time series
# 'x' needs to stay unchanged to be able to return the same ts class in the end
data <- x
#----------------------------------------------------------
# Mulivariate Input
# The next 20 lines are just for checking and handling multivariate input.
#----------------------------------------------------------
# Check if the input is multivariate
if (!is.null(dim(data)[2]) && dim(data)[2] > 1) {
# Go through columns and impute them by calling this function with univariate input
for (i in 1:dim(data)[2]) {
if (!anyNA(data[, i])) {
next
}
# if imputing a column does not work - mostly because it is not numeric - the column is left unchanged
tryCatch(
data[, i] <- na_mean(data[, i], option, maxgap),
error = function(cond) {
warning(paste(
"na_mean: No imputation performed for column", i, "of the input dataset.
Reason:", cond[1]
), call. = FALSE)
}
)
}
return(data)
}
#----------------------------------------------------------
# Univariate Input
# All relveant imputation / pre- postprocessing code is within this part
#----------------------------------------------------------
else {
missindx <- is.na(data)
##
## 1. Input Check and Transformation
##
# 1.1 Check if NAs are present
if (!anyNA(data)) {
return(x)
}
# 1.2 special handling data types
if (any(class(data) == "tbl")) {
data <- as.vector(as.data.frame(data)[, 1])
}
# 1.3 Check for algorithm specific minimum amount of non-NA values
if (all(missindx)) {
stop("Input data has only NA values. At least 1 non-NA data point required in the time series to apply na_mean.")
}
# 1.4 Checks and corrections for wrong data dimension
# Check if input dimensionality is not as expected
if (!is.null(dim(data)[2]) && !dim(data)[2] == 1) {
stop("Wrong input type for parameter x.")
}
# Altering multivariate objects with 1 column (which are essentially
# univariate) to be dim = NULL
if (!is.null(dim(data)[2])) {
data <- data[, 1]
}
# 1.5 Check if input is numeric
if (!is.numeric(data)) {
stop("Input x is not numeric.")
}
##
## End Input Check and Transformation
##
##
## 2. Imputation Code
##
if (option == "median") {
# Use Median
median <- stats::median(data, na.rm = TRUE)
data[missindx] <- median
}
else if (option == "mode") {
# Calculate Mode
temp <- table(as.vector(data))
mode <- names(temp)[temp == max(temp)]
mode <- (as.numeric(mode))[1]
data[missindx] <- mode
}
else if (option == "mean") {
# Use arithmetic Mean
mean <- mean(data, na.rm = TRUE)
data[missindx] <- mean
}
else if (option == "geometric") {
# Use geometric Mean
# Check preconditions
if (any(data == 0 | data < 0, na.rm = T)) {
stop(
"The input data contains 0 and/or negative values.\n",
"The geometric and harmonic mean are not well defined for these cases.\n",
"Please another option like e.g. option = 'mean' in this case."
)
}
mean <- exp(mean(log(data), na.rm = TRUE))
data[missindx] <- mean
}
else if (option == "harmonic") {
# Use harmonic Mean
# Check preconditions
if (any(data == 0 | data < 0, na.rm = T)) {
stop(
"The input data contains 0 and/or negative values.\n",
"The geometric and harmonic mean are not well defined for these cases.\n",
"Please another option like e.g. option = 'mean' in this case."
)
}
mean <- 1 / mean(1 / data, na.rm = TRUE)
data[missindx] <- mean
}
else {
stop("Wrong 'option' parameter given, must be either: \n'mean', 'mode', 'median', 'harmonic' or 'geometric'.")
}
##
## End Imputation Code
##
##
## 3. Post Processing
##
# 3.1 Check for Maxgap option
# If maxgap = Inf then do nothing and when maxgap is lower than 0
if (is.finite(maxgap) && maxgap >= 0) {
# Get logical vector of the time series via is.na() and then get the
# run-length encoding of it. The run-length encoding describes how long
# the runs of FALSE and TRUE are
rlencoding <- rle(is.na(x))
# Runs smaller than maxgap (which shall still be imputed) are set FALSE
rlencoding$values[rlencoding$lengths <= maxgap] <- FALSE
# The original vector is being reconstructed by reverse.rls, only now the
# longer runs are replaced now in the logical vector derived from is.na()
# in the beginning all former NAs that are > maxgap are also FALSE
en <- inverse.rle(rlencoding)
# Set all positions in the imputed series with gaps > maxgap to NA
# (info from en vector)
data[en == TRUE] <- NA
}
##
## End Post Processing
##
##
## 4. Final Output Formatting
##
# Give back the object originally supplied to the function
# (necessary for multivariate input with only 1 column)
if (!is.null(dim(x)[2])) {
x[, 1] <- data
return(x)
}
##
## End Final Output Formatting
##
return(data)
}
}