/
read-smn.R
248 lines (215 loc) · 9.54 KB
/
read-smn.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#' Reads a file from the SwissMetNet of MeteoSwiss
#'
#' @description
#' [SwissMetNet](https://www.meteoswiss.admin.ch/home/measurement-and-forecasting-systems/land-based-stations/automatisches-messnetz.html),
#' the automatic monitoring network of [MeteoSwiss](https://www.meteoschweiz.admin.ch) the Federal Office for
#' Meteorology and Climatology, comprises about 160 automatic monitoring stations. These stations deliver a multitude
#' of current data on the weather and climate in Switzerland every ten minutes. The monitoring network is supplemented
#' by automatic precipitation stations.
#'
#' This Function autodetects the delimeter and tries to find the interval. In Addition the time information in the
#' files is utc and end time. The time is converted to start time and the time zone defined trough the argument tz.
#' The argument time_shift provides a way to manuelly shift the time series. In this case *no* automatically shifting
#' is applied. The provided values is directly added to information in the file.
#'
#' @param fn path to input file
#' @param tz of the output data. Default "Etc/GMT-1"
#' @param encoding encoding of the data file. Default = "UTF-8"
#' @param time_shift a lubridate period to add to the time. Default NULL
#' @param time_format optional time_format. Use if auto detect fails. Default NULL
#' @param interval optional interval of the data. Use if auto detect fails. Default NULL. If used it is necessary to
#' define time_shift manuelly. lubridate::period(0) can be used for no shifting
#' @param na.rm remove na values. Default TRUE
#'
#' @return tibble in rOstluft long format structure
#'
#' @export
#'
#' @examples
#' input <- system.file("extdata", "smn.txt", package = "rOstluft.data", mustWork = TRUE)
#' read_smn(input)
#'
read_smn <- function(fn, tz = "Etc/GMT-1", encoding = "UTF-8", time_shift = NULL, time_format = NULL,
interval = NULL, na.rm = TRUE) {
locale <- readr::locale(encoding = encoding)
col_types <- readr::cols(
X1 = readr::col_character(),
X2 = readr::col_character(),
.default = readr::col_number()
)
# smn files tend to have different headers, but there are always the col name starting mit stn
header <- readr::read_lines(fn, n_max = 20)
start_line <- purrr::detect_index(header, ~ any(stringr::str_detect(., c("stn", "Sta."))))
if (start_line == 0) {
stop("couldn't find a line starting with stn")
}
col_names <- header[start_line]
# has the file an unit line?
if (stringr::str_count(header[start_line + 1], "\\[") > 0) {
skip <- start_line + 1
units <- header[start_line + 1]
units <- stringr::str_replace_all(units, "\\[|\\]", "")
} else {
skip <- start_line
units <- NULL
}
if (stringr::str_count(col_names, " ") > 4) {
col_names <- stringr::str_split(col_names, "\\s+")[[1]]
if (!is.null(units)) {
units <- stringr::str_split(units, "\\s+")[[1]]
}
data <- readr::read_table(fn, FALSE, col_types, locale, "-", skip, skip_empty_rows = TRUE)
} else if (stringr::str_count(col_names, ";") >= 2) {
col_names <- stringr::str_split(col_names, ";")[[1]]
if (!is.null(units)) {
units <- stringr::str_split(units, ";")[[1]]
}
data <- readr::read_delim(fn, ";", col_types = col_types, col_names = FALSE, locale = locale, na = "-",
skip = skip, skip_empty_rows = TRUE)
} else {
stop("couldn't detect delimiter")
}
# check for empty columnes
empty_cols <- which(col_names == "")
if(length(empty_cols) != 0) {
col_names<- col_names[-empty_cols]
data <- dplyr::select(data, -!!empty_cols)
}
# normalize col_names
col_names[stringr::str_detect(col_names, "Sta.")] <- "stn"
col_names[stringr::str_detect(col_names, "Date")] <- "time"
data <- rlang::set_names(data, col_names)
if (is.null(time_format)) {
time_length <- stringr::str_length(data$time[1]) # are we too clever with format detection?
time_format <- stringr::str_sub("%Y%m%d%H%M", 1, time_length - 2) # Y has 4 digits but only two in format -> -2
}
data <- dplyr::mutate(data,
time = lubridate::with_tz(lubridate::fast_strptime(data$time, time_format, lt = FALSE), tz)
)
if (nrow(data) < 2 && is.null(interval)) {
stop("couldn't detect interval. use argument interval")
} else if (is.null(interval)) {
duration <- lubridate::as.duration(data$time[2] - data$time[1])
interval <- lubridate::time_length(duration, unit = "minutes")
interval <- switch(as.character(interval), "10" = "min10", "30" = "min30", "60" = "h1", "1440" = "d1",
stop("couldn't detect interval. use argument interval"))
} else if (!lubridate::is.period(time_shift)) {
stop(stringr::str_c("If argument interval is used, time_shift is necessary! ",
"time_shift = lubridate::period(0) can be used for no shifting"))
} else {
interval <- interval
}
if (lubridate::is.period(time_shift)) {
data <- dplyr::mutate(data, time = .data$time + time_shift)
} else if (is.null(time_shift)) {
data <- dplyr::mutate(data, time = .data$time - duration)
} else {
stop("time_shift has to be a lubridate::period or NULL")
}
data <- tidyr::gather(data, "parameter", "value", -"time", -"stn", na.rm = na.rm)
data <- dplyr::mutate(data,
stn = forcats::as_factor(.data$stn),
parameter = forcats::as_factor(.data$parameter),
interval = forcats::as_factor(interval),
unit = factor(NA)
)
if (!is.null(units)) {
parameters <- utils::tail(col_names, -2) # the first two col_names are stn and time not parameters
units <- utils::tail(units, length(parameters)) # just take the number of parameters form the end
units <- rlang::set_names(units, parameters)
data <- dplyr::mutate(data, unit = dplyr::recode_factor(.data$parameter, !!!units))
}
dplyr::select(data, starttime = "time", site = "stn", "parameter", "interval", "unit", "value")
}
#' Reads a file from the SwissMetNet of MeteoSwiss containing multiple Exports
#'
#' @description
#' One peculiarity of MeteoSwiss is to provide multiple exports concatenated in one file, starting with a line
#' containing a single space and seperated by two newlines. This function reads the whole file into the memory, splits
#' the part and applies the read function to each part. Is the file too big for the available memory, the function
#' [split_smn()] can split the file into multiple files each containing one part. On Linux the commandline tool csplit
#' is another probably faster tool.
#'
#' @seealso
#' * [read_smn()]
#' * [split_smn()]
#' * [csplit](http://man7.org/linux/man-pages/man1/csplit.1.html) - split a file into sections determined by context
#' lines
#'
#' @param fn path to input file
#' @param as_list return a list with each part as element rather than one concatenated tibble
#' @param encoding encoding of the data file. Default = "UTF-8"
#' @param ... parameters passed to [read_smn()]
#'
#' @return tibble or list of tibbles in rOstluft long format structure
#'
#' @export
#'
#' @examples
#' fn <- system.file("extdata", "smn_multi.txt", package = "rOstluft.data", mustWork = TRUE)
#' read_smn_multiple(fn, as_list = TRUE)
#'
read_smn_multiple <- function(fn, as_list = FALSE, encoding = "UTF-8", ...) {
data <- readr::read_file(fn, readr::locale(encoding = encoding))
data <- stringr::str_split(data, "\r\n\r\n|\n\n") # line end conversion happens
data <- purrr::keep(data[[1]], ~ any(stringr::str_detect(., c("stn", "Sta.")))) # remove empty and chunks with only a space
data <- purrr::map(data, ~I(.))
data <- purrr::map(data, read_smn, encoding = encoding, ...)
if (isFALSE(as_list)) {
data <- dplyr::bind_rows(!!!data)
}
data
}
#' Splits a file from the SwissMetNet of MeteoSwiss containing multiple Exports
#'
#' @param fn path to input file
#' @param out_dir Output directory for parts
#' @param suffix Added suffix to file name as sprintf format string. Gets file counter as argument.
#' @param encoding Encoding of the input file
#'
#' @return NULL
#'
#' @export
split_smn <- function(fn, out_dir = NULL, suffix = "%03d.part", encoding = "UTF-8") {
if (is.null(out_dir)) {
out_dir <- fs::path_dir(file)
}
filename <- fs::path_file(fn)
con_in <- file(fn, open = "r", encoding = encoding)
file_count <- 1
line_count <- 0
out_fn <- fs::path(out_dir, filename, ext = sprintf(suffix, file_count))
fs::dir_create(out_dir)
con_out <- file(out_fn, open = "w", encoding = encoding)
#message("start splitting")
# read first line. drop it if whitespace, else keep it.
# without an additional chunk is genereated with files starting space\n\n
line <- readLines(con_in, n = 1, warn = FALSE)
if (stringr::str_trim(line) != "") {
writeLines(line, con_out)
}
while (length(line <- readLines(con_in, n = 1, warn = FALSE)) > 0) {
# new chunk -> new file
if ((line == "") && (line_count > 0)) {
close(con_out)
file_count <- file_count + 1
message(sprintf("Got %d lines in last chunk. start chunk nr %2d", line_count, file_count))
line_count <- 0
out_fn <- fs::path(out_dir, filename, ext = sprintf(suffix, file_count))
con_out <- file(out_fn, open = "w", encoding = encoding)
}
if (line != "") {
writeLines(line, con_out)
line_count <- line_count + 1
}
}
close(con_in)
close(con_out)
if (line_count > 0) {
message(sprintf("Got %d lines in last chunk. Finished file with %2d chunks", line_count, file_count))
} else {
fs::file_delete(out_fn)
message(sprintf("Last chunk is empty. Finished file with %2d chunks", file_count - 1))
}
invisible(NULL)
}