Skip to content

Commit

Permalink
Merge pull request epiforecasts#433 from RichardMN/fix-colombia-data-…
Browse files Browse the repository at this point in the history
…430-require-rsocrata

Alternate fix colombia data 430 which makes RSocrata suggested
  • Loading branch information
seabbs committed Feb 5, 2022
2 parents 3eae8ee + 500ae67 commit a580b44
Show file tree
Hide file tree
Showing 13 changed files with 153 additions and 42 deletions.
2 changes: 2 additions & 0 deletions DESCRIPTION
Expand Up @@ -101,6 +101,7 @@ Imports:
stringi,
stringr,
tidyr (>= 1.0.0),
tidyselect,
vroom,
xml2
Suggests:
Expand All @@ -109,6 +110,7 @@ Suggests:
knitr,
mockery,
rmarkdown,
RSocrata,
rvest,
rworldmap,
sf,
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Expand Up @@ -71,6 +71,7 @@ importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(dplyr,mutate_at)
importFrom(dplyr,mutate_if)
importFrom(dplyr,n)
importFrom(dplyr,na_if)
importFrom(dplyr,pull)
importFrom(dplyr,recode)
Expand All @@ -94,6 +95,7 @@ importFrom(jsonlite,fromJSON)
importFrom(lifecycle,deprecate_stop)
importFrom(lubridate,as_date)
importFrom(lubridate,dmy)
importFrom(lubridate,dmy_hms)
importFrom(lubridate,mdy)
importFrom(lubridate,month)
importFrom(lubridate,year)
Expand Down Expand Up @@ -132,6 +134,7 @@ importFrom(tidyr,pivot_longer)
importFrom(tidyr,pivot_wider)
importFrom(tidyr,replace_na)
importFrom(tidyr,separate)
importFrom(tidyselect,vars_select_helpers)
importFrom(utils,download.file)
importFrom(utils,untar)
importFrom(vroom,vroom)
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Expand Up @@ -20,6 +20,7 @@ This release is currently under development
- Fixed a bug in the data sourced from Germany so that, instead of being treated as a line list of individuals, it is treated as relatively finely resolved count data that needs to be summed (by @sbfnk).
- Fixed a bug in the Vietnam class due to `stringr` ([#448](https://github.com/epiforecasts/covidregionaldata/pull/448) by @RichardMN).
- Fixed a bug with the Netherlands class where the lack of hospitalisation data in the source was causing the class to fail ([#446](https://github.com/epiforecasts/covidregionaldata/pull/446) by @RichardMN).
- Fixed an issue with the Colombia data and reduced dependencies by making `RSocrata` be a suggested package ([#433](https://github.com/epiforecasts/covidregionaldata/pull/433) by @RichardMN).

## Deprecations

Expand Down
102 changes: 69 additions & 33 deletions R/Colombia.R
Expand Up @@ -3,7 +3,7 @@
#' and processing COVID-19 region data for Colombia
#'
# nolint start
#' @source \url{https://github.com/danielcs88/colombia_covid-19/}
#' @source \url{https://www.datos.gov.co/Salud-y-Protecci-n-Social/Casos-positivos-de-COVID-19-en-Colombia/gt2j-8ykr}
# nolint end
#' @export
#' @concept dataset
Expand All @@ -21,63 +21,99 @@ Colombia <- R6::R6Class("Colombia",
#' @field origin name of origin to fetch data for
origin = "Colombia",
#' @field supported_levels A list of supported levels.
supported_levels = list("1"),
supported_levels = list("1", "2"),
#' @field supported_region_names A list of region names in order of level.
supported_region_names = list("1" = "departamento"),
supported_region_names = list(
"1" = "departamento",
"2" = "municipio"
),
#' @field supported_region_codes A list of region codes in order of level.
supported_region_codes = list("1" = "iso_3166_2"),
supported_region_codes = list(
"1" = "iso_3166_2",
"2" = "codigo_municipio"
),
#' @field common_data_urls List of named links to raw data.
# nolint start
common_data_urls = list(
"main" = "https://raw.githubusercontent.com/danielcs88/colombia_covid-19/master/datos/cronologia.csv"
"main" = "https://www.datos.gov.co/resource/gt2j-8ykr.csv?$select=fecha_diagnostico,ciudad_municipio"
),
# nolint end
#' @field source_data_cols existing columns within the raw data
source_data_cols = c("cases_total"),
source_data_cols = c("cases_new"),
#' @field source_text Plain text description of the source of the data
source_text = "Daniel C\u00e1rdenas",
source_text = "Datos abiertos Colombia (Colombia open data)",
#' @field source_url Website address for explanation/introduction of the
#' data
source_url = "https://github.com/danielcs88/colombia_covid-19/",
source_url = "https://www.datos.gov.co/Salud-y-Protecci-n-Social/Casos-positivos-de-COVID-19-en-Colombia/gt2j-8ykr", # nolint

#' @description Set up a table of region codes for clean data
#' @importFrom dplyr mutate
set_region_codes = function() {
self$codes_lookup$`1` <- covidregionaldata::colombia_codes
self$codes_lookup$`1` <- covidregionaldata::colombia_codes %>%
select(level_1_region, level_1_region_code) %>%
unique()
self$codes_lookup$`2` <- covidregionaldata::colombia_codes
},

#' @description Colombia specific download using the Socrata API.
#' Requires the suggested `RSocrata` package; if it is not installed the
#' method stops with an error asking the user to install it (there is no
#' csv fallback).
#' @importFrom dplyr select
download = function() {
  message_verbose(self$verbose,
                  "Downloading Colombia data. This may take a while.")
  # RSocrata is only a suggested dependency, so check for it at run time.
  # quietly = TRUE: the error below already tells the user what to do, so
  # the loader's own messages would only add noise.
  if (!requireNamespace("RSocrata", quietly = TRUE)) {
    stop("covidregionaldata::Colombia$download - requires RSocrata package.\n",
         "Please run install.packages(\"RSocrata\")\n", call. = FALSE)
  }
  self$data$raw$main <- RSocrata::read.socrata(self$data_urls[["main"]])
},

#' @description Colombia specific state level data cleaning
#' @importFrom dplyr select mutate
#' @importFrom lubridate ymd
#' @description Colombia specific data cleaning
#' @importFrom dplyr select mutate rename summarise group_by n
#' @importFrom lubridate dmy_hms as_date
#' @importFrom stringr str_replace_all str_to_sentence str_to_title
#' @importFrom rlang .data
#'
clean_common = function() {
self$data$clean <- self$data$raw[["main"]] %>%
select(
date = .data$fecha,
level_1_region = .data$departamento,
cases_total = .data$casos
) %>%
mutate(
date = ymd(.data$date),
level_1_region = iconv(.data$level_1_region,
from = "UTF-8",
to = "ASCII//TRANSLIT"
),
level_1_region = str_replace_all(.data$level_1_region, " D.C.", ""),
level_1_region = str_replace_all(
.data$level_1_region,
"San Andres y Providencia",
"San Andres, Providencia y Santa Catalina"
),
level_1_region = str_to_sentence(.data$level_1_region),
level_1_region = str_to_title(.data$level_1_region)
rename(
date = .data$fecha_diagnostico,
level_2_region_code = .data$ciudad_municipio
) %>%
group_by(date, level_2_region_code) %>%
summarise(cases_new = n(), .groups = "drop") %>%
mutate(date = as_date(dmy_hms(date)),
level_2_region_code = sprintf("%05d", level_2_region_code)) %>%
left_join(
self$codes_lookup$`1`,
by = c("level_1_region" = "level_1_region")
self$codes_lookup$`2`,
by = c("level_2_region_code" = "level_2_region_code")
)
},

#' @description Colombia specific department level data cleaning
#'
#' The cleaned data held in `self$data$clean` is at the level 2
#' (municipality) regional level. This method drops the municipality
#' columns and sums every numeric column within each combination of date
#' and level 1 (department) region, storing the result back in
#' `self$data$clean`.
#'
#' @importFrom dplyr group_by summarise ungroup across select
#' @importFrom tidyselect vars_select_helpers
clean_level_1 = function() {
  # Remove the municipality identifiers so only department columns remain
  without_municipality <- select(
    self$data$clean,
    -level_2_region_code, -level_2_region
  )
  by_department <- group_by(
    without_municipality,
    .data$date,
    .data$level_1_region, .data$level_1_region_code
  )
  # Sum all numeric columns within each date/department group
  department_totals <- summarise(
    by_department,
    across(
      tidyselect::vars_select_helpers$where(is.numeric),
      sum
    )
  )
  self$data$clean <- ungroup(department_totals)
}
)
)
4 changes: 1 addition & 3 deletions R/Vietnam.R
Expand Up @@ -104,7 +104,7 @@ Vietnam <- R6::R6Class("Vietnam",
cases_total,
deaths_total,
recovered_total
) %>%
) %>%
mutate(ncsc_region_code = as.numeric(ncsc_region_code)) %>%
left_join(
self$data$raw$provinces %>%
Expand All @@ -119,8 +119,6 @@ Vietnam <- R6::R6Class("Vietnam",
level_1_region = str_replace_all(level_1_region,
"TP HCM", "Hochiminh"),
) %>%
#
#tidyr::drop_na(date, region_name) %>%
mutate(
level_1_region = stri_trans_general(level_1_region, "ASCII"),
level_1_region = stri_trim_both(level_1_region),
Expand Down
46 changes: 44 additions & 2 deletions data-raw/colombia_codes.R
Expand Up @@ -22,7 +22,7 @@ level_1_region <- read_html(co_iso) %>%
html_text()
level_1_region <- level_1_region[1:33]

colombia_codes <- data.frame(
colombia_departments <- data.frame(
level_1_region_code,
level_1_region,
stringsAsFactors = FALSE
Expand All @@ -37,7 +37,7 @@ colombia_codes <- data.frame(
replacements <- list(
"Distrito Capital De Bogota" = "Bogota"
)
colombia_codes <- colombia_codes %>%
colombia_departments <- colombia_departments %>%
mutate(
level_1_region = ifelse(level_1_region %in% names(replacements),
replacements[level_1_region],
Expand All @@ -46,5 +46,47 @@ colombia_codes <- colombia_codes %>%
level_1_region = as.character(level_1_region)
)

# Download the list of municipalities with their codes and departments
colombia_municipalities_sheet <- download_excel(
  "https://www.dane.gov.co/files/censo2005/provincias/subregiones.xls",
  "colombia_municipalities.xls",
  verbose = TRUE,
  transpose = FALSE,
  sheet = "Hoja1"
)

# Keep the municipality name, municipality code and department name, then
# normalise both name columns: transliterate to ASCII, trim whitespace,
# title-case, and apply the literal replacements listed below.
# NOTE(review): the department replacements presumably align the names with
# those in colombia_departments for the join below -- confirm.
colombia_municipalities <- colombia_municipalities_sheet %>%
  select(
    level_2_region = NOM_MPIO,
    level_2_region_code = DPTOC_MPIO,
    level_1_region = NOM_DEPTO
  ) %>%
  mutate(
    level_1_region = level_1_region %>%
      stri_trans_general("latin-ascii") %>%
      stri_trim_both() %>%
      stringr::str_to_title() %>%
      str_replace_all(
        c(
          " D.c." = "",
          "Archipielago De San Andres" =
            "San Andres, Providencia Y Santa Catalina",
          "Norte Santander" = "Norte De Santander"
        )
      ),
    # The " D.C." suffix is removed before title-casing here
    level_2_region = level_2_region %>%
      stri_trans_general("latin-ascii") %>%
      stri_trim_both() %>%
      str_replace_all(c(" D.C." = "")) %>%
      stringr::str_to_title()
  )

# To list municipalities whose department has no match, run:
# anti_join(colombia_municipalities, colombia_departments, by=c("level_1_region"))
colombia_codes <- colombia_municipalities %>%
  left_join(colombia_departments, by = "level_1_region")

# update package region_codes
usethis::use_data(colombia_codes, overwrite = TRUE)
Binary file modified data/all_country_data.rda
Binary file not shown.
Binary file modified data/colombia_codes.rda
Binary file not shown.
3 changes: 3 additions & 0 deletions inst/WORDLIST
Expand Up @@ -30,6 +30,7 @@ COVID
covidregionaldata
cre
CSSE
csv
ctb
currrently
DataClass
Expand Down Expand Up @@ -132,6 +133,7 @@ rlang
rmarkdown
Roxygen
RoxygenNote
RSocrata
rvest
RVIM
rworldmap
Expand All @@ -144,6 +146,7 @@ seperate
shapefiles
sherratt
Sherratt
Socrata
sophie
SouthAfrica
spi
Expand Down
32 changes: 29 additions & 3 deletions man/Colombia.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/colombia_codes.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified tests/testthat/custom_data/Colombia_level_1.rds
Binary file not shown.
Binary file added tests/testthat/custom_data/Colombia_level_2.rds
Binary file not shown.

0 comments on commit a580b44

Please sign in to comment.