Merge pull request epiforecasts#433 from RichardMN/fix-colombia-data-…

…430-require-rsocrata Alternate fix colombia data 430 which makes RSocrata suggested
RichardMN · Feb 5, 2022 · a580b44 · a580b44
2 parents 3eae8ee + 500ae67
commit a580b44
Show file tree

Hide file tree

Showing 13 changed files with 153 additions and 42 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -101,6 +101,7 @@ Imports:
     stringi,
     stringr,
     tidyr (>= 1.0.0),
+    tidyselect,
     vroom,
     xml2
 Suggests:
@@ -109,6 +110,7 @@ Suggests:
     knitr,
     mockery,
     rmarkdown,
+    RSocrata,
     rvest,
     rworldmap,
     sf,

diff --git a/NAMESPACE b/NAMESPACE
@@ -71,6 +71,7 @@ importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,mutate_at)
 importFrom(dplyr,mutate_if)
+importFrom(dplyr,n)
 importFrom(dplyr,na_if)
 importFrom(dplyr,pull)
 importFrom(dplyr,recode)
@@ -94,6 +95,7 @@ importFrom(jsonlite,fromJSON)
 importFrom(lifecycle,deprecate_stop)
 importFrom(lubridate,as_date)
 importFrom(lubridate,dmy)
+importFrom(lubridate,dmy_hms)
 importFrom(lubridate,mdy)
 importFrom(lubridate,month)
 importFrom(lubridate,year)
@@ -132,6 +134,7 @@ importFrom(tidyr,pivot_longer)
 importFrom(tidyr,pivot_wider)
 importFrom(tidyr,replace_na)
 importFrom(tidyr,separate)
+importFrom(tidyselect,vars_select_helpers)
 importFrom(utils,download.file)
 importFrom(utils,untar)
 importFrom(vroom,vroom)

diff --git a/NEWS.md b/NEWS.md
@@ -20,6 +20,7 @@ This release is currently under development
 - Fixed a bug in the data sourced from Germany so that instead of treating it as a line list of individuals it is treated as a relatively finely resolved count data which needs to be summed up (by @sbfnk).
 - Fixed a bug in the Vietnam class due to `stringr` ([#448](https://github.com/epiforecasts/covidregionaldata/pull/448) by @RichardMN).
 - Fixed a bug with the Netherlands class were the lack of Hospitalisation data in the source was causing the class to fail ([#446](https://github.com/epiforecasts/covidregionaldata/pull/446) by @RichardMN).
+- Fixed an issue with the Colombia data and reduced dependencies by making `RSocrata` a suggested package ([#433](https://github.com/epiforecasts/covidregionaldata/pull/433) by @RichardMN).
 
 ## Depreciations
 

diff --git a/R/Colombia.R b/R/Colombia.R
@@ -3,7 +3,7 @@
 #'  and processing COVID-19 region data for Colombia
 #'
 # nolint start
-#' @source \url{https://github.com/danielcs88/colombia_covid-19/}
+#' @source \url{https://www.datos.gov.co/Salud-y-Protecci-n-Social/Casos-positivos-de-COVID-19-en-Colombia/gt2j-8ykr}
 # nolint end
 #' @export
 #' @concept dataset
@@ -21,63 +21,99 @@ Colombia <- R6::R6Class("Colombia",
     #' @field origin name of origin to fetch data for
     origin = "Colombia",
     #' @field supported_levels A list of supported levels.
-    supported_levels = list("1"),
+    supported_levels = list("1", "2"),
     #' @field supported_region_names A list of region names in order of level.
-    supported_region_names = list("1" = "departamento"),
+    supported_region_names = list(
+      "1" = "departamento",
+      "2" = "municipio"
+    ),
     #' @field supported_region_codes A list of region codes in order of level.
-    supported_region_codes = list("1" = "iso_3166_2"),
+    supported_region_codes = list(
+      "1" = "iso_3166_2",
+      "2" = "codigo_municipio"
+    ),
     #' @field common_data_urls List of named links to raw data.
     # nolint start
     common_data_urls = list(
-      "main" = "https://raw.githubusercontent.com/danielcs88/colombia_covid-19/master/datos/cronologia.csv"
+      "main" = "https://www.datos.gov.co/resource/gt2j-8ykr.csv?$select=fecha_diagnostico,ciudad_municipio"
     ),
     # nolint end
     #' @field source_data_cols existing columns within the raw data
-    source_data_cols = c("cases_total"),
+    source_data_cols = c("cases_new"),
     #' @field source_text Plain text description of the source of the data
-    source_text = "Daniel C\u00e1rdenas",
+    source_text = "Datos abiertos Colombia (Colombia open data)",
     #' @field source_url Website address for explanation/introduction of the
     #' data
-    source_url = "https://github.com/danielcs88/colombia_covid-19/",
+    source_url = "https://www.datos.gov.co/Salud-y-Protecci-n-Social/Casos-positivos-de-COVID-19-en-Colombia/gt2j-8ykr", # nolint
 
     #' @description Set up a table of region codes for clean data
     #' @importFrom dplyr mutate
     set_region_codes = function() {
-      self$codes_lookup$`1` <- covidregionaldata::colombia_codes
+      self$codes_lookup$`1` <- covidregionaldata::colombia_codes %>%
+        select(level_1_region, level_1_region_code) %>%
+        unique()
+      self$codes_lookup$`2` <- covidregionaldata::colombia_codes
+    },
+
+    #' @description Colombia specific download using Socrata API
+    #' This requires the suggested `RSocrata` package and stops with an
+    #' error asking for it to be installed if it is not available.
+    #' @importFrom dplyr select
+    download = function() {
+      message_verbose(self$verbose,
+                      "Downloading Colombia data. This may take a while.")
+      # RSocrata is only suggested, so check for it here; stop() reports it
+      if (requireNamespace("RSocrata", quietly = TRUE)) {
+        self$data$raw$main <- RSocrata::read.socrata(self$data_urls[["main"]])
+      } else {
+        stop("covidregionaldata::Colombia$download - requires RSocrata package.\n",
+             "Please run install.packages(\"RSocrata\")\n", call. = FALSE)
+      }
+    },
 
-    #' @description Colombia specific state level data cleaning
-    #' @importFrom dplyr select mutate
-    #' @importFrom lubridate ymd
+    #' @description Colombia specific data cleaning
+    #' @importFrom dplyr select mutate rename summarise group_by n
+    #' @importFrom lubridate dmy_hms as_date
     #' @importFrom stringr str_replace_all str_to_sentence str_to_title
     #' @importFrom rlang .data
     #'
     clean_common = function() {
       self$data$clean <- self$data$raw[["main"]] %>%
-        select(
-          date = .data$fecha,
-          level_1_region = .data$departamento,
-          cases_total = .data$casos
-        ) %>%
-        mutate(
-          date = ymd(.data$date),
-          level_1_region = iconv(.data$level_1_region,
-            from = "UTF-8",
-            to = "ASCII//TRANSLIT"
-          ),
-          level_1_region = str_replace_all(.data$level_1_region, " D.C.", ""),
-          level_1_region = str_replace_all(
-            .data$level_1_region,
-            "San Andres y Providencia",
-            "San Andres, Providencia y Santa Catalina"
-          ),
-          level_1_region = str_to_sentence(.data$level_1_region),
-          level_1_region = str_to_title(.data$level_1_region)
+        rename(
+          date = .data$fecha_diagnostico,
+          level_2_region_code = .data$ciudad_municipio
         ) %>%
+        group_by(date, level_2_region_code) %>%
+        summarise(cases_new = n(), .groups = "drop") %>%
+        mutate(date = as_date(dmy_hms(date)),
+               level_2_region_code = sprintf("%05d", level_2_region_code)) %>%
         left_join(
-          self$codes_lookup$`1`,
-          by = c("level_1_region" = "level_1_region")
+          self$codes_lookup$`2`,
+          by = c("level_2_region_code" = "level_2_region_code")
         )
+    },
+
+    #' @description Colombia Specific Department Level Data Cleaning
+    #'
+    #' Aggregates data to the level 1 (department) regional level. Data is
+    #' provided by the source at the level 2 (municipality) regional level.
+    #'
+    #' @importFrom dplyr group_by summarise across select
+    #' @importFrom tidyselect vars_select_helpers
+    clean_level_1 = function() {
+      self$data$clean <- self$data$clean %>%
+        select(-level_2_region_code, -level_2_region) %>%
+        group_by(
+          .data$date,
+          .data$level_1_region, .data$level_1_region_code
+        ) %>%
+        summarise(
+          across(
+            tidyselect::vars_select_helpers$where(is.numeric),
+            sum
+          ),
+          .groups = "drop" # return ungrouped data without the dplyr message
+        )
+    }
   )
 )
diff --git a/R/Vietnam.R b/R/Vietnam.R
@@ -104,7 +104,7 @@ Vietnam <- R6::R6Class("Vietnam",
           cases_total,
           deaths_total,
           recovered_total
-          ) %>%
+        ) %>%
         mutate(ncsc_region_code = as.numeric(ncsc_region_code)) %>%
         left_join(
           self$data$raw$provinces %>%
@@ -119,8 +119,6 @@ Vietnam <- R6::R6Class("Vietnam",
           level_1_region = str_replace_all(level_1_region,
                                         "TP HCM", "Hochiminh"),
         ) %>%
-        #
-        #tidyr::drop_na(date, region_name) %>%
         mutate(
           level_1_region = stri_trans_general(level_1_region, "ASCII"),
           level_1_region = stri_trim_both(level_1_region),

diff --git a/data-raw/colombia_codes.R b/data-raw/colombia_codes.R
@@ -22,7 +22,7 @@ level_1_region <- read_html(co_iso) %>%
   html_text()
 level_1_region <- level_1_region[1:33]
 
-colombia_codes <- data.frame(
+colombia_departments <- data.frame(
   level_1_region_code,
   level_1_region,
   stringsAsFactors = FALSE
@@ -37,7 +37,7 @@ colombia_codes <- data.frame(
 replacements <- list(
   "Distrito Capital De Bogota" = "Bogota"
 )
-colombia_codes <- colombia_codes %>%
+colombia_departments <- colombia_departments %>%
   mutate(
     level_1_region = ifelse(level_1_region %in% names(replacements),
       replacements[level_1_region],
@@ -46,5 +46,47 @@ colombia_codes <- colombia_codes %>%
     level_1_region = as.character(level_1_region)
   )
 
+# Download the list of municipalities (names, codes and parent department)
+# from the DANE 2005 census workbook
+
+colombia_municipalities_sheet <- download_excel(
+  "https://www.dane.gov.co/files/censo2005/provincias/subregiones.xls",
+  "colombia_municipalities.xls",
+  verbose = TRUE,
+  transpose = FALSE,
+  sheet = "Hoja1"
+) 
+  colombia_municipalities <- colombia_municipalities_sheet %>%
+  select(level_2_region = NOM_MPIO,
+         level_2_region_code = DPTOC_MPIO,
+         level_1_region = NOM_DEPTO) %>%
+  mutate(
+    level_1_region = stri_trans_general(level_1_region, "latin-ascii"),
+    level_1_region = stri_trim_both(level_1_region),
+    level_1_region = stringr::str_to_title(level_1_region),
+    level_1_region =
+      str_replace_all(.data$level_1_region,
+                      c(" D.c." = "",
+                        "Archipielago De San Andres"
+                        = "San Andres, Providencia Y Santa Catalina",
+                        "Norte Santander" = "Norte De Santander"
+                      )
+      ),
+    level_2_region = stri_trans_general(level_2_region, "latin-ascii"),
+    level_2_region = stri_trim_both(level_2_region),
+    level_2_region =
+      str_replace_all(.data$level_2_region,
+                      c(" D.C." = ""
+                      )
+      ),
+    level_2_region = stringr::str_to_title(level_2_region),
+  ) 
+
+
+# anti_join(colombia_municipalities, colombia_departments, by=c("level_1_region"))
+colombia_codes <- left_join(colombia_municipalities,
+                            colombia_departments,
+                            by=c("level_1_region"))
+
 # update package region_codes
 usethis::use_data(colombia_codes, overwrite = TRUE)
diff --git a/data/all_country_data.rda b/data/all_country_data.rda
diff --git a/data/colombia_codes.rda b/data/colombia_codes.rda
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -30,6 +30,7 @@ COVID
 covidregionaldata
 cre
 CSSE
+csv
 ctb
 currrently
 DataClass
@@ -132,6 +133,7 @@ rlang
 rmarkdown
 Roxygen
 RoxygenNote
+RSocrata
 rvest
 RVIM
 rworldmap
@@ -144,6 +146,7 @@ seperate
 shapefiles
 sherratt
 Sherratt
+Socrata
 sophie
 SouthAfrica
 spi

diff --git a/man/Colombia.Rd b/man/Colombia.Rd
diff --git a/man/colombia_codes.Rd b/man/colombia_codes.Rd
diff --git a/tests/testthat/custom_data/Colombia_level_1.rds b/tests/testthat/custom_data/Colombia_level_1.rds
diff --git a/tests/testthat/custom_data/Colombia_level_2.rds b/tests/testthat/custom_data/Colombia_level_2.rds