Merge pull request #202 from OHI-Science/dev

merging into master
OHI-Science · Nov 27, 2023 · 7aba6a5 · 7aba6a5
2 parents 3bb30b7 + 15de3af
commit 7aba6a5
Show file tree

Hide file tree

Showing 37 changed files with 1,199 additions and 60 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -25,7 +25,8 @@ Imports:
     tidyr,
     RColorBrewer,
     htmlwidgets,
-    here
+    here,
+    rlang
 Suggests:
     knitr,
     rmarkdown
@@ -40,6 +41,7 @@ Collate:
     'CheckLayers.R'
     'Conf.R'
     'Layers.R'
+    'layers_eez_base_updater.R'
     'PlotFlower.R'
     'ScoreScaling.R'
     'SelectLayersData.R'
@@ -56,9 +58,10 @@ Collate:
     'rgn_eez_area.R'
     'sovregion_labels.R'
     'sovregions.R'
+    'split_regions.R'
 URL: https://github.com/OHI-Science/ohicore
 BugReports: https://github.com/OHI-Science/ohicore/issues
 LazyData: TRUE
 VignetteBuilder: knitr
-RoxygenNote: 6.1.1
+RoxygenNote: 7.2.3
 Encoding: UTF-8
diff --git a/NAMESPACE b/NAMESPACE
@@ -15,13 +15,15 @@ export(SelectLayersData)
 export(WriteRefPoint)
 export(collapse_2_rgn)
 export(compare_scores_df)
+export(layers_eez_base_updater)
 export(mapvalues)
 export(name_2_rgn)
 export(read_git_csv)
 export(score.clamp)
 export(score.max)
 export(score.rescale)
 export(score_check)
+export(split_regions)
 export(trace_git_csv_value)
 exportClasses(Conf)
 exportClasses(Layers)

diff --git a/R/layers_eez_base_updater.R b/R/layers_eez_base_updater.R
@@ -0,0 +1,118 @@
+#' Update layers_eez_base.csv
+#' By Peter Menzies
+#' 
+#' Convenient interface for changing directory paths of recently updated layers in layers_eez_base.csv
+#' No arguments required - user will be prompted for needed information
+#' 
+#'  
+#' @keywords ohi
+#' @export
+
+
+layers_eez_base_updater <- function() {
+
+  require(here)
+  require(tidyverse)
+
+  # read in the csv
+  layers_eez_base <- read_csv(here("metadata_documentation/layers_eez_base.csv"), col_types = cols())
+
+  # prompt user for current version year (gsub() to remove 'v' if user adds it as well)
+  message("")
+  version_year <- paste0("v", gsub("\\D", "", readline(prompt = "enter version year: "))) 
+
+  # empty vector that will later contain layer names - used in while loop
+  possible_layers <- c()
+
+  # loop that ends when a viable goal/subgoal abbr is supplied
+  while (length(possible_layers) == 0) {
+
+    message("")
+    goal <- readline(prompt = "enter the goal/subgoal/prs/res abbreviation for the layers you're updating (e.g. 'np', 'hab', or 'cc'): ") %>% 
+      tolower()
+
+    possible_layers <- layers_eez_base$layer[startsWith(layers_eez_base$layer, goal)]
+
+    if (length(possible_layers) == 0) {
+      message("\nthere are no layers starting with that abbreviation\n")
+    }
+  }
+
+  # the component layers of that goal/subgoal are printed below for user's convenience
+  message("\nthese are the layers associated with that abbreviation:\n") 
+  print(possible_layers)
+  message("\nif you want to update all of these layers, enter 'all' at the next prompt —")
+  message("if you are only updating certain ones you can copypaste the layer names above separated by commas\n")
+
+
+  # prompt user for layers which have been updated
+  updated_layers <- str_split(readline(prompt = "enter 'all' or layers separated only by commas: "), ",")[[1]] %>% 
+    str_remove_all(" ")
+
+  if (tolower(updated_layers[1]) == "all") {
+    updated_layers <- possible_layers
+  } 
+
+
+  # loop that executes or repeats if entries don't match any of the layer names in the goal / subgoal
+  while (length(intersect(updated_layers, possible_layers)) < length(updated_layers)) {
+
+    unknown_layers <- setdiff(updated_layers, possible_layers)
+
+    if (length(unknown_layers) == length(updated_layers)) {
+      message("\nnone of the layers entered coincide with the chosen abbreviation\n")
+    } else if (length(unknown_layers) == 1) {
+      message(paste0("\nthe following entry does not coincide with the chosen abbreviation: \n"))
+      print(unknown_layers)
+      message("")
+    } else {
+      message("\nthe following entries do not coincide with the chosen abbreviation: \n")
+      print(unknown_layers)
+      message("")
+    }
+
+    updated_layers <- str_split(readline(prompt = "enter 'all' or layers separated only by commas: "), ",")[[1]] %>% 
+      str_remove_all(" ")
+
+    if (tolower(updated_layers[1]) == "all") {
+      updated_layers <- possible_layers
+    } 
+  }
+
+  # create df with updated file paths
+  layers_eez_base_updated <- layers_eez_base %>% 
+    mutate(dir = case_when(layer %in% updated_layers ~ 
+                             gsub("v20\\d\\d", version_year, dir),
+                           TRUE ~ dir))
+
+  # vector of dir names which have changed from the original csv
+  updated_dirs <- anti_join(layers_eez_base_updated, layers_eez_base, by = c("layer", "dir")) %>% 
+    rename("dir (UPDATED)" = dir)
+
+  # make sure the chosen year and layers elicited changes
+  if (nrow(updated_dirs) != 0) { 
+    message("\nthe selected 'dir' values will be updated as shown in the data viewer ↑ ")
+    message("do you want to update layers_eez_base.csv with these changes?\n")
+    View(updated_dirs)
+
+    # request for permission to overwrite current csv with version containing updated dirs
+    overwrite <- readline(prompt = "update? ('y' or 'n'): ")
+
+    if (overwrite == "y") {
+      write.csv(layers_eez_base_updated, here("metadata_documentation/layers_eez_base.csv"),
+                row.names = FALSE)
+      message("\nfile has been updated\n")
+
+    } else {
+      message("\nfile was *NOT* updated\n")
+    }
+
+    # if no changes were elicited to any file paths, end function with message
+  } else {
+    message("\nthe chosen file paths already contain that version year - no updates were made\n")
+  }
+
+}
+
+
+
diff --git a/R/name_2_rgn.R b/R/name_2_rgn.R
@@ -27,7 +27,7 @@ name_2_rgn <- function(df_in,   #df_in=empd
     dplyr::ungroup()
 
   ### attach rgn_synonyms; summarize eliminates duplicate rows (same tmp_name 
-  ### and rgn_id) - rgn type not critical?
+  ### and rgn_id)
   syns <- rgn_synonyms %>% 
     dplyr::select(rgn_id = rgn_id_2013, tmp_name = rgn_nam_2013, 
            tmp_type = rgn_typ)
@@ -41,10 +41,23 @@ name_2_rgn <- function(df_in,   #df_in=empd
   ### create a temp field in the target data frame, for the field that is being combined.
   df_in['tmp_name'] <- df_in[fld_name]
 
-  ### replace problematic symbols (accents and such) within target data frame.
+  ### replace problematic symbols within target data frame.
   df_in <- df_in %>% 
     dplyr::mutate(tmp_name = stringr::str_trim(tmp_name),
-           tmp_name = stringr::str_replace(tmp_name, "^'", ""))
+           tmp_name = stringr::str_replace(tmp_name, "^'", "")) %>% 
+    mutate(tmp_name = stringr::str_remove(tmp_name, ",")) %>% 
+    mutate(tmp_name = stringr::str_remove(tmp_name, "'")) %>% 
+    mutate(tmp_name = stringr::str_remove(tmp_name, "´")) %>% 
+    mutate(tmp_name = stringr::str_remove(tmp_name, "’")) %>% 
+    mutate(tmp_name = tolower(tmp_name))
+
+  #turn all of the names to lowercase and remove commas and apostrophes
+  rgn_syn <- rgn_syn %>%
+    mutate(tmp_name = tolower(tmp_name)) %>% 
+    mutate(tmp_name = stringr::str_remove(tmp_name, ",")) %>% 
+    mutate(tmp_name = stringr::str_remove(tmp_name, "'")) %>% 
+    mutate(tmp_name = stringr::str_remove(tmp_name, "´")) %>% 
+    mutate(tmp_name = stringr::str_remove(tmp_name, "’"))
 
   ### combine target data frame with region name data frame;
   ### filter to ones with 'eez' or 'ohi_region' in the type.  
@@ -110,7 +123,7 @@ name_2_rgn <- function(df_in,   #df_in=empd
     duplicated(dups_data, fromLast = TRUE)
 
   if(sum(i_dupes) > 0) {
-    message(sprintf("\nDUPLICATES found. Consider using collapse2rgn to collapse duplicates (function in progress).\n"))
+    message(sprintf("\nDUPLICATES found. Confirm your script consolidates these as appropriate for your data.\n"))
     df_out_dupes <- unique(df_out[i_dupes, fld_name]) 
     print(df_out_dupes)
   } 

diff --git a/R/split_regions.R b/R/split_regions.R
@@ -0,0 +1,120 @@
+#' Split Macro-Regions to OHI Regions
+#' 
+#' This function takes datasets containing country records that include multiple OHI regions 
+#' (macro-regions) and breaks them down into individual OHI regions. It uses population-weighted
+#' values to distribute data across the new regions. When a dataset contains both a macro-region 
+#' and a sub-region, the sub-region's data is calculated by summing the population weight of the 
+#' macro region and the record for the region. 
+#'
+#' @param m The input dataset containing countries and associated values.
+#' @param country_column The column name in the dataset `m` representing the countries. Defaults to "country".
+#' @param value_column The column name in the dataset `m` representing the values associated with each country. Defaults to "value".
+#' @param duplicate A logical value. If TRUE, the values will not be split between new regions, e.g., when calculating sustainability scores. Defaults to FALSE.
+#' 
+#' @details The function is built to recognize common macro-region names and their corresponding OHI regions. 
+#' It's imperative for users to be aware that this function might require updates if new macro-regions 
+#' or changes to OHI regions occur in the future.
+#'
+#' Population data is used to weight the values for each newly split region. This data must be provided
+#' in the `split_pops` data frame (external to this function). If `duplicate` is set to TRUE, values are 
+#' not divided among regions but duplicated instead.
+#'
+#' @return A dataset with macro-regions split into individual OHI regions.
+#' 
+#' @examples 
+#' # This assumes existence of a dataset similar in structure to expected input and `split_pops`
+#' # updated_data <- region_split(original_data)
+#'
+#' @keywords ohi, macro-region, split
+#' @export
+
+split_regions <- function(m, country_column = "country", value_column = "value", duplicate = FALSE) {
+
+  # List of macro-regions to break down
+  split_details <- list(
+    `Netherlands Antilles` = c("Bonaire", "Sint Eustatius", "Saba", "Curaçao", "Sint Maarten", "Aruba"),
+    `Bonaire/S.Eustatius/Saba` = c("Bonaire", "Sint Eustatius", "Saba"),
+    `Saint Helena/Asc./Trist.` = c("Tristan da Cunha", "Saint Helena", "Ascension"),
+    `Channel Islands` = c("Guernsey", "Jersey"),
+    `United States Minor Outlying Island` = c("Wake Island", "Jarvis Island", "Palmyra Atoll", "Howland Island and Baker Island", "Johnston Atoll"),
+    `French Southern Territories` = c("Glorioso Islands", "Juan de Nova Island", "Bassas da India", "Ile Europa", "Ile Tromelin", "Crozet Islands", "Amsterdam Island and Saint Paul Island", "Kerguelen Islands"),
+    `Bonaire, Sint Eustatius and Saba` = c("Bonaire", "Sint Eustatius", "Saba"),
+    `French Southern Terr` = c("Glorioso Islands", "Juan de Nova Island", "Bassas da India", "Ile Europa", "Ile Tromelin", "Crozet Islands", "Amsterdam Island and Saint Paul Island", "Kerguelen Islands"),
+    `United States Minor Outlying Islands` = c("Wake Island", "Jarvis Island", "Palmyra Atoll", "Howland Island and Baker Island", "Johnston Atoll"),
+    `Saint Helena, Ascension and Tristan da Cunha` = c("Saint Helena", "Ascension", "Tristan da Cunha"),
+    `Caribbean Netherlands` = c("Bonaire", "Sint Eustatius", "Saba"),
+    `Channel Isl. (UK)` = c("Jersey", "Guernsey"),
+    `Saba and Sint Eustatius (Netherlands)` = c("Saba", "Sint Eustatius"),
+    `Mozambique Channel Isl. (France)` = c("Juan de Nova Island", "Bassas da India", "Ile Europa")
+  )
+
+  # Rename data frame columns
+  m <- m %>%
+    rename(!!paste0(country_column) := country, !!paste0(value_column) := value)
+
+  # Load population weighting data
+  population <- split_pops
+
+  # Loop over all macro region names within the country column
+  for (country_name in names(split_details)) {
+
+    # Check to see if macro country name is present
+    if (country_name %in% m$country) {
+
+      # Pull the regions associated with the macro country name
+      regions <- split_details[[country_name]]
+
+      # Calculate total population for the regions
+      pop_sum <- sum(population$population[population$country %in% regions])
+
+      # Calculate area weights for the regions
+      area_weights <- population %>%
+        filter(country %in% regions) %>%
+        mutate(weight = ifelse(duplicate, 1, population / pop_sum)) %>% # Conditionally set weight
+        select(country, weight) %>%
+        mutate(id = row_number()) 
+
+      # Split the data into different regions
+      m_new <- m %>%
+        filter(country == country_name) %>%
+        uncount(length(regions), .id = "id") %>%
+        left_join(area_weights, by = c("id" = "id")) %>%
+        mutate(value = value * weight) %>%
+        mutate(country = country.y) %>%
+        select(-country.x, -country.y, -id, -weight)
+
+      # Update m to remove the original country and add broken down rows
+      m <- m %>%
+        filter(!(country %in% country_name)) %>%
+        rbind(m_new)
+
+      # Sum duplicate rows if duplicate is False
+      if (!duplicate) {
+        grouping_vars <- setdiff(names(m), "value")
+
+        m <- m %>%
+          group_by(across(all_of(grouping_vars))) %>%
+          summarize(value = case_when(all(is.na(value)) ~ NA,
+                                      TRUE ~ sum(value, na.rm = TRUE))) %>%
+          ungroup()
+
+        # Remove duplicates if duplicate is True
+      } else {
+
+        # Remove duplicates
+        m <- m[!duplicated(m), ]
+
+      } # End duplicate if statement
+
+    } # End country if statement
+
+  } # End for loop
+
+  # Rename columns to match data frame input
+  m <- m %>%
+    rename(!!country_column := country, !!value_column := value)
+
+  # Return the new data frame
+  return(m)
+
+} # End function
diff --git a/data-raw/data_prepare.R b/data-raw/data_prepare.R
diff --git a/data/georegion_labels.rda b/data/georegion_labels.rda
diff --git a/data/georegions.rda b/data/georegions.rda
diff --git a/data/rgn_id_names.rda b/data/rgn_id_names.rda
diff --git a/data/rgn_master.rda b/data/rgn_master.rda
diff --git a/data/rgn_synonyms.rda b/data/rgn_synonyms.rda
diff --git a/data/sovregion_labels.rda b/data/sovregion_labels.rda
diff --git a/data/sovregions.rda b/data/sovregions.rda
diff --git a/data/split_pops.rda b/data/split_pops.rda
diff --git a/data_raw/data_prepare.R b/data_raw/data_prepare.R
@@ -0,0 +1,31 @@
+### Prepare data files for R
+### if data needs to be updated, do this in the source files and then resave in ohicore
+### eventually this will have its own package ('rohiprep')
+
+library(devtools)
+library(tidyverse)
+
+rgn_synonyms <- read_csv('data_raw/rgn_eez_v2013a_synonyms.csv')
+usethis::use_data(rgn_synonyms, overwrite = TRUE)
+
+rgn_master <- read_csv('data_raw/eez_rgn_2013main.csv')
+usethis::use_data(rgn_master, overwrite = TRUE)
+
+## The following have not been updated, will do this when I figure out how they are used.
+
+georegion_labels <- read.csv('data_raw/georegion_labels.csv')
+usethis::use_data(georegion_labels, overwrite = TRUE)
+
+
+georegions <- read.csv('data_raw/georegions.csv')
+usethis::use_data(georegions, overwrite = TRUE)
+
+sovregion_labels <- read.csv('data_raw/sovregion_labels.csv')
+usethis::use_data(sovregion_labels, overwrite = TRUE)
+
+sovregions <- read.csv('data_raw/sovregions.csv')
+usethis::use_data(sovregions, overwrite = TRUE)
+
+split_pops <- read_csv('data_raw/split_pops.csv')
+usethis::use_data(split_pops, overwrite = TRUE)
+