Skip to content

Commit

Permalink
Merge pull request #202 from OHI-Science/dev
Browse files Browse the repository at this point in the history
merging into master
  • Loading branch information
gclawson1 committed Nov 27, 2023
2 parents 3bb30b7 + 15de3af commit 7aba6a5
Show file tree
Hide file tree
Showing 37 changed files with 1,199 additions and 60 deletions.
7 changes: 5 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ Imports:
tidyr,
RColorBrewer,
htmlwidgets,
here
here,
rlang
Suggests:
knitr,
rmarkdown
Expand All @@ -40,6 +41,7 @@ Collate:
'CheckLayers.R'
'Conf.R'
'Layers.R'
'layers_eez_base_updater.R'
'PlotFlower.R'
'ScoreScaling.R'
'SelectLayersData.R'
Expand All @@ -56,9 +58,10 @@ Collate:
'rgn_eez_area.R'
'sovregion_labels.R'
'sovregions.R'
'split_regions.R'
URL: https://github.com/OHI-Science/ohicore
BugReports: https://github.com/OHI-Science/ohicore/issues
LazyData: TRUE
VignetteBuilder: knitr
RoxygenNote: 6.1.1
RoxygenNote: 7.2.3
Encoding: UTF-8
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,15 @@ export(SelectLayersData)
export(WriteRefPoint)
export(collapse_2_rgn)
export(compare_scores_df)
export(layers_eez_base_updater)
export(mapvalues)
export(name_2_rgn)
export(read_git_csv)
export(score.clamp)
export(score.max)
export(score.rescale)
export(score_check)
export(split_regions)
export(trace_git_csv_value)
exportClasses(Conf)
exportClasses(Layers)
Expand Down
118 changes: 118 additions & 0 deletions R/layers_eez_base_updater.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#' Update layers_eez_base.csv
#' By Peter Menzies
#'
#' Convenient interface for changing directory paths of recently updated layers in layers_eez_base.csv
#' No arguments required - user will be prompted for needed information
#'
#'
#' @keywords ohi
#' @export


layers_eez_base_updater <- function() {

require(here)
require(tidyverse)

# read in the csv
layers_eez_base <- read_csv(here("metadata_documentation/layers_eez_base.csv"), col_types = cols())

# prompt user for current version year (gsub() to remove 'v' if user adds it as well)
message("")
version_year <- paste0("v", gsub("\\D", "", readline(prompt = "enter version year: ")))

# empty vector that will later contain layer names - used in while loop
possible_layers <- c()

# loop that ends when a viable goal/subgoal abbr is supplied
while (length(possible_layers) == 0) {

message("")
goal <- readline(prompt = "enter the goal/subgoal/prs/res abbreviation for the layers you're updating (e.g. 'np', 'hab', or 'cc'): ") %>%
tolower()

possible_layers <- layers_eez_base$layer[startsWith(layers_eez_base$layer, goal)]

if (length(possible_layers) == 0) {
message("\nthere are no layers starting with that abbreviation\n")
}
}

# the component layers of that goal/subgoal are printed below for user's convenience
message("\nthese are the layers associated with that abbreviation:\n")
print(possible_layers)
message("\nif you want to update all of these layers, enter 'all' at the next prompt —")
message("if you are only updating certain ones you can copypaste the layer names above separated by commas\n")


# prompt user for layers which have been updated
updated_layers <- str_split(readline(prompt = "enter 'all' or layers separated only by commas: "), ",")[[1]] %>%
str_remove_all(" ")

if (tolower(updated_layers[1]) == "all") {
updated_layers <- possible_layers
}


# loop that executes or repeats if entries don't match any of the layer names in the goal / subgoal
while (length(intersect(updated_layers, possible_layers)) < length(updated_layers)) {

unknown_layers <- setdiff(updated_layers, possible_layers)

if (length(unknown_layers) == length(updated_layers)) {
message("\nnone of the layers entered coincide with the chosen abbreviation\n")
} else if (length(unknown_layers) == 1) {
message(paste0("\nthe following entry does not coincide with the chosen abbreviation: \n"))
print(unknown_layers)
message("")
} else {
message("\nthe following entries do not coincide with the chosen abbreviation: \n")
print(unknown_layers)
message("")
}

updated_layers <- str_split(readline(prompt = "enter 'all' or layers separated only by commas: "), ",")[[1]] %>%
str_remove_all(" ")

if (tolower(updated_layers[1]) == "all") {
updated_layers <- possible_layers
}
}

# create df with updated file paths
layers_eez_base_updated <- layers_eez_base %>%
mutate(dir = case_when(layer %in% updated_layers ~
gsub("v20\\d\\d", version_year, dir),
TRUE ~ dir))

# vector of dir names which have changed from the original csv
updated_dirs <- anti_join(layers_eez_base_updated, layers_eez_base, by = c("layer", "dir")) %>%
rename("dir (UPDATED)" = dir)

# make sure the chosen year and layers elicited changes
if (nrow(updated_dirs) != 0) {
message("\nthe selected 'dir' values will be updated as shown in the data viewer ↑ ")
message("do you want to update layers_eez_base.csv with these changes?\n")
View(updated_dirs)

# request for permission to overwrite current csv with version containing updated dirs
overwrite <- readline(prompt = "update? ('y' or 'n'): ")

if (overwrite == "y") {
write.csv(layers_eez_base_updated, here("metadata_documentation/layers_eez_base.csv"),
row.names = FALSE)
message("\nfile has been updated\n")

} else {
message("\nfile was *NOT* updated\n")
}

# if no changes were elicited to any file paths, end function with message
} else {
message("\nthe chosen file paths already contain that version year - no updates were made\n")
}

}



21 changes: 17 additions & 4 deletions R/name_2_rgn.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ name_2_rgn <- function(df_in, #df_in=empd
dplyr::ungroup()

### attach rgn_synonyms; summarize eliminates duplicate rows (same tmp_name
### and rgn_id) - rgn type not critical?
### and rgn_id)
syns <- rgn_synonyms %>%
dplyr::select(rgn_id = rgn_id_2013, tmp_name = rgn_nam_2013,
tmp_type = rgn_typ)
Expand All @@ -41,10 +41,23 @@ name_2_rgn <- function(df_in, #df_in=empd
### create a temp field in the target data frame, for the field that is being combined.
df_in['tmp_name'] <- df_in[fld_name]

### replace problematic symbols (accents and such) within target data frame.
### replace problematic symbols within target data frame.
df_in <- df_in %>%
dplyr::mutate(tmp_name = stringr::str_trim(tmp_name),
tmp_name = stringr::str_replace(tmp_name, "^'", ""))
tmp_name = stringr::str_replace(tmp_name, "^'", "")) %>%
mutate(tmp_name = stringr::str_remove(tmp_name, ",")) %>%
mutate(tmp_name = stringr::str_remove(tmp_name, "'")) %>%
mutate(tmp_name = stringr::str_remove(tmp_name, "´")) %>%
mutate(tmp_name = stringr::str_remove(tmp_name, "")) %>%
mutate(tmp_name = tolower(tmp_name))

#turn all of the names to lowercase and remove commas and apostrophes
rgn_syn <- rgn_syn %>%
mutate(tmp_name = tolower(tmp_name)) %>%
mutate(tmp_name = stringr::str_remove(tmp_name, ",")) %>%
mutate(tmp_name = stringr::str_remove(tmp_name, "'")) %>%
mutate(tmp_name = stringr::str_remove(tmp_name, "´")) %>%
mutate(tmp_name = stringr::str_remove(tmp_name, ""))

### combine target data frame with region name data frame;
### filter to ones with 'eez' or 'ohi_region' in the type.
Expand Down Expand Up @@ -110,7 +123,7 @@ name_2_rgn <- function(df_in, #df_in=empd
duplicated(dups_data, fromLast = TRUE)

if(sum(i_dupes) > 0) {
message(sprintf("\nDUPLICATES found. Consider using collapse2rgn to collapse duplicates (function in progress).\n"))
message(sprintf("\nDUPLICATES found. Confirm your script consolidates these as appropriate for your data.\n"))
df_out_dupes <- unique(df_out[i_dupes, fld_name])
print(df_out_dupes)
}
Expand Down
120 changes: 120 additions & 0 deletions R/split_regions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#' Split Macro-Regions to OHI Regions
#'
#' This function takes datasets containing country records that include multiple OHI regions
#' (macro-regions) and breaks them down into individual OHI regions. It uses population-weighted
#' values to distribute data across the new regions. When a dataset contains both a macro-region
#' and a sub-region, the sub-region's data is calculated by summing the population weight of the
#' macro region and the record for the region.
#'
#' @param m The input dataset containing countries and associated values.
#' @param country_column The column name in the dataset `m` representing the countries. Defaults to "country".
#' @param value_column The column name in the dataset `m` representing the values associated with each country. Defaults to "value".
#' @param duplicate A logical value. If TRUE, the values will not be split between new regions, e.g., when calculating sustainability scores. Defaults to FALSE.
#'
#' @details The function is built to recognize common macro-region names and their corresponding OHI regions.
#' It's imperative for users to be aware that this function might require updates if new macro-regions
#' or changes to OHI regions occur in the future.
#'
#' Population data is used to weight the values for each newly split region. This data must be provided
#' in the `split_pops` data frame (external to this function). If `duplicate` is set to TRUE, values are
#' not divided among regions but duplicated instead.
#'
#' @return A dataset with macro-regions split into individual OHI regions.
#'
#' @examples
#' # This assumes existence of a dataset similar in structure to expected input and `split_pops`
#' # updated_data <- region_split(original_data)
#'
#' @keywords ohi, macro-region, split
#' @export

split_regions <- function(m, country_column = "country", value_column = "value", duplicate = FALSE) {

# List of macro-regions to break down
split_details <- list(
`Netherlands Antilles` = c("Bonaire", "Sint Eustatius", "Saba", "Curaçao", "Sint Maarten", "Aruba"),
`Bonaire/S.Eustatius/Saba` = c("Bonaire", "Sint Eustatius", "Saba"),
`Saint Helena/Asc./Trist.` = c("Tristan da Cunha", "Saint Helena", "Ascension"),
`Channel Islands` = c("Guernsey", "Jersey"),
`United States Minor Outlying Island` = c("Wake Island", "Jarvis Island", "Palmyra Atoll", "Howland Island and Baker Island", "Johnston Atoll"),
`French Southern Territories` = c("Glorioso Islands", "Juan de Nova Island", "Bassas da India", "Ile Europa", "Ile Tromelin", "Crozet Islands", "Amsterdam Island and Saint Paul Island", "Kerguelen Islands"),
`Bonaire, Sint Eustatius and Saba` = c("Bonaire", "Sint Eustatius", "Saba"),
`French Southern Terr` = c("Glorioso Islands", "Juan de Nova Island", "Bassas da India", "Ile Europa", "Ile Tromelin", "Crozet Islands", "Amsterdam Island and Saint Paul Island", "Kerguelen Islands"),
`United States Minor Outlying Islands` = c("Wake Island", "Jarvis Island", "Palmyra Atoll", "Howland Island and Baker Island", "Johnston Atoll"),
`Saint Helena, Ascension and Tristan da Cunha` = c("Saint Helena", "Ascension", "Tristan da Cunha"),
`Caribbean Netherlands` = c("Bonaire", "Sint Eustatius", "Saba"),
`Channel Isl. (UK)` = c("Jersey", "Guernsey"),
`Saba and Sint Eustatius (Netherlands)` = c("Saba", "Sint Eustatius"),
`Mozambique Channel Isl. (France)` = c("Juan de Nova Island", "Bassas da India", "Ile Europa")
)

# Rename data frame columns
m <- m %>%
rename(!!paste0(country_column) := country, !!paste0(value_column) := value)

# Load population weighting data
population <- split_pops

# Loop over all macro region names within the country column
for (country_name in names(split_details)) {

# Check to see if macro country name is present
if (country_name %in% m$country) {

# Pull the regions associated with the macro country name
regions <- split_details[[country_name]]

# Calculate total population for the regions
pop_sum <- sum(population$population[population$country %in% regions])

# Calculate area weights for the regions
area_weights <- population %>%
filter(country %in% regions) %>%
mutate(weight = ifelse(duplicate, 1, population / pop_sum)) %>% # Conditionally set weight
select(country, weight) %>%
mutate(id = row_number())

# Split the data into different regions
m_new <- m %>%
filter(country == country_name) %>%
uncount(length(regions), .id = "id") %>%
left_join(area_weights, by = c("id" = "id")) %>%
mutate(value = value * weight) %>%
mutate(country = country.y) %>%
select(-country.x, -country.y, -id, -weight)

# Update m to remove the original country and add broken down rows
m <- m %>%
filter(!(country %in% country_name)) %>%
rbind(m_new)

# Sum duplicate rows if duplicate is False
if (!duplicate) {
grouping_vars <- setdiff(names(m), "value")

m <- m %>%
group_by(across(all_of(grouping_vars))) %>%
summarize(value = case_when(all(is.na(value)) ~ NA,
TRUE ~ sum(value, na.rm = TRUE))) %>%
ungroup()

# Remove duplicates if duplicate is True
} else {

# Remove duplicates
m <- m[!duplicated(m), ]

} # End duplicate if statement

} # End country if statement

} # End for loop

# Rename columns to match data frame input
m <- m %>%
rename(!!country_column := country, !!value_column := value)

# Return the new data frame
return(m)

} # End function
29 changes: 0 additions & 29 deletions data-raw/data_prepare.R

This file was deleted.

Binary file modified data/georegion_labels.rda
Binary file not shown.
Binary file modified data/georegions.rda
Binary file not shown.
Binary file added data/rgn_id_names.rda
Binary file not shown.
Binary file modified data/rgn_master.rda
Binary file not shown.
Binary file modified data/rgn_synonyms.rda
Binary file not shown.
Binary file modified data/sovregion_labels.rda
Binary file not shown.
Binary file modified data/sovregions.rda
Binary file not shown.
Binary file added data/split_pops.rda
Binary file not shown.
31 changes: 31 additions & 0 deletions data_raw/data_prepare.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
### Prepare data files for R
### if data needs to be updated, do this in the source files and then resave in ohicore
### eventually this will have its own package ('rohiprep')

library(devtools)
library(tidyverse)

rgn_synonyms <- read_csv('data_raw/rgn_eez_v2013a_synonyms.csv')
usethis::use_data(rgn_synonyms, overwrite = TRUE)

rgn_master <- read_csv('data_raw/eez_rgn_2013main.csv')
usethis::use_data(rgn_master, overwrite = TRUE)

## The following have not been updated, will do this when I figure out how they are used.

georegion_labels <- read.csv('data_raw/georegion_labels.csv')
usethis::use_data(georegion_labels, overwrite = TRUE)


georegions <- read.csv('data_raw/georegions.csv')
usethis::use_data(georegions, overwrite = TRUE)

sovregion_labels <- read.csv('data_raw/sovregion_labels.csv')
usethis::use_data(sovregion_labels, overwrite = TRUE)

sovregions <- read.csv('data_raw/sovregions.csv')
usethis::use_data(sovregions, overwrite = TRUE)

split_pops <- read_csv('data_raw/split_pops.csv')
usethis::use_data(split_pops, overwrite = TRUE)

0 comments on commit 7aba6a5

Please sign in to comment.